In [1]:
!pip install transformers



In [2]:
%matplotlib inline

import re
import warnings

import joblib
import numpy as np
import nltk
import pandas as pd

from gensim.models.keyedvectors import KeyedVectors
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
warnings.filterwarnings('ignore')

COLOR_G = '\033[1m\033[92m\033[4m'
COLOR_R = '\033[1m\033[31m\033[4m'
END = '\033[0m'
RANDOM_STATE = 13
SAMPLE_SIZE = 10
SIZE = 300
STOP_WORDS = stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/drive/MyDrive/lsml_final_task/toxic_comments.csv')
data.head(3)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [6]:
data['toxic'].value_counts()

0    143346
1     16225
Name: toxic, dtype: int64

In [7]:
def get_clean_text(text):
    clean_text = nltk.word_tokenize(re.sub(r'[^a-zA-Z]', ' ', str(text)).lower())
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in clean_text if not word in STOP_WORDS])

In [8]:
data['clean_text'] = data['text'].apply(get_clean_text)

In [9]:
data.head(3)

Unnamed: 0,text,toxic,clean_text
0,Explanation\nWhy the edits made under my usern...,0,explanation edits made username hardcore metal...
1,D'aww! He matches this background colour I'm s...,0,aww match background colour seemingly stuck th...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man really trying edit war guy constantly ...


In [10]:
len_text = data['clean_text'].apply(lambda x: len(x.split()))

In [11]:
len_text.describe()

count    159571.000000
mean         34.687963
std          54.062811
min           0.000000
25%           9.000000
50%          18.000000
75%          38.000000
max        1250.000000
Name: clean_text, dtype: float64

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
def get_samples_and_targets_bert(data):
    def _get_vectors(line, model=tokenizer, size=SIZE):
        tokens = tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tokenizer.tokenize(line)[:(size - 2)] + ['[SEP]']
            )
        if len(tokens) < size:
            tokens.extend([0 for _ in range(size - len(tokens))])
        return np.array(tokens)

    features = []
    target = []
    for _, row in data.iterrows():
        features.append(_get_vectors(row['clean_text']))
        target.append(row['toxic'])
    features = pd.DataFrame(features).fillna(0)
    
    return features, target

In [14]:
features, target = get_samples_and_targets_bert(data)
display(features.head(3))
target[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,101,7526,10086,2015,2081,5310,18442,13076,12392,2050,...,0,0,0,0,0,0,0,0,0,0
1,101,22091,2860,2674,4281,6120,9428,5881,4283,2831,...,0,0,0,0,0,0,0,0,0,0
2,101,4931,2158,2428,2667,10086,2162,3124,7887,9268,...,0,0,0,0,0,0,0,0,0,0


[0, 0, 0]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=RANDOM_STATE, stratify=target)

### RandomForestClassifier

In [16]:
model = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
param_grid = {
    'n_estimators' : (100, 120),
    'max_depth' : (None, 30), 
    'min_samples_split': (2, 3),
    'min_samples_leaf': (1, 3)
}
gs = GridSearchCV(model, param_grid=param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)
# Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
print(f'Best parameters: {gs.best_params_}') 
# Accuracy score: 0.9012793351956063
print(f'Accuracy score: {gs.best_score_}')

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 120}
Accuracy score: 0.9011987610300352


In [17]:
prediction = gs.best_estimator_.predict(X_test)
acc = accuracy_score(y_test, prediction)
# Accuracy score: 0.9016753008021391
print(f'Accuracy score: {acc}')

Accuracy score: 0.9016753008021391


Training on full dataset:

In [18]:
model = RandomForestClassifier(
    random_state=RANDOM_STATE, 
    class_weight='balanced',
    **gs.best_params_
    )
model.fit(features, target)

RandomForestClassifier(class_weight='balanced', min_samples_split=3,
                       n_estimators=120, random_state=13)

In [19]:
joblib.dump(model, '/content/drive/MyDrive/lsml_final_task/model_rfc_lsml2.joblib')

['/content/drive/MyDrive/lsml_final_task/model_rfc_lsml2.joblib']

In [20]:
model_loaded = joblib.load('/content/drive/MyDrive/lsml_final_task/model_rfc_lsml2.joblib')

In [21]:
def get_test_text_preparation(text, size=SIZE):
    clean_text = ' '.join(nltk.word_tokenize(re.sub(r'[^a-zA-Z]', ' ', str(text)).lower()))
    # # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokens = tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tokenizer.tokenize(clean_text)[:(size - 2)] + ['[SEP]']
            )
    if len(tokens) < size:
        tokens.extend([0 for _ in range(size - len(tokens))])
    return pd.DataFrame([tokens]).fillna(0)


In [22]:
test_comment_1 = 'I love you'
X_test_1 = get_test_text_preparation(test_comment_1)
display(X_test_1)
test_comment_2 = 'Go fuck yourself'
X_test_2 = get_test_text_preparation(test_comment_2)
display(X_test_2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,101,1045,2293,2017,102,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,101,2175,6616,4426,102,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
prediction_1 = model_loaded.predict(X_test_1)
prediction_1

array([0])

In [24]:
prediction_2 = model_loaded.predict(X_test_2)
prediction_2

array([1])