In [1]:
import pandas as pd
import re

comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0, encoding='ISO-8859-1')
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))

# transform into lowercase
comments['comment'] = comments['comment'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# remove punctuations
comments['comment'] = comments['comment'].str.replace('[^\w\s]','')

# remove emoji, references: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
comments['comment'] = comments['comment'].apply(lambda x: remove_emoji(x))

# remove whitespace
comments['comment'] = comments['comment'].apply(lambda x: x.strip())

In [2]:
comments.head()

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37675,this is not creative those are the dictionary ...,2002,False,article,random,train
44816,the term standard model is itself less npov th...,2002,False,article,random,train
49851,true or false the situation as of march 2002 w...,2002,False,article,random,train
89320,next maybe you could work on being less condes...,2002,True,article,random,dev
93890,this page will need disambiguation,2002,True,article,random,train


In [3]:
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')
train_test = comments.loc[comments['split'].isin(['train', 'test'])]

labels = annotations.groupby('rev_id')['attack'].mean() >= 0.25
train_test['attack'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test['attack'] = labels


In [4]:
train_test.isnull().sum()

comment      0
year         0
logged_in    0
ns           0
sample       0
split        0
attack       0
dtype: int64

In [5]:
comments.isnull().sum()

comment      0
year         0
logged_in    0
ns           0
sample       0
split        0
dtype: int64

In [6]:
labels.head()

rev_id
37675    False
44816    False
49851    False
89320     True
93890    False
Name: attack, dtype: bool

### Before fitting, encoding categorical integer features using a one-hot, or one-of-K scheme


In [7]:
# limit to categorical data using df.select_dtypes()
train_test = train_test.select_dtypes(include=[object])

# check original shape
train_test.shape

(92704, 4)

In [8]:
train_test.head(3)

Unnamed: 0_level_0,comment,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37675,this is not creative those are the dictionary ...,article,random,train
44816,the term standard model is itself less npov th...,article,random,train
49851,true or false the situation as of march 2002 w...,article,random,train


In [9]:
# import preprocessing from sklearn
from sklearn import preprocessing
# view columns using df.columns
train_test.columns

Index(['comment', 'ns', 'sample', 'split'], dtype='object')

In [10]:
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()
# use df.apply() to apply le.fit_transform to all columns
train_test = train_test.apply(le.fit_transform)
train_test.head()

Unnamed: 0_level_0,comment,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
37675,76087,0,1,1
44816,73899,0,1,1
49851,77923,0,1,1
93890,76501,0,1,1
102817,38847,1,1,1


In [11]:
# create a OneHotEncoder object, and fit it to all of X
enc = preprocessing.OneHotEncoder()
enc.fit(train_test)


OneHotEncoder()

In [12]:
onehotlabels = enc.transform(train_test).toarray()
onehotlabels.shape

(92704, 91981)

In [13]:
type(onehotlabels)

numpy.ndarray

In [14]:
onehotlabels.shape

(92704, 91981)

In [15]:
onehotlabels[:, :-1]

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [16]:
onehotlabels[:, -1]

array([1., 1., 1., ..., 0., 1., 1.])

In [17]:
onehotlabels[0:1000, :-1].shape

(1000, 91980)

In [18]:
onehotlabels[0:1000, -1].shape

(1000,)

In [19]:
# get target data
X = onehotlabels[0:1000, :-1]
y = onehotlabels[0:1000, -1]
# X = train_test.drop(['attack'], axis=1)
# y = train_test['attack']
print(f'X: {X.shape}')

X: (1000, 91980)


In [20]:
# divide data into train and test according to the 'split' attribute

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [21]:
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train: (750, 91980)
y_train: (750,)
X_test: (250, 91980)
y_test: (250,)


In [22]:
# build random forest model with hyperparameters
import numpy as np

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# number of features to consider at every split
max_features = ['auto', 'sqrt']
# maximum number of levels in tree
max_depth = [2, 4]
# minimum number of samples required to split a node
min_samples_split = [2, 5]
# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# method of selecting samples for training each tree
bootstrap = [True, False]

In [23]:
# create the param grid
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

forest_model = RandomForestClassifier()
gridsearch = GridSearchCV(estimator = forest_model, param_grid = param_grid, cv = 3, verbose = 2, n_jobs = 4 )

In [25]:
type(X_train)

numpy.ndarray

In [26]:
type(y_train)

numpy.ndarray

In [27]:
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   47.5s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 960 out of 960 | elapsed:  5.2min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72,
                                          80]},
             verbose=2)

In [28]:
gridsearch.best_params_

{'bootstrap': True,
 'max_depth': 2,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [29]:
print(f'Train Accuracy - : {gridsearch.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {gridsearch.score(X_test, y_test):.3f}')

Train Accuracy - : 0.725
Test Accuracy - : 0.736
