In [1]:
import pandas as pd
import re

comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0, encoding='ISO-8859-1')
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))

# transform into lowercase
comments['comment'] = comments['comment'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# remove punctuations
comments['comment'] = comments['comment'].str.replace('[^\w\s]','')

# remove emoji, references: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
comments['comment'] = comments['comment'].apply(lambda x: remove_emoji(x))

# remove whitespace
comments['comment'] = comments['comment'].apply(lambda x: x.strip())

In [2]:
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')
train_test = comments.loc[comments['split'].isin(['train', 'test'])]

labels = annotations.groupby('rev_id')['attack'].mean() >= 0.25
train_test['attack'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test['attack'] = labels


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

X = train_test['comment'].head(1000)
y = train_test['attack'].head(1000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('forest', RandomForestClassifier(bootstrap = True,
                                      max_depth = 2,
                                      max_features = 'auto',
                                      min_samples_leaf = 1,
                                      min_samples_split = 2,
                                      n_estimators = 10)),
])

clf = clf.fit(X_train, y_train)

In [4]:
test_comments = train_test.query("split=='test'")

y_pred = clf.predict(test_comments['comment'])

In [5]:
met = metrics.classification_report(test_comments['attack'], y_pred)

accuracy = metrics.accuracy_score(test_comments['attack'], y_pred)

print("Accuracy_score: ", accuracy)
print(met)

Accuracy_score:  0.7780222624902925
              precision    recall  f1-score   support

       False       0.78      1.00      0.88     18033
        True       0.00      0.00      0.00      5145

    accuracy                           0.78     23178
   macro avg       0.39      0.50      0.44     23178
weighted avg       0.61      0.78      0.68     23178



  _warn_prf(average, modifier, msg_start, len(result))
