In [141]:
import pandas as pd
import re

# download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0, encoding='ISO-8859-1')['comment']
print(comments.head(20))
comments = comments.apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments = comments.apply(lambda x: x.replace("TAB_TOKEN", " "))

# transform into lowercase
comments = comments.apply(lambda x: " ".join(x.lower() for x in x.split()))
# print(comments.head(20))

# remove punctuations
comments = comments.str.replace('[^\w\s]','')
print(comments.head(20))

# remove emoji, references: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
comments = comments.apply(lambda x: remove_emoji(x))

# remove whitespace
comments = comments.apply(lambda x: x.strip())
print(comments.head(20))


rev_id
37675     `-NEWLINE_TOKENThis is not ``creative``.  Thos...
44816     `NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...
49851     NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...
89320      Next, maybe you could work on being less cond...
93890                  This page will need disambiguation. 
102817    NEWLINE_TOKEN-NEWLINE_TOKENNEWLINE_TOKENImport...
103624    I removed the following:NEWLINE_TOKENNEWLINE_T...
111032    `:If you ever claimed in a Judaic studies prog...
120283    NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENMy apol...
128532    `Someone wrote:NEWLINE_TOKENMore recognizable,...
133562    NEWLINE_TOKENNEWLINE_TOKEN:Correct. Full biogr...
138117    `NEWLINE_TOKENNEWLINE_TOKENCare should be take...
155243    NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in  I...
177310    NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENOn my  ...
192579    `NEWLINE_TOKENNEWLINE_TOKEN:<>>NEWLINE_TOKENNE...
201190          gets far more tendentious yet.NEWLINE_TOKEN
208009    `NEWLINE_TOKENNEWLINE_T

In [127]:
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

aac = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

# read training and test sets
train_test = aac.loc[aac['split'].isin(['train', 'test'])]
train_test['attack'] = labels
# print(train_test.head(50))

# simple text classifier from the strawman code

train_comments = train_test.query("split=='train'")
test_comments = train_test.query("split=='test'")

clf = Pipeline([
    ('vect', CountVectorizer(max_features = 10000, ngram_range = (1,2))),
    ('tfidf', TfidfTransformer(norm = 'l2')),
    ('clf', DecisionTreeClassifier(random_state = 123)),
])

X = train_test['comment']
y = train_test['attack']
# print(y.head(10))

rev_id
37675     False
44816     False
49851     False
93890     False
102817    False
103624    False
128532    False
133562    False
138117    False
155243    False
Name: attack, dtype: bool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test['attack'] = labels


In [None]:
import numpy as np

kfold = KFold(n_splits = 8, shuffle = True, random_state = 1)
for train, test in kfold.split(train_test):
    train_x, train_y, test_x, test_y = X.iloc[list(train)], y.iloc[list(train)], X.iloc[list(train)], y.iloc[list(train)]
    print(np.shape(test_x))
    print(np.shape(y))
    clf = clf.fit(train_x, train_y)
    print("TRAIN:", train, "TEST:", test)

In [140]:
# adding confusion matrix
cm = confusion_matrix(y, clf.predict(X))

print(cm)

[[81389   480]
 [  541 10294]]


In [134]:
len(train_test)

92704