In [1]:
import string
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score, recall_score

In [3]:
# Load the train and test CSVs from the local ./data directory into DataFrames.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [6]:
# (rows, columns) of the training frame.
train.shape

(159571, 8)

In [7]:
# (rows, columns) of the test frame.
test.shape

(153164, 2)

In [8]:
# Features are the raw comment strings; targets are every column from
# position 2 onward (i.e. everything after the first two columns).
x_train = train.loc[:, 'comment_text']
y_train = train.iloc[:, 2:]

In [9]:
# Split into training and validation data.
# The inputs are taken straight from `train` instead of from `x_train`/`y_train`:
# this cell rebinds those two names, so splitting them in place meant that
# re-running the cell carved another 15% off the already-reduced training split.
x_train, x_val, y_train, y_val = train_test_split(
    train['comment_text'],  # raw comment text
    train.iloc[:, 2:],      # every column from position 2 onward (the targets)
    test_size=0.15,
    random_state=22,        # fixed seed so the split is reproducible
)

In [10]:
# Text preprocessing patterns.
# Raw strings are used so that `\[`, `\]` and `\|` reach the regex engine as
# written instead of going through Python's string-escape handling, which
# warns about unrecognised escapes (SyntaxWarning on Python 3.12+).
# Punctuation that separates words: each match is replaced by a space.
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')
# Negated class: matches every character that is NOT a digit, a lowercase
# ASCII letter, a space, '#', '+' or '_'; matches are deleted.
GOOD_SYMBOLS = re.compile(r'[^0-9a-z #+_]')
# English stopword set used for O(1) membership tests in process_text.
# On a fresh environment the NLTK stopwords corpus may not be installed, in
# which case the lookup raises LookupError; fetch it once and retry so the
# notebook survives Restart & Run All.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

def process_text(text):
    """Normalise one comment into a space-separated string of kept tokens.

    Steps, in order: lowercase the text; replace every REPLACE_BY_SPACE match
    with a single space; delete every GOOD_SYMBOLS match (that pattern is a
    negated class, so it matches the characters to drop); split on whitespace
    and discard tokens that appear in STOPWORDS.
    """
    spaced = REPLACE_BY_SPACE.sub(r' ', text.lower())
    cleaned = GOOD_SYMBOLS.sub(r'', spaced)
    kept_tokens = (token for token in cleaned.split() if token not in STOPWORDS)
    return ' '.join(kept_tokens).strip()

#### Preprocess the data:

In [11]:
# Run every training and validation comment through process_text.
# Both names are rebound to plain Python lists of cleaned strings.
x_train = list(map(process_text, x_train))
x_val = list(map(process_text, x_val))

In [12]:
def tfidf_features(xtrain, xval):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    The vectorizer is fitted on ``xtrain`` only and then applied to ``xval``.

    Parameters
    ----------
    xtrain : iterable of str
        Training documents; used to fit the vectorizer.
    xval : iterable of str
        Validation documents; transformed with the fitted vectorizer.

    Returns
    -------
    tuple
        ``(xtrain_matrix, xval_matrix, vocabulary)`` where ``vocabulary`` is
        the fitted vectorizer's ``vocabulary_`` attribute.
    """
    vectorizer = TfidfVectorizer(
        # Raw string: `\S` is a regex escape, not a Python string escape, and
        # the non-raw form triggers an invalid-escape warning.
        token_pattern=r'(\S+)',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
    )
    xtrain = vectorizer.fit_transform(xtrain)
    xval = vectorizer.transform(xval)
    return xtrain, xval, vectorizer.vocabulary_

def train_classifiers(xtrain, ytrain, C_=1.0):
    """Fit and return a one-vs-rest logistic regression on the given data.

    ``C_`` is passed to LogisticRegression as ``C``; the base estimator uses
    the lbfgs solver with an L2 penalty.
    """
    base_estimator = LogisticRegression(solver='lbfgs', penalty='l2', C=C_)
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(xtrain, ytrain)
    return one_vs_rest

def evaluation_scores(y_val, preds_val):
    """Print accuracy and macro/micro/weighted F1 for the given predictions.

    Returns an empty string rather than None, so a caller that wraps the call
    in print() emits a blank line instead of the text "None".
    """
    print("Accuracy: ", accuracy_score(y_val, preds_val))
    labelled_averages = (
        ("F1_score macro: ", 'macro'),
        ("F1_score micro: ", 'micro'),
        ("F1_score weighted: ", 'weighted'),
    )
    for label, averaging in labelled_averages:
        print(label, f1_score(y_val, preds_val, average=averaging))
    return ""

In [13]:
# Build TF-IDF matrices for the train and validation texts; vocab_tfidf is the fitted vectorizer's vocabulary_.
x_train_tfidf, x_val_tfidf, vocab_tfidf = tfidf_features(x_train, x_val)

In [14]:
# Fit the one-vs-rest logistic regression with the default C_=1.0.
clf_model_tfidf = train_classifiers(x_train_tfidf, y_train)

#### Predict on the validation data and evaluate:

In [15]:
# Predict labels for the validation TF-IDF matrix.
val_preds = clf_model_tfidf.predict(x_val_tfidf)

In [16]:
# evaluation_scores prints the metrics itself and returns "", so the outer print() only adds a trailing blank line.
print(evaluation_scores(y_val, val_preds))

Accuracy:  0.9156082887700535
F1_score macro:  0.44823064374580524
F1_score micro:  0.6471421823334907
F1_score weighted:  0.6382313161942562



#### Finding the best C parameter:

In [33]:
# Candidate values for C_, passed one at a time to train_classifiers.
C_values = [0.01, 0.1, 1.0, 10, 100]

# Validation metrics, one entry per candidate, in the same order as C_values.
Accuracy_list, F1_scores_weighted = [], []

for C_ in C_values:
    # Refit on the training matrix with this C_, then score on the validation matrix.
    clf = train_classifiers(x_train_tfidf, y_train, C_)
    preds = clf.predict(x_val_tfidf)
    Accuracy_list += [accuracy_score(y_val, preds)]
    F1_scores_weighted += [f1_score(y_val, preds, average='weighted')]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [34]:
# Display the weighted F1 collected for each entry of C_values, in the same order.
F1_scores_weighted

[0.012133637237484085,
 0.39950937736137515,
 0.6437209096378061,
 0.6964511607256868,
 0.6878346994613364]

In [22]:
# Refit with the C that achieved the highest weighted F1 in the sweep
# (C = 10 for the scores recorded in this notebook). Deriving it from
# C_values / F1_scores_weighted keeps this cell in step with the sweep
# instead of relying on a number copied by hand from its output.
best_C = C_values[int(np.argmax(F1_scores_weighted))]
clf = train_classifiers(x_train_tfidf, y_train, best_C)



In [23]:
# Predict on the validation matrix with `clf`, the classifier refit above.
val_preds_2 = clf.predict(x_val_tfidf)

In [24]:
# evaluation_scores prints the metrics itself and returns "", so the outer print() only adds a trailing blank line.
print(evaluation_scores(y_val, val_preds_2))

Accuracy:  0.9189505347593583
F1_score macro:  0.5351292599645192
F1_score micro:  0.7069616837560714
F1_score weighted:  0.7011340657739019

