# Insult Classification

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
import pandas as pd
# Load the labelled comments; the path is relative to the notebook's working directory
data= pd.read_csv('train-utf8.csv')
# Preview the first rows
data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,You fuck your dad.
1,0,20120528192215Z,i really don't understand your point. It seem...
2,0,,A majority of Canadians can and has been wrong...
3,0,,listen if you dont wanna get married to a man ...
4,0,20120619094753Z,Các bạn xuống đường biểu tình 2011 có ôn hoà k...


In [3]:
# Headline class balance: corpus size, number of insults, and insult share in percent
n_comments = len(data)
n_insults = data.Insult.sum()
insult_pct = 100 * data.Insult.mean()
print("%d comments, of which %d insults (%d%%)" % (n_comments, n_insults, insult_pct))

3947 comments, of which 1049 insults (26%)


### Looking for known bad words

In [4]:
#Load google_badlist.txt
filename = 'google_badlist.txt'
# header=None: the first line of the file is read as data, not as column names.
# sep is the regex r"\n", which requires the python parsing engine; presumably the
# intent is one bad word per row in a single column 0 -- TODO confirm against the file.
bad_words = pd.read_csv(filename, header=None, sep=r"\n", engine = 'python')

In [5]:
# Column 0 of bad_words as a plain Python list
word_list = bad_words[0].values.tolist()

In [7]:
#Function to flag comments
def check_badword(comment, word_list):
    """Return 1 if any whitespace-separated token of ``comment`` is in ``word_list``, else 0.

    Matching is exact and case-sensitive on the tokens produced by
    ``comment.split()``, so a bad word fused to punctuation (``"idiot."``) or
    embedded in a longer word is not matched.

    Parameters
    ----------
    comment : str
        Text to scan. Values without a ``split`` method (e.g. ``None`` or the
        float NaN that stands in for a missing comment) are treated as
        containing no bad words instead of raising.
    word_list : iterable of str
        Known bad words.

    Returns
    -------
    int
        1 when at least one token matches an entry of ``word_list``, otherwise 0.
    """
    try:
        # Tokenize once, instead of re-splitting the comment for every bad word;
        # a set makes each membership test constant-time.
        tokens = set(comment.split())
    except AttributeError:
        return 0
    return 1 if any(word in tokens for word in word_list) else 0

In [8]:
# Flag every comment: 1 if it contains a word from word_list, 0 otherwise.
# word_list is forwarded to check_badword as its second positional argument.
data['flag'] = data.Comment.apply(check_badword, args=(word_list,))

In [10]:
# Preview the first rows, now including the 'flag' column
data.head()

Unnamed: 0,Insult,Date,Comment,flag
0,1,20120618192155Z,You fuck your dad.,1
1,0,20120528192215Z,i really don't understand your point. It seem...,0
2,0,,A majority of Canadians can and has been wrong...,0
3,0,,listen if you dont wanna get married to a man ...,0
4,0,20120619094753Z,Các bạn xuống đường biểu tình 2011 có ôn hoà k...,0


## Metrics

In [11]:
# Share of comments (in percent) where the bad-word flag agrees with the Insult label
n_agree = (data.Insult == data.flag).sum()
accuracy = 100 * (float(n_agree) / len(data))

In [12]:
# Single-argument print(...) produces the same output under the Python 2 print
# statement and the Python 3 print function.
print("Accuracy of the method is {0:0.03f}%".format(accuracy))
print("The accuracy is not acceptable, it does not look good!!")

Accuracy of the method is 70.813%
The accuracy is not acceptable, it does not look good!!


In [13]:
# Per-class precision / recall / F1 of the bad-word flag, with data.Insult as the true label
print(metrics.classification_report(data.Insult, data.flag))

             precision    recall  f1-score   support

          0       0.77      0.87      0.81      2898
          1       0.42      0.27      0.33      1049

avg / total       0.68      0.71      0.69      3947



## Comparing to a Naive Bayes Classifier

In [14]:
# Token-count features built from the comment text
count_vect = CountVectorizer()
# NOTE(review): the vocabulary is fitted on every comment, including the rows that
# are later held out as the test set -- consider fitting on the training rows only.
X_train_counts = count_vect.fit_transform(data.Comment)

In [15]:
# Hold out 40% of the data as the test set (test_size=0.4); the remaining 60% is
# used for training. random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X_train_counts, data.Insult, test_size=0.4, random_state=0)

In [16]:
# Multinomial Naive Bayes with default parameters
clf_m1 = MultinomialNB()

In [17]:
# Fit on the training portion of the split only
clf_m1.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# Score on the held-out test portion, scaled to percent
score = clf_m1.score(X_test, y_test) * 100

In [19]:
# One formatted string so the line prints identically under the Python 2 print
# statement and the Python 3 print function (a multi-argument print(...) would
# display a tuple on Python 2).
print("Accuracy: %s %%" % score)

Accuracy: 78.3407219759 %


In [20]:
# Predicted labels for the held-out test rows
predicted = clf_m1.predict(X_test)

In [21]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test rows
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.83      0.88      0.86      1153
          1       0.62      0.53      0.57       426

avg / total       0.77      0.78      0.78      1579



## Verdict

### The verdict is based on the following assumption: the model's effectiveness depends on how well insults are flagged, and thus precision and recall are the most important metrics.

### Therefore, the Naive Bayes classifier is far better: it recalls 53% of insults, compared to 27% for our original bad-word-list method (with precision of 62% vs. 42%).

### Learning bad words on the fly

In [20]:
# NOTE(review): this reads the CSV from 'data/', whereas the first load cell reads
# 'train-utf8.csv' from the working directory -- confirm which location is correct.
# Rebinding `data` replaces the earlier DataFrame, so the 'flag' column is no longer present.
path_to_insults = 'data/'
data = pd.read_csv(path_to_insults + 'train-utf8.csv')
data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,You fuck your dad.
1,0,20120528192215Z,i really don't understand your point. It seem...
2,0,,A majority of Canadians can and has been wrong...
3,0,,listen if you dont wanna get married to a man ...
4,0,20120619094753Z,Các bạn xuống đường biểu tình 2011 có ôn hoà k...


In [21]:
# Rebuild the token-count matrix from the freshly loaded comments
# (rebinds count_vect and X_train_counts defined in an earlier cell)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.Comment)

In [22]:
# Display the sparse matrix summary (shape and number of stored elements)
X_train_counts

<3947x15457 sparse matrix of type '<type 'numpy.int64'>'
	with 98535 stored elements in Compressed Sparse Row format>

In [23]:
# Multinomial Naive Bayes with default parameters.
# NOTE(review): clf_m is not referenced again in the cells shown; the scoring
# cells below pass a fresh MultinomialNB() instead -- confirm whether it is needed.
clf_m = MultinomialNB()

In [24]:
# Fit on the full data set (no hold-out)
clf_m.fit(X_train_counts, data.Insult)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Bernoulli Naive Bayes with default parameters.
# NOTE(review): clf_b is not referenced again in the cells shown; the scoring
# cells below pass a fresh BernoulliNB() instead -- confirm whether it is needed.
clf_b = BernoulliNB()

In [26]:
# Fit on the full data set (no hold-out)
clf_b.fit(X_train_counts, data.Insult)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [27]:
#Function to calculate Metrics for given Classifiers

In [28]:
def score_values(classifier, metrics, X, Y):
    """Print 5-fold cross-validated scores of ``classifier`` for each metric.

    Parameters
    ----------
    classifier : estimator
        Estimator passed to ``cross_val_score``.
    metrics : iterable of str
        Scoring names, each passed as ``scoring=`` to ``cross_val_score``.
        Inside this function the name shadows the ``sklearn.metrics`` module
        imported at the top of the notebook.
    X : feature matrix
        Features to cross-validate on.
    Y : labels
        Target values aligned with the rows of ``X``.

    Returns
    -------
    None
        For every metric, prints the mean score and twice the standard
        deviation across the folds.
    """
    # One formatted string so the header prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("%s \n" % (classifier,))
    for metric in metrics:
        # Score the X / Y that were passed in, rather than the notebook globals
        # X_train_counts and data.Insult.
        scores = cross_val_score(classifier, X, Y, cv=5, scoring=metric)
        print(metric + ":%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## MultinomialNB

In [29]:
# Cross-validated metrics for Multinomial Naive Bayes on the full count matrix
score_values(
    MultinomialNB(),
    ['accuracy', 'precision', 'f1', 'recall', 'roc_auc'],
    X_train_counts,
    data.Insult,
)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) 

accuracy:0.79 (+/- 0.01)
precision:0.59 (+/- 0.02)
f1:0.61 (+/- 0.03)
recall:0.64 (+/- 0.06)
roc_auc:0.81 (+/- 0.02)


## BernoulliNB

In [30]:
# Cross-validated metrics for Bernoulli Naive Bayes on the full count matrix
score_values(
    BernoulliNB(),
    ['accuracy', 'precision', 'f1', 'recall', 'roc_auc'],
    X_train_counts,
    data.Insult,
)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 

accuracy:0.76 (+/- 0.02)
precision:0.65 (+/- 0.11)
f1:0.28 (+/- 0.06)
recall:0.18 (+/- 0.04)
roc_auc:0.83 (+/- 0.02)
