# Environment Setup

In [1]:
from config import *
from test_bad_word import *
from utility import *

import time
import numpy as np
import pandas as pd
#pd.options.display.max_columns = None
#pd.options.display.mpl_style = 'default'

import re
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import sparse

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

## 1. Import raw data

In [2]:
df_train_1 = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_train_2 = pd.read_csv(DATA_DIR + '/test_with_solutions.csv', encoding="ISO-8859-1")
df_train = pd.concat((df_train_1, df_train_2), axis=0, ignore_index=True)
df_train['length']=df_train['Comment'].map(lambda x:len(x.split()))
#df_train = df_train[df_train['length']<300]
df_test = pd.read_csv(DATA_DIR + '/impermium_verification_labels.csv', encoding="ISO-8859-1")

num_train = df_train.shape[0]
num_test = df_test.shape[0]

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [3]:
df_all['Comment']=df_all['Comment'].map(lambda x:parser(x))
df_all['Comment']=df_all['Comment'].map(lambda x:badword_replacer(x))
df_all['Comment_stemmed']=df_all['Comment'].map(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

df_train['Comment']=df_train['Comment'].map(lambda x:parser(x))
df_train['Comment']=df_train['Comment'].map(lambda x:badword_replacer(x))
df_train['Comment_stemmed']=df_train['Comment'].map(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

df_test['Comment']=df_test['Comment'].map(lambda x:parser(x))
df_test['Comment']=df_test['Comment'].map(lambda x:badword_replacer(x))

In [4]:
bad_word_1  = [line.rstrip('\n') for line in open('wordlist/google_bad_word.txt')]
#bad_word_2  = [line.rstrip('\n') for line in open('handcrafted_badword.txt')]
bad_word= set(bad_word_1  + test_bad_word) 

In [5]:
df_dependency = pd.read_csv(PROCESSINGTEXT_DIR + '/dependency.csv')
df_dependency = df_dependency.drop_duplicates()
df_all = pd.merge(df_all, df_dependency, how='left', on='Comment')

## 2. Construct features

* sentiment features

In [6]:
df_all['length']=df_all['Comment'].map(lambda x:len(x.split()))
length = sparse.csr_matrix(df_all['length'].values).T

bad_word_1  = [line.rstrip('\n') for line in open('wordlist/google_bad_word.txt')]
#bad_word_2  = [line.rstrip('\n') for line in open('handcrafted_badword.txt')]
bad_word= set(bad_word_1  + test_bad_word) 

df_all['bad word count']=df_all['Comment'].map(lambda x:sum([word.lower() in bad_word for word in x.split()]))
df_all['bad word ratio']=df_all['bad word count']/df_all['length']

bad_word_count = sparse.csr_matrix(df_all['bad word count'].values).T
bad_word_ratio = sparse.csr_matrix(df_all['bad word ratio'].values).T

strong_pos  = [line.rstrip('\n') for line in open('wordlist/strong_pos.txt')]
strong_neg  = [line.rstrip('\n') for line in open('wordlist/strong_neg.txt')]
weak_pos  = [line.rstrip('\n') for line in open('wordlist/weak_pos.txt')]
weak_neg  = [line.rstrip('\n') for line in open('wordlist/weak_neg.txt')]

df_all['strong pos count']=df_all['Comment'].map(lambda x:sum([word.lower() in strong_pos for word in x.split()]))
df_all['strong pos ratio']=df_all['strong pos count']/df_all['length']
df_all['strong neg count']=df_all['Comment'].map(lambda x:sum([word.lower() in strong_neg for word in x.split()]))
df_all['strong neg ratio']=df_all['strong neg count']/df_all['length']
df_all['weak pos count']=df_all['Comment'].map(lambda x:sum([word.lower() in weak_pos for word in x.split()]))
df_all['weak neg count']=df_all['Comment'].map(lambda x:sum([word.lower() in weak_neg for word in x.split()]))
df_all['sentence score']= np.exp((-3*df_all['bad word count'] + (-2)*df_all['strong neg count']+ (-1)*df_all['weak neg count']\
                        + 1*df_all['weak pos count'] + 2 * df_all['strong pos count'])/df_all['length'])
 
sentence_score = sparse.csr_matrix(df_all['sentence score'].values).T    
strong_pos_count = sparse.csr_matrix(df_all['strong pos count'].values).T
strong_pos_ratio = sparse.csr_matrix(df_all['strong pos ratio'].values).T
strong_neg_count = sparse.csr_matrix(df_all['strong neg count'].values).T
strong_neg_ratio = sparse.csr_matrix(df_all['strong neg ratio'].values).T

df_all['capital count']=df_all['Comment'].map(lambda x:sum([1 if word.isupper() else 0 for word in x.split()]))
df_all['capital ratio']=df_all['capital count']/df_all['length']

df_all['average word length']=df_all['Comment'].map(lambda x: np.mean([len(word) for word in x.split()]))
df_all['max word length']=df_all['Comment'].map(lambda x: np.max([len(word) for word in x.split()]))

df_all['email']=df_all['Comment'].map(lambda x: np.sum([1 if word=='_email_' else 0 for word in x.split()]))
df_all['hashtag']=df_all['Comment'].map(lambda x: np.sum([1 if word=='_hashtag_' else 0 for word in x.split()]))
df_all['url']=df_all['Comment'].map(lambda x: np.sum([1 if word=='_url_' else 0 for word in x.split()]))
df_all['CR']=df_all['Comment'].map(lambda x: np.sum([1 if word=='_CR_' else 0 for word in x.split()]))

def youare_count(x):
    if re.search('you are',x.lower()):
        return 1
    else:
        return 0

df_all['you are']=df_all['Comment'].map(lambda x: youare_count(x))

capital_count = sparse.csr_matrix(df_all['capital count'].values).T
capital_ratio = sparse.csr_matrix(df_all['capital ratio'].values).T
average_word_length = sparse.csr_matrix(df_all['average word length'].values).T
max_word_length = sparse.csr_matrix(df_all['max word length'].values).T
email = sparse.csr_matrix(df_all['email'].values).T
hashtag = sparse.csr_matrix(df_all['hashtag'].values).T
url = sparse.csr_matrix(df_all['url'].values).T
CR = sparse.csr_matrix(df_all['CR'].values).T
you_are = sparse.csr_matrix(df_all['you are'].values).T


* tfidf/ count

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

documents = sentence_reconnect(df_all['Comment_stemmed'].values)

tfidf_char_ngram = TfidfVectorizer(ngram_range=(2, 5), min_df=3, analyzer='char') 
tfidf_char_ngram = tfidf_char_ngram.fit(documents)
tfidf_char_ngram_all = tfidf_char_ngram.transform(df_all['Comment_stemmed'].values)

In [8]:
tfidf_word_ngram = TfidfVectorizer(ngram_range=(1, 5), min_df=3, analyzer='word') 
tfidf_word_ngram = tfidf_word_ngram.fit(documents)
tfidf_word_ngram_all = tfidf_word_ngram.transform(df_all['Comment_stemmed'].values)

In [9]:
tfidf_word_ngram_all

<8829x24360 sparse matrix of type '<class 'numpy.float64'>'
	with 333200 stored elements in Compressed Sparse Row format>

* language models

In [10]:
import nltk
import lm
from nltk import word_tokenize
from nltk.util import ngrams

def ngram_compute(x,n=3):
    ngram_lst = []
    for comment in x:
        token = word_tokenize(comment)
        ngrams_token=ngrams(token,n)
        for i in ngrams_token:
            ngram_lst.append(i)
    return ngram_lst

def language_model_prob(x):
    prob_sum = 0
    for i in x:
        prob_sum += kneser_ney.prob(i)
    return prob_sum

def sentence_reconnect_lm(input):
    reconnects= []
    for text in input:
        phrases = re.split(r'[;:\.()\n]', text)
        phrases = [re.findall(r'[\w%\*&#]+', ph) for ph in phrases]
        phrases = [ph for ph in phrases if ph]
        for ph in phrases: 
            if ph:
                reconnects = reconnects + ph
    return reconnects 

def phrase(text):
    reconnects= []
    phrases = re.split(r'[;:\.()\n]', text)
    phrases = [re.findall(r'[\w%\*&#]+', ph) for ph in phrases]
    for ph in phrases: 
        if ph:
            reconnects = reconnects + ph
    return reconnects

In [11]:
language = sentence_reconnect(df_all['Comment_stemmed'].values)

trigram_lst = ngram_compute(language)
freq_dist = nltk.FreqDist(trigram_lst)
kneser_ney = nltk.KneserNeyProbDist(freq_dist)

df_all['lm_prob_nltk']=df_all['Comment_stemmed'].map(lambda x: np.log(language_model_prob(ngrams(word_tokenize(x),3))+0.5))

lm_prob_nltk = sparse.csr_matrix(df_all['lm_prob_nltk'].values).T



In [12]:
language_2 = sentence_reconnect_lm(df_all['Comment_stemmed'].values)

oov=lm.inject_OOVs(language_2)
vocab = set(language_2)

#KN_normal_1=lm.OOV_Modified_KneserNey(oov_normal,vocab,1) 
KN_2=lm.OOV_Modified_KneserNey(oov,vocab,2) 
KN_3=lm.OOV_Modified_KneserNey(oov,vocab,3,KN_2) 
KN_4=lm.OOV_Modified_KneserNey(oov,vocab,4,KN_3) 
KN_5=lm.OOV_Modified_KneserNey(oov,vocab,5,KN_4) 
KN_6=lm.OOV_Modified_KneserNey(oov,vocab,6,KN_5) 

In [13]:
df_all['lm_prob_2']=df_all['Comment_stemmed'].map(lambda x: np.log(lm.perplexity(KN_2, phrase(x))+0.5))
df_all['lm_prob_3']=df_all['Comment_stemmed'].map(lambda x: np.log(lm.perplexity(KN_3, phrase(x))+0.5))
df_all['lm_prob_4']=df_all['Comment_stemmed'].map(lambda x: np.log(lm.perplexity(KN_4, phrase(x))+0.5))
df_all['lm_prob_5']=df_all['Comment_stemmed'].map(lambda x: np.log(lm.perplexity(KN_5, phrase(x))+0.5))
df_all['lm_prob_6']=df_all['Comment_stemmed'].map(lambda x: np.log(lm.perplexity(KN_6, phrase(x))+0.5))

lm_prob_2 = sparse.csr_matrix(df_all['lm_prob_2'].values).T
lm_prob_3 = sparse.csr_matrix(df_all['lm_prob_3'].values).T
lm_prob_4 = sparse.csr_matrix(df_all['lm_prob_4'].values).T
lm_prob_5 = sparse.csr_matrix(df_all['lm_prob_5'].values).T
lm_prob_6 = sparse.csr_matrix(df_all['lm_prob_6'].values).T

* syntatic features

In [14]:
df_all['syn obsub']= df_all['Dependency'].map(lambda x: obsub(x,bad_word))
df_all['syn descriptive']= df_all['Dependency'].map(lambda x: descriptive(x,bad_word))
df_all['syn possession']= df_all['Dependency'].map(lambda x: possession(x,bad_word))
df_all['syn rhetorical']= df_all['Dependency'].map(lambda x: rhetorical(x,bad_word))
df_all['syn close']= df_all['Dependency'].map(lambda x: close_phrase(x,bad_word))
df_all['syn sum']=df_all['syn obsub'] + df_all['syn descriptive'] + df_all['syn possession'] + \
                  df_all['syn rhetorical'] + df_all['syn close']

syn_obsub = sparse.csr_matrix(df_all['syn obsub'].values).T
syn_descriptive = sparse.csr_matrix(df_all['syn descriptive'].values).T
syn_possession = sparse.csr_matrix(df_all['syn possession'].values).T
syn_rhetorical = sparse.csr_matrix(df_all['syn rhetorical'].values).T
syn_close = sparse.csr_matrix(df_all['syn close'].values).T
syn_sum = sparse.csr_matrix(df_all['syn sum'].values).T

* features combination

In [15]:
features = []
features.append(tfidf_char_ngram_all)
features.append(tfidf_word_ngram_all)
#features.append(tfidf_word_ngram_all_2)
#features.append(count_word_ngram_all)

features.append(bad_word_ratio)
features.append(strong_pos_ratio)
features.append(strong_neg_ratio)
features.append(strong_pos_count)
features.append(strong_neg_count)
features.append(sentence_score)
features.append(length)

features.append(capital_count) 
features.append(capital_ratio) 
features.append(average_word_length) 
features.append(max_word_length)
features.append(email) 
features.append(hashtag) 
features.append(url)
features.append(CR) 
features.append(you_are)

features.append(syn_obsub)
features.append(syn_descriptive)
features.append(syn_possession) 
features.append(syn_sum)

features = sparse.hstack(features).tocsr()
features

<8829x100421 sparse matrix of type '<class 'numpy.float64'>'
	with 3893772 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.feature_selection import SelectKBest, chi2
ch2 = SelectKBest(chi2, k=50000)
ch2.fit(features[:num_train], df_train['Insult'].values)
features_filtered = ch2.transform(features)

In [17]:
features = sparse.hstack([features_filtered, lm_prob_2, lm_prob_3, lm_prob_4, lm_prob_5, lm_prob_6 ]).tocsr()
features

<8829x50005 sparse matrix of type '<class 'numpy.float64'>'
	with 2820448 stored elements in Compressed Sparse Row format>

## 3. Models

* Define Train/Test Sets

In [18]:
np.random.seed(10)
val_list = np.random.choice(range(num_train), round(num_train/10))
train_list = list(set(range(num_train)).difference(set(val_list))) 

X_train = features[:num_train][train_list]
X_val = features[:num_train][val_list]
X_test = features[num_train:]

y_train = df_train['Insult'].values[train_list]
y_val =  df_train['Insult'].values[val_list]
y_test = df_test['Insult'].values

* Models

In [43]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC

t_1 = time.time()

clf = LogisticRegression(tol=1e-8, penalty='l2', C=0.5, class_weight = 'balanced')
clf.fit(X_train, y_train)
val_pred_prob = clf.predict_proba(X_val)[:,1]
val_pred = clf.predict(X_val)
test_pred_prob = clf.predict_proba(X_test)[:,1]
test_pred = clf.predict(X_test)

'model training time:',round((time.time()-t_1)/60,1) ,'minutes\n'

('model training time:', 0.1, 'minutes\n')

In [44]:
from sklearn.metrics import roc_auc_score, f1_score
f1_score(y_val, val_pred),roc_auc_score(y_val,val_pred_prob)

(0.74242424242424254, 0.92377188029361945)

In [40]:
from sklearn.metrics import roc_auc_score, f1_score
f1_score(y_test, test_pred),roc_auc_score(y_test,test_pred_prob)

(0.73756097560975609, 0.82767891363298873)

In [45]:
np.save('feature_pred_val_prob', val_pred_prob)
np.save('feature_pred_val', val_pred)
#np.save('feature_pred_test_prob', test_pred_prob)
#np.save('feature_pred_test', test_pred)

* Error analysis

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix_dict(matrix,rotation=45, outside_label=""):
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    tick_marks = np.arange(0)
    plt.xticks(tick_marks, [0,1], rotation=rotation)
    plt.yticks(tick_marks, [0,1])
    
cm=confusion_matrix(y_test, test_pred, labels=None, sample_weight=None)
print(cm)
plot_confusion_matrix_dict(cm)

In [None]:
print('false posotive: ',predict_analysis(y_test, y_pred, y_pred_prob, 'fp', df_test)[50])
print('false posotive: ',predict_analysis(y_test, y_pred, y_pred_prob, 'fp', df_test)[3])
print('false negative: ',predict_analysis(y_test, y_pred, y_pred_prob, 'fn', df_test)[150])
print('false negative: ',predict_analysis(y_test, y_pred, y_pred_prob, 'fn', df_test)[2])
print('true posotive: ',predict_analysis(y_test, y_pred, y_pred_prob, 'tp', df_test)[50])
print('true negative: ',predict_analysis(y_test, y_pred, y_pred_prob, 'tn', df_test)[5])