# Environment Setup

In [1]:
from config import *
from test_bad_word import *
from parser_function import *

import time
import numpy as np
import pandas as pd
#pd.options.display.max_columns = None
#pd.options.display.mpl_style = 'default'

import re
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import sparse

from nltk.stem.snowball import SnowballStemmer

# NLTK Snowball stemmer configured for English. In the visible part of this
# notebook it is only referenced by a commented-out stemming line, so creating
# it has no effect on the features built below.
stemmer = SnowballStemmer('english')

## 1. Import raw data

In [185]:
# Training set: read the labelled comments, then drop the very long ones
# (only comments with fewer than 300 whitespace-separated tokens are kept).
df_train = pd.read_csv(DATA_DIR + '/train.csv', encoding="ISO-8859-1")
df_train['length'] = [len(comment.split()) for comment in df_train['Comment']]
df_train = df_train.loc[df_train['length'] < 300]

# Test set: read as-is, no length filter is applied.
df_test = pd.read_csv(DATA_DIR + '/test_with_solutions.csv', encoding="ISO-8859-1")

# Row counts are kept so the stacked frame can be split back into train/test.
num_train, num_test = len(df_train), len(df_test)

# Train rows first, test rows second, with a fresh 0..n-1 index, so that every
# feature below is computed once over both sets.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [186]:
# Run every comment through parser() and then badword_replacer(). Neither is
# defined in this notebook; presumably they come from the star-imports at the
# top (TODO confirm which module provides each).
df_all['Comment'] = df_all['Comment'].map(parser).map(badword_replacer)

# Optional stemming step, currently disabled:
#df_all['Comment']=df_all['Comment'].map(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

## 2. Construct features

* length

In [187]:
# Number of whitespace-separated tokens per comment, recomputed on the
# normalised text for train and test rows alike.
df_all['length'] = [len(text.split()) for text in df_all['Comment']]
# csr_matrix() reads the 1-D array as a single row; transposing it yields one
# sparse column with a row per comment.
length = sparse.csr_matrix(df_all['length'].values).transpose()

* counts and ratios of bad, strong-positive and strong-negative words

In [188]:
# Read the Google bad-word list, one entry per line. The context manager closes
# the file handle as soon as the list is built (the bare open() used before
# left it open until garbage collection).
with open('google_bad_word.txt') as bad_word_file:
    bad_word_1 = [line.rstrip('\n') for line in bad_word_file]
#bad_word_2  = [line.rstrip('\n') for line in open('handcrafted_badword.txt')]

# Merge with test_bad_word, which is not defined in this notebook (presumably
# a list supplied by the star-import of the test_bad_word module). A set makes
# the per-token membership test below a hash lookup.
bad_word = set(bad_word_1 + test_bad_word)

# Per comment: how many tokens, lower-cased, appear in the bad-word set.
df_all['bad word count'] = df_all['Comment'].map(
    lambda text: sum(token.lower() in bad_word for token in text.split()))
# NOTE(review): a comment with zero tokens would make this ratio NaN/inf;
# confirm that the preprocessing never produces an empty comment.
df_all['bad word ratio'] = df_all['bad word count'] / df_all['length']

# One sparse column per feature (1-D array -> single row -> transposed).
bad_word_count = sparse.csr_matrix(df_all['bad word count'].values).T
bad_word_ratio = sparse.csr_matrix(df_all['bad word ratio'].values).T

In [189]:
def _read_lexicon(path):
    """Return the lines of ``path`` as a list, trailing newline removed.

    The file is opened in a ``with`` block so the handle is closed right away.
    """
    with open(path) as lexicon_file:
        return [line.rstrip('\n') for line in lexicon_file]


def _count_lexicon_hits(comments, lexicon):
    """Count, for each comment, the tokens whose lower-cased form is in ``lexicon``.

    ``comments`` is a pandas Series of strings; ``lexicon`` is any iterable of
    strings. Membership is tested against a set built once per call, so each
    token costs a hash lookup instead of a scan over the whole word list.
    Returns a Series of integer counts aligned with ``comments``.
    """
    lookup = set(lexicon)
    return comments.map(
        lambda text: sum(token.lower() in lookup for token in text.split()))


# Sentiment word lists, one entry per line. They stay plain lists, as before.
strong_pos = _read_lexicon('strong_pos.txt')
strong_neg = _read_lexicon('strong_neg.txt')
weak_pos = _read_lexicon('weak_pos.txt')
weak_neg = _read_lexicon('weak_neg.txt')

# Strong lexicons: raw count plus the count relative to the comment length.
df_all['strong pos count'] = _count_lexicon_hits(df_all['Comment'], strong_pos)
df_all['strong pos ratio'] = df_all['strong pos count'] / df_all['length']
df_all['strong neg count'] = _count_lexicon_hits(df_all['Comment'], strong_neg)
df_all['strong neg ratio'] = df_all['strong neg count'] / df_all['length']

# Weak lexicons: only the raw counts are stored as columns.
df_all['weak pos count'] = _count_lexicon_hits(df_all['Comment'], weak_pos)
df_all['weak neg count'] = _count_lexicon_hits(df_all['Comment'], weak_neg)

# Sparse column versions of the strong-lexicon features.
strong_pos_count = sparse.csr_matrix(df_all['strong pos count'].values).T
strong_pos_ratio = sparse.csr_matrix(df_all['strong pos ratio'].values).T
strong_neg_count = sparse.csr_matrix(df_all['strong neg count'].values).T
strong_neg_ratio = sparse.csr_matrix(df_all['strong neg ratio'].values).T

In [190]:
# Signed lexicon score per comment: bad words weigh -3, strong negatives -2,
# weak negatives -1, weak positives +1 and strong positives +2.
weighted_hits = (
    -3 * df_all['bad word count']
    + (-2) * df_all['strong neg count']
    + (-1) * df_all['weak neg count']
    + 1 * df_all['weak pos count']
    + 2 * df_all['strong pos count']
)
# Divide by the token count, then exponentiate so the score is always positive.
df_all['sentence score'] = np.exp(weighted_hits / df_all['length'])

sentence_score = sparse.csr_matrix(df_all['sentence score'].values).T

* other binary/count features

In [191]:
# Tokens written entirely in upper case: absolute count and share of the comment.
df_all['capital count'] = df_all['Comment'].map(
    lambda text: sum(1 if token.isupper() else 0 for token in text.split()))
df_all['capital ratio'] = df_all['capital count'] / df_all['length']

# Mean and maximum token length, in characters.
df_all['average word length'] = df_all['Comment'].map(
    lambda text: np.mean([len(token) for token in text.split()]))
df_all['max word length'] = df_all['Comment'].map(
    lambda text: np.max([len(token) for token in text.split()]))

# Occurrences of each placeholder token per comment. The placeholders are
# presumably inserted by the preprocessing step (TODO confirm).
for column, placeholder in (('email', '_email_'),
                            ('hashtag', '_hashtag_'),
                            ('url', '_url_'),
                            ('CR', '_CR_')):
    # placeholder is bound as a default argument so each lambda keeps its own value
    df_all[column] = df_all['Comment'].map(
        lambda text, placeholder=placeholder: np.sum(
            [1 if token == placeholder else 0 for token in text.split()]))

def youare_count(x):
    """Return 1 if the lower-cased text contains 'you are', otherwise 0.

    Despite the name this is a presence flag, not a count: several
    occurrences still give 1.
    """
    return 1 if re.search('you are', x.lower()) else 0

# Flag comments that contain the phrase "you are" (case-insensitive).
df_all['you are'] = df_all['Comment'].map(youare_count)


def _sparse_column(name):
    """Return the df_all column ``name`` as a one-column sparse matrix."""
    return sparse.csr_matrix(df_all[name].values).T


capital_count = _sparse_column('capital count')
capital_ratio = _sparse_column('capital ratio')
average_word_length = _sparse_column('average word length')
max_word_length = _sparse_column('max word length')
email = _sparse_column('email')
hashtag = _sparse_column('hashtag')
url = _sparse_column('url')
CR = _sparse_column('CR')
you_are = _sparse_column('you are')


In [212]:
# Columns of the design matrix, in order. Entries that are commented out are
# raw counts whose ratio form is used instead.
features = sparse.hstack([
    length,
    bad_word_count,
    bad_word_ratio,
    #strong_pos_count,
    strong_pos_ratio,
    #strong_neg_count,
    strong_neg_ratio,
    sentence_score,
    #capital_count,
    capital_ratio,
    average_word_length,
    max_word_length,
    email,
    hashtag,
    url,
    CR,
    you_are,
]).tocsr()  # CSR so the matrix can be sliced by row afterwards

## 3. Define Train/Test Sets

In [213]:
# The first num_train rows of the feature matrix go to the training set and
# the remaining rows to the test set.
X_train, X_test = features[:num_train], features[num_train:]

# Labels are taken from the 'Insult' column of each original frame.
y_train, y_test = df_train['Insult'].values, df_test['Insult'].values

* feature selection

In [214]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 12 columns with the highest chi-squared score against the label.
# The selector is fitted on the training rows only; the same column choice is
# then applied to both sets.
ch2 = SelectKBest(score_func=chi2, k=12)
ch2.fit(X_train, y_train)
X_train = ch2.transform(X_train)
X_test = ch2.transform(X_test)

In [221]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

t_1 = time.time()

# Alternative models tried earlier, kept for reference:
#clf = RandomForestClassifier(n_estimators=2500, max_depth=15, random_state=2017)
#rf = RandomForestRegressor(n_estimators=250, max_depth=15, random_state=0)
#clf = BaggingRegressor(rf, n_estimators=145, max_samples=0.1, random_state=25)

# L2-regularised logistic regression with class weights balanced automatically.
clf = LogisticRegression(penalty='l2', C=1.5, tol=1e-8, class_weight='balanced')
clf.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (presumably Insult == 1 - confirm the label encoding).
y_pred = clf.predict_proba(X_test)[:, 1]

# The timer covers both fitting and prediction; the tuple is the cell's output.
elapsed_minutes = round((time.time() - t_1) / 60, 1)
'model training time:', elapsed_minutes, 'minutes\n'

('model training time:', 0.0, 'minutes\n')

In [222]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the predicted scores against the test labels.
roc_auc_score(y_true=y_test, y_score=y_pred)

0.86200246358895283

In [197]:
from sklearn.svm import LinearSVC

t_1 = time.time()

# Linear SVM in which class 1 is weighted five times as heavily as class 0.
clf = LinearSVC(class_weight={0: 1, 1: 5}, C=0.1)
clf.fit(X_train, y_train)
# Hard class labels this time (not scores); this overwrites the y_pred
# produced by the previous model.
y_pred = clf.predict(X_test)

# The timer covers both fitting and prediction; the tuple is the cell's output.
elapsed_minutes = round((time.time() - t_1) / 60, 1)
'model training time:', elapsed_minutes, 'minutes\n'

('model training time:', 0.0, 'minutes\n')

In [198]:
# Accuracy: the fraction of test rows whose predicted label equals the true label.
(y_pred == y_test).mean()

0.76577257272383836