#### The aim of this notebook is to do feature engineering for the Toxic Comment challenge

In [1]:
import pandas as pd 
import numpy as np
from utils import *
import gensim
import re
from collections import Counter
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read both competition splits from the shared input directory.
path = 'input_data/'
train, test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

#### Create a column named 'clean'
* When a comment has none of the tags set, we label that comment as a 'clean' comment

In [3]:
# Label columns are everything after the first two columns (presumably id and
# comment_text -- confirm against train.csv), minus 'clean' itself so that
# re-running this cell does not fold the derived column back into the sum.
label_cols     = [col for col in train.columns[2:] if col != 'clean']
rowsums        = train[label_cols].sum(axis=1)
# 1 when no label is set on the row, 0 otherwise.
train['clean'] = (rowsums == 0) * 1

In [4]:
# Pass both frames through the project's reduce_memory helper (from utils).
# Each call prints a "Mem. usage decreased to ..." line, as seen in the output
# below -- presumably by downcasting column dtypes; confirm in utils.
train = reduce_memory(train)
test  = reduce_memory(test)

Mem. usage decreased to  3.50 Mb (66.2% reduction)
Mem. usage decreased to  2.34 Mb (0.0% reduction)


#### Get raw feature counts - Count features before cleaning

1. Count of stopwords percentage within that comment
2. Count of words in each comment
3. Count of caps and percentage within that comment
4. Count of symbols and percentage within that comment
5. Count of happy and percentage within that comment
6. Count of sad and percentage within that comment
7. Count of unique words and percentage within that comment
8. Count of punctuation and percentage within that comment
9. Count of exclamation marks
10. Count of new lines
11. Find IP addresses in the comment and a boolean IP feature

In [5]:
# Build the pre-cleaning count features for both splits and report the
# elapsed time. perf_counter is a monotonic clock, so the measurement is not
# disturbed by system clock adjustments during the run.
start_time = time.perf_counter()
train = feature_creation_before_cleaning(train)
test  = feature_creation_before_cleaning(test)

print("total time in minutes:", (time.perf_counter() - start_time)/60.0)

total time in minutes: 1.7415376504262288


#### Data cleaning
1. a. Replace smileys with 'happy' or 'sad' b. Replace short forms with their full forms c. Replace 'yay', 'haha' with 'happy'
2. Remove stopwords
3. Remove digits
4. Remove punctuation
5. Remove new line symbols
6. Remove tense from words in the comment
7. Other cleaning of the comment, such as lowercasing and removing usernames

In [6]:
# Run the text-cleaning step on both splits and report the elapsed time.
# perf_counter is a monotonic clock, so the measurement is not disturbed by
# system clock adjustments during the run.
start_time = time.perf_counter()
train = data_cleaning(train)
test  = data_cleaning(test)

print("total time in minutes:", (time.perf_counter() - start_time)/60.0)

total time in minutes: 2.136250933011373


#### Spelling check
The repository linked below hosts the word2vec pre-trained Google News corpus (3 billion running words) word vector model (3 million 300-dimension English word vectors). It mirrors the data from the official word2vec website:
GoogleNews-vectors-negative300.bin.gz

This kernel requires downloading Google's word2vec vectors: https://github.com/mmihaltz/word2vec-GoogleNews-vectors

In [7]:
# NOTE: the spell-check step is currently DISABLED -- every line of this cell
# is commented out, so comment_text is passed on without spelling correction.
# Enabling it requires the GoogleNews word2vec file under google_data/ and the
# `correction` / `get_spell_check` helpers (presumably from utils -- confirm).
# NOTE(review): `index2word` was renamed `index_to_key` in gensim 4 -- confirm
# the installed gensim version before re-enabling.
#
# model = gensim.models.KeyedVectors.load_word2vec_format('google_data/GoogleNews-vectors-negative300.bin.gz', 
#                                                         binary=True)
# words = model.index2word

# w_rank = {}
# for i,word in enumerate(words):
#     w_rank[word] = i

# WORDS = w_rank

# wordcloud1 = [comment.split() for comment in train['comment_text']]
# wordcloud2 = [comment.split() for comment in test['comment_text']]
# wordcloud  = wordcloud1 + wordcloud2
# wordcloud_flat = [item for sublist in wordcloud for item in sublist]
# wordcloud_unique = list(set(wordcloud_flat))

# wordcloud_dict = {word:correction(word, WORDS) for word in wordcloud_unique}

# train['comment_text'] = train['comment_text'].apply(lambda comment : get_spell_check(comment.split(), wordcloud_dict))
# test['comment_text']  = test['comment_text'].apply(lambda comment : get_spell_check(comment.split(), wordcloud_dict))

#### Feature creation after cleaning
1. Count repeated words ( number of repeated words in a single comment)
2. Count number of adjectives, noun, verb
3. Count of website mentioned in the comment

In [8]:
# Build the post-cleaning features for both splits and report the elapsed
# time (the recorded run below took about 20 minutes). perf_counter is a
# monotonic clock, so the measurement is not disturbed by system clock
# adjustments during the run.
start_time = time.perf_counter()
train = feature_creation_after_cleaning(train)
test  = feature_creation_after_cleaning(test)
print("total time in minutes:", (time.perf_counter() - start_time)/60.0)

total time in minutes: 20.4772109746933


In [9]:
## Modeling can start from here (word embeddings, naive Bayes classifier, ...). The frequency features below are for exploration / alternative modeling.

In [10]:
# Persist the engineered frames. to_csv keeps its defaults, so the row index
# is written as the first (unnamed) column of each file.
for frame, csv_name in ((train, 'train_clean.csv'), (test, 'test_clean.csv')):
    frame.to_csv(csv_name)

#### Frequency feature
1. Frequency count features
2. Polynomial features (?)

In [11]:
# Word-frequency table for the 'identity_hate' label, built by the project's
# top_freq_word helper; the output below lists words with a Freq count.
# Presumably restricted to comments flagged with this label -- confirm in utils.
id_f = top_freq_word('identity_hate', train)
id_f

Unnamed: 0,Freq
nigger,2963
fat,1321
jew,1307
fuck,1234
gay,911
die,884
faggot,695
suck,517
huge,429
shit,409


In [12]:
# Word-frequency table for the 'toxic' label, built by the project's
# top_freq_word helper; the output below lists words with a Freq count.
# Presumably restricted to comments flagged with this label -- confirm in utils.
to_f = top_freq_word('toxic',train)
to_f

Unnamed: 0,Freq
fuck,10624
suck,4032
shit,3608
like,3600
nigger,3289
wikipedia,3261
hate,2614
faggot,2608
ass,2596
know,2440


In [13]:
# Word-frequency table for the 'obscene' label, built by the project's
# top_freq_word helper; the output below lists words with a Freq count.
# Presumably restricted to comments flagged with this label -- confirm in utils.
o_f = top_freq_word('obscene',train)
o_f

Unnamed: 0,Freq
fuck,10443
suck,3572
shit,3201
nigger,2721
ass,2517
faggot,1811
wikipedia,1636
like,1603
cunt,1517
bitch,1430


In [14]:
# Word-frequency table for the 'threat' label, built by the project's
# top_freq_word helper; the output below lists words with a Freq count.
# Presumably restricted to comments flagged with this label -- confirm in utils.
th_f = top_freq_word('threat',train)
th_f

Unnamed: 0,Freq
die,1174
ass,769
kill,517
fuck,245
block,184
jim,158
wales,157
supertr0ll,149
page,135
ban,134


In [15]:
# Word-frequency table for the 'insult' label, built by the project's
# top_freq_word helper; the output below lists words with a Freq count.
# Presumably restricted to comments flagged with this label -- confirm in utils.
in_f = top_freq_word('insult',train)
in_f

Unnamed: 0,Freq
fuck,8190
suck,3322
nigger,2774
faggot,2450
fat,1924
ass,1886
shit,1681
like,1611
moron,1458
cunt,1430
