In [1]:
import nltk, re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/train.csv')
labels = pd.read_csv('data/test_labels.csv')

In [3]:
# total data
df = train.merge(test, on=['id', 'comment_text'], how='left')
#df

In [4]:
# check duplication
sum(df[['id', 'comment_text']].duplicated())

0

In [5]:
# scale up severe_toxic 
df['severe_toxic'] = df['severe_toxic'] * 2
# sum features
df['y'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
# normalization
df['y'] = df['y']/df['y'].max()
# rearrange df
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

In [6]:
df

Unnamed: 0,text,y
0,Explanation\nWhy the edits made under my usern...,0.0
1,D'aww! He matches this background colour I'm s...,0.0
2,"Hey man, I'm really not trying to edit war. It...",0.0
3,"""\nMore\nI can't make any real suggestions on ...",0.0
4,"You, sir, are my hero. Any chance you remember...",0.0
...,...,...
159566,""":::::And for the second time of asking, when ...",0.0
159567,You should be ashamed of yourself \n\nThat is ...,0.0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0.0
159569,And it looks like it was actually you who put ...,0.0


In [7]:
# take first 80% comments as train data
split_ratio = 0.2
x, y = df['text'], df['y']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)

In [8]:
# download nltk data
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

In [9]:
# create a stopword library
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
lmtzr = WordNetLemmatizer()

def text_process(text):
    tokens = []
    # remove stopwords and only keep alphabetical words
    for word in nltk.word_tokenize(text):
        # replace 's, n't, 've
        word = word.replace("'s'", '').replace("n't", ' not').replace("'ve", ' have')
        word = re.sub(r'[^a-zA-Z0-9 ]', '', word)
        if word.lower() not in stopwords and word.isalpha():
            tokens.append(word.lower())
    
    # lemmatize
    return [lmtzr.lemmatize(token, 'v') for token in tokens]

In [15]:
# create tfidf matrix

# max_df float or int, default=1.0
# When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold 
# (corpus-specific stop words). If float in range [0.0, 1.0], the parameter represents a proportion of documents, integer 
# absolute counts. This parameter is ignored if vocabulary is not None.

# min_df float or int, default=1
# When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This 
# value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion 
# of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

# use_idf bool, default=True
# Enable inverse-document-frequency reweighting. If False, idf(t) = 1.

# tokenizer callable, default=None
# Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if 
# analyzer == 'word'.

pipeline = Pipeline([
    ('bow', TfidfVectorizer(min_df=0.001, 
                            max_df=0.999, 
                            use_idf=True, 
                            analyzer=text_process, 
                            stop_words='english',
                            ngram_range = (1,2))),
    ('tfidf', TfidfTransformer()),
    ('elastic net', Ridge(alpha=1.0))
])

In [16]:
x = df['text']
y = df['y']

In [17]:
pipeline.fit(x, y)

Pipeline(steps=[('bow',
                 TfidfVectorizer(analyzer=<function text_process at 0x177953b80>,
                                 max_df=0.999, min_df=0.001, ngram_range=(1, 2),
                                 stop_words='english')),
                ('tfidf', TfidfTransformer()), ('elastic net', Ridge())])

In [18]:
validation = pd.read_csv('data/validation_data.csv')

In [19]:
validation

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"
...,...,...,...
30103,461,I'm sorry. I'm not an admin. I will give you t...,get out my large penis
30104,527,I'm sorry. I'm not an admin. I will give you t...,get out my large penis
30105,352,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook
30106,311,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook


In [20]:
validation['less_toxic_score'] = pipeline.predict(validation['less_toxic'])
validation['more_toxic_score'] = pipeline.predict(validation['more_toxic'])

In [21]:
validation

Unnamed: 0,worker,less_toxic,more_toxic,less_toxic_score,more_toxic_score
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,0.422244,0.027073
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,0.009274,0.160461
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",-0.000397,0.016472
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0.078383,0.385736
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",0.002649,0.186877
...,...,...,...,...,...
30103,461,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,0.070443,0.290369
30104,527,I'm sorry. I'm not an admin. I will give you t...,get out my large penis,0.070443,0.290369
30105,352,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,0.013450,0.209693
30106,311,"wow...\nare you out of your mind, how was my e...",Piss off you slant eyed-gook,0.013450,0.209693
