In [1]:
import sys
import time
import pickle
import pandas as pd
import numpy as np
from langdetect import detect
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from dictionary import contraction_map, unnecessary_patterns

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.externals import joblib
from sklearn.model_selection import KFold, cross_val_score

# 5-fold cross-validation splitter; shuffling with a fixed random_state keeps
# the fold assignment reproducible between runs.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# English stop-word list from NLTK and a WordNet lemmatizer; both are read as
# globals by tokenizer().
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

In [2]:
# Load the train/test frames and the sample submission from the local
# data/ directory (paths are relative to the notebook's working directory).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

# Label matrix: every column of `train` from the third one onward, as int32.
# (Presumably the first two columns are the id and the comment text -- confirm
# against the CSV header.)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns with .iloc and taking .values yields the same array
# on both old and new pandas versions.
train_y = train.iloc[:, 2:].values.astype('int32')

In [36]:
# Single alternation regex matching any key of contraction_map. The keys are
# joined verbatim (not escaped), so they are interpreted as regex syntax.
contraction_object = re.compile("|".join(contraction_map))
# Alternation of every entry in unnecessary_patterns; cleaning_text() uses it
# as its default pattern.
sub_patterns = "|".join(unnecessary_patterns)

def expand_contraction(sentence, sub_map=contraction_map, sub_object=contraction_object):
    """Replace every substring matched by `sub_object` with its `sub_map` entry.

    Parameters
    ----------
    sentence : str
        Text to scan.
    sub_map : mapping
        Lookup from matched text to its replacement (default: contraction_map).
    sub_object : compiled regular expression
        Pattern that locates the substrings to replace
        (default: contraction_object).

    Returns
    -------
    str
        `sentence` with each match substituted by ``sub_map[matched_text]``.

    Raises
    ------
    KeyError
        If a matched substring is not a key of `sub_map`.
    """
    # The exact matched text is used as the lookup key.
    return sub_object.sub(lambda found: sub_map[found.group(0)], sentence)


def cleaning_text(text, sub_patterns=sub_patterns):
    """Blank out unwanted patterns and replace digit runs with a NUM token.

    Parameters
    ----------
    text : str
        Text to clean.
    sub_patterns : str
        Regular expression whose matches are replaced by a single space
        (default: the module-level sub_patterns).

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace removed.
    """
    without_noise = re.sub(sub_patterns, ' ', text)
    # Each maximal run of ASCII digits collapses to the literal token 'NUM'.
    with_num_token = re.sub('[0-9]+', 'NUM', without_noise)
    return with_num_token.strip()


def tokenizer(sentence):
    """Tokenize `sentence`, drop stop words and lemmatize the remaining tokens.

    Parameters
    ----------
    sentence : str
        Text to tokenize with NLTK's word_tokenize.

    Returns
    -------
    list
        ``wnl.lemmatize(token)`` for every token that is not in the
        module-level `stop_words`, in original order.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        # The stop-word test is applied to the raw token, before lemmatization.
        if token in stop_words:
            continue
        lemmas.append(wnl.lemmatize(token))
    return lemmas


def preprocessing(dataset):
    """Lower-case, expand and clean every comment of `dataset`.

    Parameters
    ----------
    dataset : object with a `comment_text` attribute
        Iterating `dataset.comment_text` must yield strings.

    Returns
    -------
    list of str
        One cleaned comment per input comment, in the same order.
    """
    # Lower-casing happens before contraction expansion and cleaning.
    return [
        cleaning_text(expand_contraction(raw_comment.lower()))
        for raw_comment in dataset.comment_text
    ]


def persistence(fname, mode='load', obj=None):
    """Load a pickled object from `fname`, or pickle `obj` into it.

    Parameters
    ----------
    fname : str
        Path of the pickle file.
    mode : {'load', 'save'}
        'load' reads and returns the stored object; 'save' writes `obj`.
    obj : object, optional
        Object to store when `mode` is 'save'. When it is None nothing is
        written and the call is a no-op.

    Returns
    -------
    object or None
        The unpickled object in 'load' mode, otherwise None.

    Raises
    ------
    ValueError
        If `mode` is neither 'load' nor 'save'. A mistyped mode previously
        did nothing and returned None.
    """
    if mode == 'load':
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserializing; only open files that come from a trusted source.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    if mode == 'save':
        # Saving None is deliberately skipped so the target file stays untouched.
        if obj is not None:
            with open(fname, 'wb') as f:
                pickle.dump(obj, f)
        return None
    raise ValueError("mode must be 'load' or 'save', got {!r}".format(mode))
            

def make_submission(sample_sub, fname, prediction):
    """Write `prediction` to 'submissions/<fname>.csv' in the layout of `sample_sub`.

    Parameters
    ----------
    sample_sub : pandas.DataFrame
        Template frame. Its 'id' column becomes the index of the output and
        all of its columns after the first become the output column names.
    fname : str
        File name without extension; the file is written below 'submissions/'
        relative to the current working directory.
    prediction : array-like
        Values with one row per id and one column per output column.

    Returns
    -------
    None
    """
    # Read the layout from the `sample_sub` argument; the body used to read the
    # notebook-level `sample` global, which left the parameter unused.
    idx = sample_sub['id']
    columns = sample_sub.columns.tolist()[1:]
    sub = pd.DataFrame(prediction, index=idx, columns=columns)
    # index=True writes the ids as the first CSV column.
    sub.to_csv('submissions/{}.csv'.format(fname), index=True)

In [4]:
# Load previously pickled objects for the train and test comments through
# persistence() instead of recomputing them.
# NOTE(review): presumably these hold earlier preprocessing() output -- confirm
# the files were produced by the current version of the cleaning functions.
train_comment = persistence('train_comment.pkl', 'load')
test_comment = persistence('test_comment.pkl', 'load')

In [37]:
# Recompute the cleaned training comments from the raw frame. This rebinds
# train_comment, replacing any value bound earlier (for example the one loaded
# from 'train_comment.pkl').
train_comment = preprocessing(train)

## GloVe Embedding

In [5]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

# Location of the pre-trained GloVe vector file.
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run anywhere else.
# NOTE(review): datapath() looks intended for gensim's bundled test data --
# confirm it is needed when the argument is already an absolute path.
glove_file = datapath('/Users/youncheol/Documents/projects/toxic-comment-classification-challenge/embedding/glove.twitter.27B.100d.txt')
# Temporary file that receives the converted vectors.
tmp_file = get_tmpfile('glove_model.txt')

# Convert the GloVe file into word2vec text format and write it to tmp_file.
glove2word2vec(glove_file, tmp_file)

# Load the converted vectors as a KeyedVectors model.
glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# glove_model.vocab['motherfucker'].index
# glove_model.index2word[0]

In [None]:
# One token list per cleaned training comment, in the same order as train_comment.
tokenized_train = [tokenizer(comment) for comment in train_comment]

In [85]:
# Number of entries in the GloVe vocabulary.
vocab_size = len(glove_model.vocab)
# Number of tokenized training comments.
sample_size = len(tokenized_train)
# Median number of tokens per comment. np.median returns a float (and x.5 for
# an even count with differing middle values), but the value is used as an
# array dimension, so truncate it to a plain int.
median_length = int(np.median(list(map(len, tokenized_train))))

In [77]:
# Integer id matrix: one row per comment, one column per token position.
# Positions beyond the end of a short comment keep the initial value 0.
# int() guards against median_length being a float (np.median returns one),
# which np.zeros rejects as a dimension.
train_x = np.zeros((sample_size, int(median_length)), dtype='int32')

for i, sentence in enumerate(tokenized_train):
    # Truncate each comment to the row width. The cut-off used to be the
    # hard-coded `j <= 17`, which overflowed rows narrower than 18 columns and
    # left the tail of wider rows unused.
    for j, word in enumerate(sentence[:train_x.shape[1]]):
        try:
            train_x[i, j] = glove_model.vocab[word].index
        except KeyError:
            # Token missing from the GloVe vocabulary: use vocab_size, the id
            # one past the largest vocabulary index, as the unknown-word id.
            train_x[i, j] = vocab_size