In [0]:
import os
import re
import csv
import sys
import numpy as np
import pandas as pd
import spacy
import operator
import nltk
import multiprocessing as mp
from translate import Translator
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
#nltk.download('stopwords')

In [0]:
# Load the train and test splits from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [0]:
# Row counts of both splits, kept in module-level names and echoed as a sanity check.
nrow_train, nrow_test = (frame.shape[0] for frame in (train, test))
print("train:", nrow_train, "rows")
print("test :", nrow_test, "rows")

train: 159571 rows
test : 153164 rows


In [0]:
# Check for null values in train and test
#train.isnull().any()
#test.isnull().any()

In [0]:
# Preview the first rows of the training frame as loaded.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Data Cleaning

#### 1. Lowercase 2. Fix typos 3. Remove numbers, odd characters, stopwords, images, templates 4. Replace numbers with words

In [0]:
# Load the cleaned words
# Load the typo -> correction table consumed by the text-cleaning step.
# Every non-blank line of the file must have the form "<typo>,<correct>".
cl_path = 'cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line_no, line in enumerate(cl, start=1):
        line = line.strip('\n')
        if not line:
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        fields = line.split(',')
        if len(fields) != 2:
            # Same exception type as a failed tuple unpack, but it says where.
            raise ValueError(
                "%s:%d: expected 'typo,correct', got %r" % (cl_path, line_no, line)
            )
        typo, correct = fields
        clean_word_dict[typo] = correct

In [0]:
stop_words = stopwords.words('english')

# Wiki markup that is dropped wholesale (each match becomes a single space).
# These patterns need ':', '.', '#' and digits to be intact, so they have to run
# BEFORE punctuation is padded/removed and before digits are spelled out.
# The template patterns use a greedy '.*', so on one line everything up to the
# last ']' (or '|') is removed.
_WIKI_MARKUP_PATTERNS = (
    # Drop the image
    r"image:[a-zA-Z0-9]*\.jpg",
    r"image:[a-zA-Z0-9]*\.png",
    r"image:[a-zA-Z0-9]*\.gif",
    r"image:[a-zA-Z0-9]*\.bmp",
    # Drop CSS colours and wiki tables
    r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})",
    r"\{\|[^\}]*\|\}",
    # Clean templates
    r"\[?\[user:.*\]",
    r"\[?\[user:.*\|",
    r"\[?\[wikipedia:.*\]",
    r"\[?\[wikipedia:.*\|",
    r"\[?\[special:.*\]",
    r"\[?\[special:.*\|",
    r"\[?\[category:.*\]",
    r"\[?\[category:.*\|",
)

# Ordered (pattern, replacement) rewrites; later rules depend on earlier ones
# (e.g. "can't" must be handled before the generic "n't"), so keep the order.
_TEXT_RULES = (
    # Clean other words
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\n", " "),
    (r"&", "and"),
    (r"@", "at"),
    # Words that have a special symbol in between them
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Punctuation: either dropped or padded so it becomes its own token
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\?", " ? "),
    (r"\"", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"; the word boundary keeps units such as "5kg" untouched.
    (r"(\d+)k\b", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # NOTE(review): in a regex "\0" is a NUL escape, so this only matches a NUL
    # byte followed by "s"; presumably "0s" -> "0" was intended. Left unchanged
    # because the intent cannot be confirmed from here.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)

# Replace numbers with language
_DIGIT_WORDS = (
    ('0', 'zero'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
    ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine'),
)


def clean_text(text):
    """Normalise one comment string for downstream tokenisation.

    Steps, in order: lowercase; remove URLs and IPv4 addresses; remove wiki
    markup (images, CSS colours, tables, namespaced templates); drop
    whitespace-separated tokens found in the module-level ``stop_words``;
    apply every ``typo -> correct`` regex from the module-level
    ``clean_word_dict``; expand contractions and pad or strip punctuation;
    spell out each digit; collapse runs of whitespace.

    Args:
        text: the comment as a ``str`` (anything without ``.lower()`` raises
            ``AttributeError``).

    Returns:
        The cleaned string.
    """
    text = text.lower()

    # Remove url links and ip addresses
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    # Remove wiki markup while its delimiters are still present.
    for pattern in _WIKI_MARKUP_PATTERNS:
        text = re.sub(pattern, " ", text)

    # Remove stopwords. The set is built per call so later edits to the
    # module-level list are honoured while membership tests stay O(1).
    # Tokens are split on whitespace only, so a stop word with punctuation
    # attached (e.g. "you,") is kept.
    known_stop_words = set(stop_words)
    text = " ".join(w for w in text.split() if w not in known_stop_words)

    # Clean typo(NOT misspelling); each key is used as a regex pattern.
    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    for pattern, replacement in _TEXT_RULES:
        text = re.sub(pattern, replacement, text)

    for digit, word in _DIGIT_WORDS:
        text = text.replace(digit, " " + word + " ")

    # Digit expansion pads with spaces, so collapse whitespace once more.
    return re.sub(r"\s{2,}", " ", text)

In [0]:
# Raw (uncleaned) comments with missing values filled, kept as plain arrays.
list_sentences_train = train["comment_text"].fillna("no comment").values
list_sentences_test = test["comment_text"].fillna("no comment").values

# Fill missing comments before cleaning: clean_text calls .lower() on its
# argument, which fails on the float NaN pandas uses for empty cells.
train['comment_text'] = train['comment_text'].fillna("no comment").apply(clean_text)
test['comment_text'] = test['comment_text'].fillna("no comment").apply(clean_text)

In [0]:
# Preview the training frame after clean_text has been applied to comment_text.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edits made username hardcore metal...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww ! matches background colour i am seeming...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really trying edit war guy consta...,0,0,0,0,0,0
3,0001b41b1c6bb37e,cannot make real suggestions improvement - wo...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir hero chance remember page that on ?,0,0,0,0,0,0


### Translation

In [0]:
# Translate every comment to English with the `translate` package.
# Only the target language is set; the source language is left at the
# Translator default.
# NOTE(review): translate() is invoked once per row through apply — presumably
# one remote request per comment; confirm provider quotas and runtime before
# running this on the full train/test sets.
translator = Translator(to_lang = 'en')
train['comment_text'] = train['comment_text'].apply(translator.translate)
test['comment_text'] = test['comment_text'].apply(translator.translate)

### Lemmatization

In [0]:
def lemmatize(text):
    """Return `text` with each token replaced by its whitespace-stripped lemma.

    The string is parsed with the module-level `nlp` pipeline (which must be
    defined before this function is called) and the lemmas are joined with
    single spaces.
    """
    return " ".join(token.lemma_.strip() for token in nlp(text))

In [0]:
# Load the spaCy English pipeline into the module-level `nlp` used by
# lemmatize(), then lemmatize every comment in both splits.
# NOTE(review): 'en' is a shortcut model name; presumably it has to be linked to
# an installed English model in this environment — confirm.
nlp = spacy.load('en')
train['comment_text'] = train['comment_text'].apply(lemmatize)
test['comment_text'] = test['comment_text'].apply(lemmatize)

### Stemming - might be too aggressive to use

In [0]:
#def stemming(text):
#    text = text.split()
#    stemmer = SnowballStemmer('english')
#    stemmed_words = [stemmer.stem(word) for word in text]
#    text = " ".join(stemmed_words)
#    return text

In [0]:
#train['comment_text'] = train['comment_text'].apply(stemming)
#test['comment_text'] = test['comment_text'].apply(stemming)

### Pos taggings
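Generate the part-of-speech (POS) tags for every comment, so that the word embedding and the POS embedding can later be concatenated into a single representation.
Note: the code below tags with NLTK (`word_tokenize` + `pos_tag`), not TextBlob as originally described. The original rationale was that TextBlob drops some tokens and punctuation when generating the POS sequences, which may give our models another view of the text.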

In [0]:
def get_pos(x):
    """Tokenize `x` with NLTK's word tokenizer and return `nltk.pos_tag` of the tokens."""
    return nltk.pos_tag(nltk.tokenize.word_tokenize(x))

In [0]:
# POS-tag both splits in parallel with 4 worker processes.
# Leaving the `with` block terminates the pool, also when one of the map calls
# raises, so no worker processes are left behind on failure.
with mp.Pool(4) as pool:
    train_pos = pool.map(get_pos, train['comment_text'])
    test_pos = pool.map(get_pos, test['comment_text'])

In [0]:
# save results
train_pos = pd.DataFrame({'pos': train_pos})
train_pos.to_csv('train_pos.csv', index=False)
test_pos = pd.DataFrame({'pos': test_pos})
test_pos.to_csv('test_pos.csv', index=False)

### Feature engineering

In [0]:
def add_features(df):
    """Add length, casing and vocabulary statistics of `comment_text` to `df`.

    The frame is modified in place and also returned. Columns written:

    - comment_text: every value coerced to ``str``
    - total_length: character count + 1 (the +1 keeps the ratio denominator non-zero)
    - capitals: number of uppercase characters
    - caps_vs_length: capitals / total_length
    - num_words: number of whitespace-separated tokens
    - num_unique_words: number of distinct whitespace-separated tokens
    - words_vs_unique: num_unique_words / num_words (0 when there are no words)

    NOTE(review): in this notebook the function runs after clean_text, which
    lowercases the text, so `capitals` can only count uppercase characters
    introduced by later steps — confirm that this is intended.

    Args:
        df: DataFrame with a `comment_text` column.

    Returns:
        The same DataFrame object with the columns above added.
    """
    comments = df['comment_text'].apply(str)
    df['comment_text'] = comments
    df['total_length'] = comments.str.len() + 1
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division instead of a row-by-row df.apply(axis=1).
    df['caps_vs_length'] = df['capitals'] / df['total_length']
    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    df['num_words'] = comments.str.count(r'\S+')
    df['num_unique_words'] = comments.apply(lambda comment: len(set(comment.split())))
    # 0 / 0 for empty comments yields NaN, which is mapped to 0 below.
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    ratio_cols = ['caps_vs_length', 'words_vs_unique']
    df[ratio_cols] = df[ratio_cols].fillna(0)
    return df

In [0]:
# Add the engineered feature columns to both splits.
train = add_features(train)
test = add_features(test)

In [0]:
# Preview the training frame after add_features has run.
train.head()

In [0]:
# Write the train frame to clean_train.csv. No index argument is passed here,
# unlike the POS files, which are saved with index=False.
train.to_csv('clean_train.csv')

In [0]:
# Write the test frame to clean_test.csv. No index argument is passed here,
# unlike the POS files, which are saved with index=False.
test.to_csv('clean_test.csv')