In [1]:
import pandas as pd
import numpy as np
import re,string
from textacy import preprocessing
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from textblob import Word
from textblob import TextBlob 

In [2]:
# Read the four training shards in order, one DataFrame per CSV.
(
    train_translated_0_to_40000,
    train_translated_40000_to_80000,
    train_translated_80000_to_120000,
    train_translated_120000_to_end,
) = (
    pd.read_csv(shard_path)
    for shard_path in (
        'processed_data/train_translated_0_to_40000.csv',
        'processed_data/train_translated_40000_to_80000.csv',
        'processed_data/train_translated_80000_to_120000.csv',
        'processed_data/train_translated_120000_to_end.csv',
    )
)

# Test comments and the separate file holding their labels.
test, test_y = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_data/test_translated.csv', "original_data/test_labels.csv")
)

In [3]:
# Stack the four training shards row-wise; ignore_index gives the result a
# fresh 0..n-1 index instead of repeating each shard's own row numbers.
train_shards = [
    train_translated_0_to_40000,
    train_translated_40000_to_80000,
    train_translated_80000_to_120000,
    train_translated_120000_to_end,
]
train = pd.concat(train_shards, ignore_index=True, sort=False)

In [4]:
# In the test labels, a value of -1 indicates the row was not used for scoring.
# We remove those rows because they carry no actual label.
test_y = test_y[test_y['toxic'].ne(-1)]

### Data Preprocessing

In [5]:
# Peek at one raw comment (row label 30) before any cleaning is applied.
train.loc[30, 'comment_text']

"How could I post before the block expires?  The funny thing is, you think I'm being uncivil!"

In [6]:
# Token replacement table.
#
# Keys are compared for exact equality against single lower-cased,
# whitespace-delimited tokens, so every key must be a literal token: regex
# syntax such as r"\bu\b" is never interpreted and would never match.
# Each key appears exactly once; a repeated key in a dict literal is silently
# overridden by its last occurrence.
repl = {
    # --- positive interjections ---
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # --- emoticons ---
    # "&lt;" / "&gt;" are the HTML-escaped forms of "<" / ">".
    # NOTE(review): presumably the comments contain escaped entities — confirm
    # against the data, otherwise "<3" and ":>" are never replaced.
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # --- chat shorthand ---
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    # --- contractions ---
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

In [7]:
def normalize_tokens(comment):
    """Lower-case a comment token by token, drop links, expand known tokens.

    The comment is converted with ``str()`` (so a missing value such as NaN
    becomes the text "nan"), split on whitespace, and each token is
    lower-cased. Tokens that start with "http" or "www" are discarded. A token
    that is exactly equal to a key of ``repl`` is swapped for the mapped value;
    every other token is kept as is.

    Args:
        comment: Raw comment value; any object accepted by ``str()``.

    Returns:
        str: The surviving tokens, each followed by a single space (so a
        non-empty result ends with a trailing space).
    """
    pieces = []
    for token in str(comment).split():
        token = token.lower()
        if token.startswith('http') or token.startswith('www'):
            continue
        # Direct dict lookup: constant time per token, unlike scanning a key list.
        pieces.append(repl.get(token, token) + " ")
    return "".join(pieces)


# Apply the same normalisation to both splits and write the result back.
new_train_data = [normalize_tokens(comment) for comment in train["comment_text"]]
new_test_data = [normalize_tokens(comment) for comment in test["comment_text"]]
train["comment_text"] = new_train_data
test["comment_text"] = new_test_data

In [8]:
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def process_text(text):
    """Reduce one comment to its lower-case alphabetic content words.

    Steps, in order:
      1. Lower-case the text, so that step 4 does not delete capital letters.
      2. Substitute e-mail addresses, URLs and hashtags with the placeholders
         EMAIL / URL / TAG via textacy. The placeholders are upper-case and
         are therefore blanked again by step 4, so the net effect is that
         those spans are removed from the text.
      3. Replace '&' with 'and' and '@' with 'at' (no padding added), and
         delete '='.
      4. Replace every character other than a-z and the space with a space.
         This covers digits, punctuation and newlines in a single pass.
      5. Blank out every word of one to three characters.
      6. Collapse whitespace runs and tokenize with NLTK's ``word_tokenize``.
      7. Drop English stop words, then shorten any run of a repeated character
         in the remaining tokens to exactly two occurrences.

    Args:
        text: The comment to clean, as a ``str``.

    Returns:
        str: The surviving tokens joined by single spaces (possibly empty).
    """
    text = text.lower()

    # Placeholder substitution (see step 2 in the docstring for the net effect).
    text = preprocessing.replace.replace_emails(text, replace_with='EMAIL')
    text = preprocessing.replace.replace_urls(text, replace_with= 'URL')
    text = preprocessing.replace.replace_hashtags(text, replace_with= 'TAG')

    # Spell out / drop a few symbols before the alphabetic filter erases them.
    text = text.replace('&', 'and')
    text = text.replace('@', 'at')
    text = text.replace('=', '')

    # Keep only a-z and spaces. After this no newline or punctuation can
    # remain, so no separate newline or punctuation pass is needed.
    text = re.sub(r"[^a-z ]", " ", text)

    # Remove words of length 1-3, then normalize whitespace.
    text = re.sub(r"\b\w{1,3}\b", " ", text)
    text = re.sub(r'\s+', ' ', text)

    stop_words = _english_stop_words()
    kept_tokens = []
    for token in word_tokenize(text):
        # The stop-word test runs on the token as tokenized, before repeated
        # characters are squeezed.
        if token not in stop_words:
            kept_tokens.append(re.sub(r'(.)\1+', r'\1\1', token))
    return ' '.join(kept_tokens)

In [9]:
# Clean every comment in both splits first, then write the results back.
new_commentTrain = [process_text(comment) for comment in train['comment_text']]
new_commentTest = [process_text(comment) for comment in test['comment_text']]

train['comment_text'] = new_commentTrain
test['comment_text'] = new_commentTest

In [10]:
# Write the cleaned splits and the filtered test labels, without the index column.
for frame, destination in (
    (train, 'processed_data/train_translated_cleaned.csv'),
    (test, 'processed_data/test_translated_cleaned.csv'),
    (test_y, 'processed_data/test_labels_cleaned.csv'),
):
    frame.to_csv(destination, index=None)