In [1]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
import re

# Directory (relative to this notebook) that holds the raw CSV files.
PATH = '../../data/'

# Read both splits with one shared reader expression.
train, test = (pd.read_csv(PATH + split) for split in ('train.csv', 'test.csv'))

# Report the (rows, columns) shape of the training frame.
print(train.shape)

(159571, 8)


In [2]:
# Lookup table used to expand contractions and a few slang tokens.
# Keys are lowercase tokens; values are the replacement text.
# Each key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier value.
APO = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    # "'d" is expanded to "would" for every pronoun in this table.
    "i'd" : "i would",
    "i'll" : "i will",
    "i'm" : "i am",
    "im" : "i am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "i have",
    "ive" : "i have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    # Bare suffix token; the leading space is part of the replacement.
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'": "trying",
    "u" : "you",
    "r" : "are",
    "ur" : "you are",
    "fuckin" : "fucking"
}

In [4]:
# Shared NLTK helpers, created once here and used by clean() on every comment.
lem = WordNetLemmatizer()  # called in clean() with part-of-speech tags 'v' and 'n'
tok = TweetTokenizer()     # splits a comment string into a list of tokens

def clean(comment):
    """Normalize a raw comment into a lowercase, lemmatized, single-spaced string.

    Steps, in order: lowercase; collapse newlines to spaces; strip IPv4-style
    addresses and ``[[...]]`` links; drop ``= " , ~`` and turn hyphens into
    spaces; tokenize; drop URL tokens; expand tokens found in ``APO``;
    re-tokenize; lemmatize each token as a verb and then as a noun; replace
    slashes with spaces and squeeze repeated spaces.

    Args:
        comment: Raw comment text (str).

    Returns:
        The cleaned text, or ``'na'`` when nothing is left after cleaning.
    """
    comment = comment.lower()
    comment = re.sub(r'\n+', ' ', comment)
    # Remove leaky elements like IP addresses. The dots are escaped so that only
    # dotted quads match; an unescaped "." matches any character, which also
    # wiped ordinary runs of seven or more digits.
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', comment)
    # Remove usernames written as [[...]] links. The negated class keeps each
    # match inside a single link; a greedy ".*" ran to the last "]" of the
    # comment and deleted all the text in between.
    comment = re.sub(r'\[\[[^\]]*\]\]?', '', comment)
    comment = re.sub(r'[=",~]', '', comment)
    comment = re.sub(r'-', ' ', comment)
    text = tok.tokenize(comment)
    # Drop URL tokens, for both http and https schemes.
    text = [word for word in text if not re.match(r'https?://', word)]
    text = [APO.get(word, word) for word in text]
    # Tokenize again because an APO replacement can contain several words.
    text = tok.tokenize(' '.join(text))
    text = [lem.lemmatize(word, 'v') for word in text]
    text = [lem.lemmatize(word, 'n') for word in text]
    text = ' '.join(text).lower()
    text = re.sub(r'/', ' ', text)
    text = re.sub(r' +', ' ', text)
    # Never return an empty string; 'na' is the placeholder for empty text.
    if text == '':
        text = 'na'
    return text

def check(comment):
    """Return ``comment`` after TextBlob's ``correct()``, converted to ``str``."""
    corrected = TextBlob(comment).correct()
    return str(corrected)

def word_count(comment):
    """Return the number of whitespace-separated words in ``comment``."""
    words = comment.split()
    return len(words)
def unique_word_count(comment):
    """Return the number of distinct whitespace-separated words in ``comment``."""
    distinct = {word for word in comment.split()}
    return len(distinct)
def question_mark_count(comment):
    """Return how many question marks appear in ``comment``."""
    hits = re.findall(r'\?', comment)
    return len(hits)
def multi_question_mark_count(comment):
    """Return the number of runs of two or more question marks (e.g. ``??``)."""
    return sum(1 for _ in re.finditer(r'\?{2,}', comment))
def exclamation_mark_count(comment):
    """Return how many exclamation marks appear in ``comment``."""
    pattern = re.compile(r'!')
    return len(pattern.findall(comment))
def multi_exclamation_mark_count(comment):
    """Return the number of runs of two or more exclamation marks (e.g. ``!!``)."""
    runs = [match for match in re.finditer(r'!{2,}', comment)]
    return len(runs)
def uppercase_letter_count(comment):
    """Return the number of uppercase letters in the range A-Z in ``comment``."""
    capitals = re.compile(r'[A-Z]').findall(comment)
    return len(capitals)
def ellipsis_count(comment):
    """Return the number of ellipses, i.e. runs of three or more periods."""
    return sum(1 for _ in re.finditer(r'\.{3,}', comment))
def period_count(comment):
    """Return the number of period runs; a single period and an ellipsis each count once."""
    dot_runs = re.findall(r'\.+', comment)
    return len(dot_runs)
def parentheses_pair_count(comment):
    """Return the number of matched ``(`` ... ``)`` pairs in ``comment``.

    Every closing parenthesis that has an earlier unmatched opening parenthesis
    forms one pair, so ``"(a) and (b)"`` yields 2 and ``"((x))"`` yields 2.
    Unmatched parentheses, such as the one in ``":)"``, are ignored.

    A greedy regex ``\\(.*\\)`` is not used here because it spans from the first
    ``(`` to the last ``)`` on a line and therefore reports at most one match
    per line.
    """
    open_count = 0  # opening parentheses still waiting for a partner
    pairs = 0
    for char in comment:
        if char == '(':
            open_count += 1
        elif char == ')' and open_count:
            open_count -= 1
            pairs += 1
    return pairs
def special_symbol_count(comment):
    """Return how many of the symbols ``% # @ * & $`` appear in ``comment``."""
    symbols = re.findall(r'[\%\#\@\*\&\$]', comment)
    return len(symbols)
def sentence_count(comment):
    """Return the number of sentence breaks in ``comment``.

    A break is a run of one or more newlines, periods, question marks or
    exclamation marks. Each run counts once, so ``"..."`` or ``"!!"`` is one
    break. Runs of different characters are counted separately, so ``"?!"``
    is two.

    The quantifiers are applied to each alternative rather than written inside
    a character class. Inside ``[...]`` a ``+`` is a literal plus sign, which
    made ``"1+1"`` count as a break and stopped runs from collapsing.
    """
    return len(re.findall(r'\n+|\.+|\?+|!+', comment))

print('inplace na')
# Assign the filled column back instead of calling fillna(inplace=True) on
# train['comment_text']. That chained in-place call works on an intermediate
# Series; pandas 2.2 warns about it, and with Copy-on-Write enabled it leaves
# the DataFrame unchanged.
train['comment_text'] = train['comment_text'].fillna('na')
test['comment_text'] = test['comment_text'].fillna('na')

# Add a normalized copy of every comment (see clean()).
print('comment text cleaning')
train['comment_text_cleaned'] = train['comment_text'].apply(clean)
test['comment_text_cleaned'] = test['comment_text'].apply(clean)

# Add a spell-corrected copy of the cleaned text (see check()).
print('correct train')
train['comment_text_correct'] = train['comment_text_cleaned'].apply(check)
print('correct test')
test['comment_text_correct'] = test['comment_text_cleaned'].apply(check)

# print('word count')
# train['word_count'] = train['comment_text'].apply(word_count)
# test['word_count'] = test['comment_text'].apply(word_count)
# train['cleaned_word_count'] = train['comment_text_cleaned'].apply(word_count)
# test['cleaned_word_count'] = test['comment_text_cleaned'].apply(word_count)

# print('unique word count')
# train['unique_word_count'] = train['comment_text'].apply(unique_word_count)
# test['unique_word_count'] = test['comment_text'].apply(unique_word_count)
# train['cleaned_unique_word_count'] = train['comment_text_cleaned'].apply(unique_word_count)
# test['cleaned_unique_word_count'] = test['comment_text_cleaned'].apply(unique_word_count)

# print('question marks')
# train['question_marks'] = train['comment_text'].apply(question_mark_count)
# test['question_marks'] = test['comment_text'].apply(question_mark_count)

# print('consecutive question marks')
# train['consecutive_question_marks'] = train['comment_text'].apply(multi_question_mark_count)
# test['consecutive_question_marks'] = test['comment_text'].apply(multi_question_mark_count)

# print('exclamation marks')
# train['exclamation_marks'] = train['comment_text'].apply(exclamation_mark_count)
# test['exclamation_marks'] = test['comment_text'].apply(exclamation_mark_count)

# print('consecutive exclamation marks')
# train['consecutive_exclamation_marks'] = train['comment_text'].apply(multi_exclamation_mark_count)
# test['consecutive_exclamation_marks'] = test['comment_text'].apply(multi_exclamation_mark_count)

# print('uppercase letters')
# train['uppercase_letters'] = train['comment_text'].apply(uppercase_letter_count)
# test['uppercase_letters'] = test['comment_text'].apply(uppercase_letter_count)

# print('ellipsis')
# train['ellipsis'] = train['comment_text'].apply(ellipsis_count)
# test['ellipsis'] = test['comment_text'].apply(ellipsis_count)

# print('period and ellipsis')
# train['period'] = train['comment_text'].apply(period_count)
# test['period'] = test['comment_text'].apply(period_count)

# print('parentheses pairs')
# train['parentheses_pair'] = train['comment_text'].apply(parentheses_pair_count)
# test['parentheses_pair'] = test['comment_text'].apply(parentheses_pair_count)

# print('special symbols')
# train['special_symbol'] = train['comment_text'].apply(special_symbol_count)
# test['special_symbol'] = test['comment_text'].apply(special_symbol_count)

# print('sentence count')
# train['sentence'] = train['comment_text'].apply(sentence_count)
# test['sentence'] = test['comment_text'].apply(sentence_count)

# print('upper_word_ratio')
# train['upper_word_ratio'] = train['uppercase_letters'] / (train['word_count'] + 1)
# test['upper_word_ratio'] = test['uppercase_letters'] / (test['word_count'] + 1)

# print('unique_word_ratio')
# train['unique_word_ratio'] = train['unique_word_count'] / (train['word_count'] + 1)
# test['unique_word_ratio'] = test['unique_word_count'] / (test['word_count'] + 1)

# print('mark_count_ratio')
# train['mark_count_ratio'] = (train['question_marks']+train['exclamation_marks']+train['special_symbol'])\
#                             /(train['word_count'] + 1)
# test['mark_count_ratio'] = (test['question_marks']+test['exclamation_marks']+test['special_symbol'])\
#                             /(test['word_count'] + 1)

# Print the column labels currently present on the training frame.
print(train.columns.values)

inplace na
comment text cleaning


KeyboardInterrupt: 

In [None]:
# Write both processed frames into the data directory, train first.
for frame, filename in ((train, 'cleaned_train.csv'), (test, 'cleaned_test.csv')):
    frame.to_csv(PATH + filename)
print('done')