# Preprocessing comments

- multi-task classification with 6 classes
- training set size: 159,571
- only a small fraction of the 159,571 training examples is associated with at least one of these classes

- max number of terms in a single comment is 1403

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
# Shared NLP helpers, created once and reused by the processText* functions below.
tokenizer = TreebankWordTokenizer()   # splits a comment into word tokens
lemmatizer = WordNetLemmatizer()      # used by the "2.0" preprocessing
stemmer = EnglishStemmer()            # Snowball stemmer, used by the "1.0" preprocessing

### Load training and test data

In [3]:
# Read the raw training and test splits from the local data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

## Processed data 1.0

- Tokenization (TreebankWordTokenizer)
- Stemming with the Snowball stemmer for English

In [4]:
def processText1_0(inpt):
    """Lower-case, tokenize and stem a single comment.

    The text is tokenized with the module-level ``tokenizer``
    (TreebankWordTokenizer) and each token is reduced to its stem with the
    module-level ``stemmer`` (Snowball EnglishStemmer).

    Parameters
    ----------
    inpt : str or bytes
        The raw comment. Byte strings are decoded as UTF-8.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Decode once, up front, and only when we actually hold bytes. Calling
    # .decode() on every token fails on Python 3 (str has no .decode), and
    # lower-casing raw UTF-8 bytes leaves non-ASCII letters untouched.
    if isinstance(inpt, bytes):
        inpt=inpt.decode('utf8')
    tokens=tokenizer.tokenize(inpt.lower())
    stems=[stemmer.stem(token) for token in tokens]
    return ' '.join(stems)
    
# Replace the comment text of both splits with its stemmed form.
train['comment_text'] = train['comment_text'].apply(processText1_0)
test['comment_text'] = test['comment_text'].apply(processText1_0)

In [5]:
# Persist the stemmed splits. to_csv keeps its default index=True here, so the
# row index is written as the first (unnamed) column of each file.
train.to_csv(path_or_buf='data/train_processed_1.0.csv', encoding='utf-8')
test.to_csv(path_or_buf='data/test_processed_1.0.csv', encoding='utf-8')

## Processed data 2.0

- Tokenization (TreebankWordTokenizer)
- Lemmatization with the WordNet lemmatizer

In [4]:
def processText2_0(inpt):
    """Lower-case, tokenize and lemmatize a single comment.

    The text is tokenized with the module-level ``tokenizer``
    (TreebankWordTokenizer) and each token is passed through the module-level
    ``lemmatizer`` (WordNetLemmatizer) with its default arguments.

    Parameters
    ----------
    inpt : str or bytes
        The raw comment. Byte strings are decoded as UTF-8.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # Decode once, up front, and only when we actually hold bytes. Calling
    # .decode() on every token fails on Python 3 (str has no .decode), and
    # lower-casing raw UTF-8 bytes leaves non-ASCII letters untouched.
    if isinstance(inpt, bytes):
        inpt=inpt.decode('utf8')
    tokens=tokenizer.tokenize(inpt.lower())
    lemmas=[lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)
    
# Re-read the raw CSVs before lemmatizing: when the notebook is run top to
# bottom, `train` and `test` have already been overwritten with stemmed text by
# the "Processed data 1.0" section, and lemmatizing stems would not produce the
# tokenization + lemmatization output this section describes.
train=pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')
train.comment_text=train.comment_text.apply(processText2_0)
test.comment_text=test.comment_text.apply(processText2_0)

In [5]:
# Persist the lemmatized splits. to_csv keeps its default index=True here, so
# the row index is written as the first (unnamed) column of each file.
train.to_csv(path_or_buf='data/train_processed_2.0.csv', encoding='utf-8')
test.to_csv(path_or_buf='data/test_processed_2.0.csv', encoding='utf-8')