In [1]:
import pandas as pd
import numpy as np
import string
import re
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Read the training set; the path is resolved relative to the current working directory.
train = pd.read_csv(filepath_or_buffer='../train.csv')

In [3]:
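# Optional length filter (currently disabled): keep only comments shorter than
# 1000 characters. Uncomment the three lines below to enable it.
# train['char_total'] = train['comment_text'].map(lambda x: len(x))
# train = train[train['char_total']<1000]
# train.drop(['char_total'], axis=1, inplace=True)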

In [4]:
def preprocess(comment):
    """Normalize a raw comment string before tokenization.

    The steps run in order: lowercase the text, replace newlines with spaces,
    blank out wiki user references, IP addresses, http/https links, email
    addresses, ``wp:`` fragments and the auto-generated "unsigned comment"
    phrase, and finally delete every punctuation character except the
    apostrophe. Each removed fragment is replaced by a single space so the
    surrounding words do not get glued together.

    Parameters
    ----------
    comment : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned, lowercased comment.
    """
    # lower everything
    comment = comment.lower()
    # get rid of new line symbols
    comment = re.sub(r'\n', ' ', comment)
    # remove user:: fragments
    comment = re.sub(r"user::\w*", ' ', comment)
    # remove anything with user
    # NOTE: newlines were already replaced above, so `.*` consumes everything
    # from "[[user" through the end of the comment.
    comment = re.sub(r"\[\[user.*", ' ', comment)
    # remove IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", ' ', comment)
    # remove http links
    comment = re.sub(r"(http://.*?\s)|(http://.*)", ' ', comment)
    # remove https links
    comment = re.sub(r"(https://.*?\s)|(https://.*)", ' ', comment)
    # remove email addresses
    # The pattern is deliberately NOT anchored with ^...$: an anchored pattern
    # only matches when the whole comment is a single address, so addresses
    # embedded in a sentence would never be removed.
    comment = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", ' ', comment)
    # remove wp: fragments
    comment = re.sub(r"wp:\w*", ' ', comment)
    # remove these auto generated strings
    comment = re.sub(r"preceding unsigned comment added by", ' ', comment)
    # remove all punctuation besides '
    punctuation_except_apostrophe = string.punctuation.replace("'", '')
    comment = comment.translate(str.maketrans('', '', punctuation_except_apostrophe))
    return comment

In [5]:
# Clean every comment element-wise with preprocess() and store the result back in place.
train['comment_text'] = train['comment_text'].map(preprocess)

In [6]:
# Label columns are everything after the first two columns. 'total' is excluded
# explicitly so that re-running this cell does not fold a previously computed
# total back into the sum (which would inflate it).
labels = [col for col in train.columns[2:] if col != 'total']
# Row-wise sum of the label columns (presumably 0/1 indicators — TODO confirm),
# so total == 0 means no label is set for that comment.
train['total'] = train[labels].sum(axis=1)

In [7]:
# Partition the comments by their label total: zero labels -> clean, one or more -> toxic.
clean_comments_df = train.loc[train['total'].eq(0)]
toxic_comments_df = train.loc[train['total'].gt(0)]

In [8]:
# Tokenize each comment with NLTK's TweetTokenizer (case not preserved, reduce_len enabled).
# The result is one list of tokens per comment, kept separately for each group.
twt_tknzr = TweetTokenizer(reduce_len=True, preserve_case=False)
clean_tokens = list(map(twt_tknzr.tokenize, clean_comments_df['comment_text']))
toxic_tokens = list(map(twt_tknzr.tokenize, toxic_comments_df['comment_text']))

In [9]:
# Collapse each group into one long space-separated string: tokens are joined
# per comment, then the comments are joined together.
clean_for_tfidf = ' '.join(' '.join(tokens) for tokens in clean_tokens)
toxic_for_tfidf = ' '.join(' '.join(tokens) for tokens in toxic_tokens)

In [10]:
# The two "documents" handed to TF-IDF: index 0 is the clean text, index 1 the toxic text.
corpi = [
    clean_for_tfidf,
    toxic_for_tfidf,
]

In [11]:
# Stop list = NLTK English stop words followed by every single punctuation character.
stops = [*stopwords.words('english'), *string.punctuation]
# Score bigrams only (ngram_range=(2, 2)) across the two-document corpus.
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words=stops)
corpi_tfidf = vectorizer.fit_transform(corpi)

In [12]:
# Top-10 bigrams by TF-IDF weight for the clean text (row 0 of the TF-IDF matrix).
clean_corpus = corpi_tfidf[0]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# .T turns the 1 x n_features row into an n_features x 1 column; toarray()
# yields a plain ndarray instead of the np.matrix that todense() returns.
df = pd.DataFrame(clean_corpus.T.toarray(), index=feature_names, columns=['tfidf'])
# Keep only bigrams that actually occur in this document.
df = df[df['tfidf'] > 0]
df.sort_values(by=["tfidf"], ascending=False).head(10)

Unnamed: 0,tfidf
talk page,0.550088
speedy deletion,0.191796
would like,0.154218
fair use,0.118919
feel free,0.10116
blocked editing,0.100985
please stop,0.09125
talk pages,0.091162
criteria speedy,0.08856
talk contribs,0.083138


In [13]:
# Top-10 bigrams by TF-IDF weight for the toxic text (row 1 of the TF-IDF matrix).
toxic_corpus = corpi_tfidf[1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
# .T turns the 1 x n_features row into an n_features x 1 column; toarray()
# yields a plain ndarray instead of the np.matrix that todense() returns.
df = pd.DataFrame(toxic_corpus.T.toarray(), index=feature_names, columns=['tfidf'])
# Keep only bigrams that actually occur in this document.
df = df[df['tfidf'] > 0]
df.sort_values(by=["tfidf"], ascending=False).head(10)

Unnamed: 0,tfidf
fuck fuck,0.327701
moron hi,0.224609
hi moron,0.224153
nigger nigger,0.218319
jew fat,0.187911
fat jew,0.186692
shit shit,0.175119
ass ass,0.171617
suck suck,0.153648
bark bark,0.152125
