In [14]:
# Import python libraries 
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import nltk
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.stem.snowball import SnowballStemmer

In [15]:
# Import test and train data
path = '/Users/zgalvin/Desktop/Kaggle Problem/train.csv'
pathTrain = '/Users/zgalvin/Desktop/Kaggle Problem/test.csv'

trainData = pd.read_csv(path, names=['ID', 'Text', 'Toxic','Severe_Toxic','Obscene', 'Threat', 'Insult', 'Identity_Hate'], low_memory = False, header=0)
testData = pd.read_csv(pathTrain, names=['ID','Text'],header=0,dtype={'ID': str})

trainData.head()

Unnamed: 0,ID,Text,Toxic,Severe_Toxic,Obscene,Threat,Insult,Identity_Hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [16]:
# Split the data up into X and y

# Start timer
start_time = time.time()

# Assign text to X and categories to y
X_train = trainData.Text
y_train = [trainData.Toxic, trainData.Severe_Toxic, trainData.Obscene, trainData.Threat, trainData.Insult, trainData.Identity_Hate]

# Assign text to X in the test set
X_test = testData.Text

elapsed_time = time.time() - start_time
print(elapsed_time)

0.005630970001220703


In [17]:
# Turn text to numbers

# Start timer
start_time = time.time()
'''
# Create Lemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
'''        

# Fit and transform X into tokenized word counts
count_vect = CountVectorizer(stop_words='english', min_df=.0002, max_df=.9)
X_train = count_vect.fit_transform(X_train)
print(count_vect.get_feature_names())

# Create sparse matrix of character counts for each comment
count_vect_char = CountVectorizer(stop_words='english', analyzer='char')
X_train_char = count_vect_char.fit_transform(trainData.Text)

# Implement term frequency–inverse document frequency on the word counts (this did not help accuracy)
# tf_transformer = TfidfTransformer()
# X_train = tf_transformer.fit_transform(X_train_counts)

elapsed_time = time.time() - start_time
print(elapsed_time)



22.30486798286438


In [18]:
# Transform X_test

# Start timer
start_time = time.time()

# Transform X for the training data into tokenized word counts
X_test = count_vect.transform(X_test)

# Transform X in testing data througb frequency–inverse document frequency on the word counts (this did not help accuracy)
# X_test = tf_transformer.transform(X_test)

elapsed_time = time.time() - start_time
print(elapsed_time)

7.025486707687378


In [19]:
# Function to choose model so that editing models is easier (less copy and pasting)

# Start timer
start_time = time.time()

def chooseModel(model,y):
    if model == "nb":
        # use MultinomialNB
        mlb = MultinomialNB()
        print(elapsed_time)
        return mlb
    elif model == "sgd":
        # use SGDClassifier 
        sgd = SGDClassifier()
        print(elapsed_time)
        
        '''# Define parameters to optimize
        params = dict(loss=['log','modified_huber', 'squared_hinge', \
                            'perceptron', 'squared_loss', 'huber', \
                            'epsilon_insensitive', 'squared_epsilon_insensitive'],\
                            penalty=['none','l2','l1','elasticnet'], \
                            alpha=(1e-2, 1e-3), fit_intercept=[True,False]  
                     )
        
        rand = RandomizedSearchCV(sgd, params, n_iter=5, random_state=5)
        rand.fit(X_train, y_train[y])
        
        # Print out the optimal parameters 
        print(rand.best_params_)
        
        # Instatiate new model with optimized parameters
        sgd = SGDClassifier(loss=rand.best_params_['loss'], penalty=rand.best_params_['penalty'])
        '''
        return sgd
    elif model == "randTree":
        # Use RandomForestClassifier
        rfc = RandomForestClassifier(n_estimators=10)
        print(elapsed_time)
        return rfc
    elif model == "decTree":
        # Use DecisionTreeClassifier
        dtc = DecisionTreeClassifier()
        print(elapsed_time)
        return dtc
    else:
        print("Sorry! We have not implemented that model yet.")


In [20]:
# Test model on each category

# Start timer
start_time = time.time()

# Identity_Hate
mlb = chooseModel(model="nb",y=5).fit(X_train, y_train[5])
predicted_Identity_Hate = pd.Series(mlb.predict(X_test))

# Insult
mlb = chooseModel(model="nb",y=4).fit(X_train, y_train[4])
predicted_Insult = pd.Series(mlb.predict(X_test))

# Threat
mlb = chooseModel(model="nb",y=3).fit(X_train, y_train[3])
predicted_Threat = pd.Series(mlb.predict(X_test))

# Obscene
mlb = chooseModel(model="nb",y=2).fit(X_train, y_train[2])
predicted_Obscene = pd.Series(mlb.predict(X_test))

# Severly Toxic
mlb = chooseModel(model="nb",y=1).fit(X_train, y_train[1])
predicted_Severly_Toxic = pd.Series(mlb.predict(X_test))

# Toxic
mlb = chooseModel(model="nb",y=0).fit(X_train, y_train[0])
predicted_Toxic = pd.Series(mlb.predict(X_test))
'''
cols = ['predicted_Toxic','predicted_Severely_Toxic','predicted_Obscene','predicted_Threat','predicted_insult','predicted_Identity_Hate']
for i in range(len(y_train)):
    mlb = MultinomialNB().fit(X_train, y_train[i])
    # Transform X and then predict
    if i == 0:
        X_test = count_vect.transform(X_test)
        X_test = tf_transformer.transform(X_test)
    cols[i] = pd.Series(mlb.predict(X_test))
'''
elapsed_time = time.time() - start_time
print(elapsed_time)

7.025486707687378
7.025486707687378
7.025486707687378
7.025486707687378
7.025486707687378
7.025486707687378
0.48932480812072754


In [21]:
# Write data to csv file in Kaggle format

# Start timer
start_time = time.time()

# Create list of column names (How Kaggle wants it)
columnNames = 'id','toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate'

# Concat the pandas series together to create a data frame
data = pd.concat([testData.ID,predicted_Toxic,predicted_Severly_Toxic,predicted_Obscene,predicted_Threat,predicted_Insult,predicted_Identity_Hate], axis=1)
data.head()

# Write data frame to the result.csv on desktop
data.to_csv(path_or_buf='/Users/zgalvin/Desktop/results.csv', index=False, header=columnNames)

elapsed_time = time.time() - start_time
print(elapsed_time)


1.0849940776824951
