In [222]:
# Import python libraries 
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import nltk
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack
from sklearn.svm import SVC

In [223]:
# Load the labelled training set and the unlabelled test set from CSV.
# NOTE: despite its name, `pathTrain` holds the location of the *test* file.
path = '/Users/zgalvin/Desktop/Kaggle Problem/train.csv'
pathTrain = '/Users/zgalvin/Desktop/Kaggle Problem/test.csv'

# header=0 combined with names= replaces each file's own header row with the
# column names listed here.
trainData = pd.read_csv(
    path,
    header=0,
    names=[
        'ID', 'Text',
        'Toxic', 'Severe_Toxic', 'Obscene', 'Threat', 'Insult', 'Identity_Hate',
    ],
    low_memory=False,
)
testData = pd.read_csv(
    pathTrain,
    header=0,
    names=['ID', 'Text'],
    dtype={'ID': str},  # keep the IDs as strings
)

# Only the last expression of a cell is echoed, so the preview on the next line
# is computed but not shown; the cell's visible output is the type.
trainData.head()
type(trainData)

pandas.core.frame.DataFrame

In [224]:
# Separate the comment text (features) from the six label columns (targets)

# Start timer
start_time = time.time()

# X_train is the raw text column; y_train is a list with one label Series per
# category, in the order that the rest of the notebook indexes as 0-5
X_train = trainData['Text']
y_train = [
    trainData[label]
    for label in ('Toxic', 'Severe_Toxic', 'Obscene', 'Threat', 'Insult', 'Identity_Hate')
]

# Raw text column of the test set
X_test = testData['Text']

# Report how long the cell took, in seconds
elapsed_time = time.time() - start_time
print(elapsed_time)

0.007706165313720703


In [225]:
# Turn text to numbers

# Start timer
start_time = time.time()

# Fit the vocabulary on the raw training text and convert the text into a sparse
# matrix of token counts. English stop words are dropped; with float values,
# min_df / max_df are document-frequency proportions, so tokens found in fewer
# than 0.02% or more than 90% of the comments are ignored.
#
# The text is read from trainData.Text rather than from X_train because this
# cell overwrites X_train: reading the raw column keeps the cell re-runnable
# (feeding the already-vectorized matrix back into fit_transform would fail).
count_vect = CountVectorizer(stop_words='english', min_df=.0002, max_df=.9)
X_train = count_vect.fit_transform(trainData.Text)

# Term frequency-inverse document frequency weighting (TfidfTransformer) was
# tried on these counts and did not help accuracy, so raw counts are used.

elapsed_time = time.time() - start_time
print(elapsed_time)

9.60128402709961


In [226]:
# Transform X_test

# Start timer
start_time = time.time()

# Convert the test text into word counts using the vocabulary already fitted on
# the training text (transform only, no re-fitting).
#
# The text is read from testData.Text rather than from X_test because this cell
# overwrites X_test: reading the raw column keeps the cell re-runnable.
X_test = count_vect.transform(testData.Text)

# Term frequency-inverse document frequency weighting is intentionally not
# applied to the test counts either (it did not help accuracy).

elapsed_time = time.time() - start_time
print(elapsed_time)

7.119053840637207


In [227]:
# Function to choose model so that editing models is easier (less copy and pasting)

# Start timer
start_time = time.time()


def chooseModel(model, y):
    """Return a new, unfitted classifier selected by a short name.

    Parameters:
        model - (str) The model you would like to train the categories on:
                    "nb"       -> MultinomialNB()
                    "sgd"      -> SGDClassifier(loss='log')
                    "randTree" -> RandomForestClassifier(n_estimators=10)
                    "decTree"  -> DecisionTreeClassifier()
                    "svm"      -> SVC(probability=True)
        y     - (int) The category you would like to test (0-5). Currently
                unused by this function; kept so existing calls keep working.
    Returns:
        mod - (Object) The instantiated (not yet fitted) model that was
              specified in the parameter 'model'.
    Raises:
        ValueError - if 'model' is not one of the names listed above.
    """
    if model == "nb":
        return MultinomialNB()

    if model == "sgd":
        # Logistic loss, so that the fitted model offers predict_proba.
        # NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and
        # 1.3 removed the 'log' spelling - update the string when upgrading.
        return SGDClassifier(loss='log')

    if model == "randTree":
        return RandomForestClassifier(n_estimators=10)

    if model == "decTree":
        return DecisionTreeClassifier()

    if model == "svm":
        # probability=True is required for SVC to provide predict_proba, which
        # is what this notebook calls on every fitted model.
        return SVC(probability=True)

    # Fail loudly here instead of returning None, which would only surface
    # later as an AttributeError when the caller chains .fit() on the result.
    raise ValueError("Sorry! We have not implemented that model yet.")


In [228]:
# Function that allows you to test each category (Again, less copy and pasting)

# Start timer
start_time = time.time()


def checkAllCategories(modelToUse):
    """Fit one model per category and predict probabilities for the test set.

    For every label Series in the global list y_train, a fresh model is obtained
    from chooseModel(), fitted on the global X_train, and asked for
    predict_proba on the global X_test.

    Parameters:
        modelToUse - (str) The model you would like to train the categories on.
                     The possible models are implemented in the chooseModel()
                     function.
    Returns:
        data - (pandas.core.frame.DataFrame) testData.ID followed by one
               probability column per entry of y_train, in y_train order
               (toxic, severe toxic, obscene, threat, insult, identity hate).
               The probability columns are unnamed Series, so pandas labels
               them 0, 1, 2, ... in the resulting frame.
    """
    probabilities = []
    for category in range(len(y_train)):
        fitted = chooseModel(model=modelToUse, y=category).fit(X_train, y_train[category])
        # Column 1 of predict_proba is taken as the probability of the positive
        # class (assumes each label column is 0/1 - TODO confirm).
        probabilities.append(pd.Series(fitted.predict_proba(X_test)[:, 1]))

    data = pd.concat([testData.ID] + probabilities, axis=1)
    return data


elapsed_time = time.time() - start_time
print(elapsed_time)

0.00024175643920898438


In [229]:
# Produce the submission file in Kaggle format

# Start timer
start_time = time.time()

# Header row, in the order Kaggle wants it
columnNames = ('id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

# Fit the "nb" model for every category and collect the predicted probabilities
data = checkAllCategories(modelToUse="nb")

# Save the frame to results.csv on the desktop; the header row is taken from
# columnNames instead of the frame's own column labels, and the index is omitted
data.to_csv(
    path_or_buf='/Users/zgalvin/Desktop/results.csv',
    header=columnNames,
    index=False,
)

# Report how long the cell took, in seconds
elapsed_time = time.time() - start_time
print(elapsed_time)


2.4055190086364746
