In [10]:
import pandas as pd

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the count_vect_df feature table from the project's datasets folder.
count_vect_df = pd.read_csv(filepath_or_buffer='./datasets/count_vect_df.csv')

In [23]:
# Prediction target: the 'subreddit' column.
y = count_vect_df['subreddit']
# Features: every column except 'subreddit' and 'title'.
X = count_vect_df.drop(columns=['subreddit', 'title'])

# Half of the rows go to training, the rest to testing; the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.5,
    random_state=42,
)

### Model Selection

#### I chose logistic regression and naive Bayes classifiers because they tend to be high-bias models that will cut down on the inherent variance of natural language data.

#### I am using a Random Forest classifier because it will show whether there are pieces of data (features) that are overrepresented in the dataset and that I should consider removing.

In [4]:
# Candidate classifiers. Logistic regression and Gaussian naive Bayes use the
# scikit-learn defaults.
logreg = LogisticRegression()
# A random forest is stochastic (bootstrap samples and per-split feature
# subsets), so it is seeded here to make the reported scores reproducible on a
# fresh kernel. 42 is the same seed the train/test split uses.
random_forest = RandomForestClassifier(random_state=42)
naivebay = GaussianNB()

In [5]:
# Fit each candidate on the training half, in the same order as before.
for candidate_model in (naivebay, logreg, random_forest):
    candidate_model.fit(X_train, y_train)

# Show the last fitted estimator, which is what the final `.fit(...)` call
# used to return as the cell output.
random_forest

RandomForestClassifier()

In [6]:
# Score every fitted model on the rows it was trained on, then report.
nb_train_score = naivebay.score(X_train, y_train)
lr_train_score = logreg.score(X_train, y_train)
rf_train_score = random_forest.score(X_train, y_train)

print(f'GaussianNB train score             = {nb_train_score}')
print(f'LogisticRegression train score     = {lr_train_score}')
print(f'RandomForestClassifier train score = {rf_train_score}')

GaussianNB train score             = 0.99745
LogisticRegression train score     = 0.9992
RandomForestClassifier train score = 1.0


In [7]:
# Score every fitted model on the held-out half, then report.
nb_test_score = naivebay.score(X_test, y_test)
lr_test_score = logreg.score(X_test, y_test)
rf_test_score = random_forest.score(X_test, y_test)

print(f'GaussianNB test score             = {nb_test_score}')
print(f'LogisticRegression test score     = {lr_test_score}')
print(f'RandomForestClassifier test score = {rf_test_score}')

GaussianNB test score             = 0.99645
LogisticRegression test score     = 0.99855
RandomForestClassifier test score = 1.0


In [9]:
# Mean score over 10 cross-validation folds of the full feature table,
# computed per model and then reported.
nb_cv_mean = cross_val_score(naivebay, X, y, cv=10).mean()
lr_cv_mean = cross_val_score(logreg, X, y, cv=10).mean()
rf_cv_mean = cross_val_score(random_forest, X, y, cv=10).mean()

print(f'GaussianNB cross validation score             = {nb_cv_mean}')
print(f'LogisticRegression cross validation score     = {lr_cv_mean}')
print(f'RandomForestClassifier cross validation score = {rf_cv_mean}')

GaussianNB cross validation score             = 0.99695
LogisticRegression cross validation score     = 0.998875
RandomForestClassifier cross validation score = 0.9997


### Looking at relevant data from the models 

#### Pulling the coefficients from the logistic regression model will tell us which words have the highest importance in deciding the question type in our dataset.

In [166]:
# A second logistic regression, fitted on every row instead of the training half.
# `fit` returns the estimator itself, so the call can be chained.
logregfullset = LogisticRegression().fit(X, y)

# Coefficient arrays of the train-split model and of the full-data model.
feature_importance = logreg.coef_
feature_importance_fullset = logregfullset.coef_

In [167]:
# Coefficients as nested lists; they are used as the column labels below.
fullset_coef_labels = logregfullset.coef_.tolist()
split_coef_labels = logreg.coef_.tolist()

# One-row frames: the single row holds the feature names (X.columns) and the
# column labels are the fitted coefficients.
logreg_importance_fullset = pd.DataFrame(data=[X.columns], columns=fullset_coef_labels)
logreg_importance = pd.DataFrame(data=[X.columns], columns=split_coef_labels)

In [168]:
# Flip both one-row frames so that each feature becomes a row: the coefficients
# move into the index and the feature names into the single data column.
fullset_feature_importance_df, feature_importance_df = (
    logreg_importance_fullset.T,
    logreg_importance.T,
)

In [169]:
# Move the coefficients out of the index into an ordinary column.
# Each frame is rebuilt from its untransposed source rather than from itself,
# so the cell is idempotent: re-running it no longer stacks an extra index
# column onto the result (which would break the two-name column assignment
# that follows). On a straight top-to-bottom run the result is unchanged.
fullset_feature_importance_df = logreg_importance_fullset.T.reset_index()
feature_importance_df = logreg_importance.T.reset_index()

In [170]:
# After reset_index the first column holds the coefficients and the second the
# feature names; give both frames the same descriptive headers.
importance_column_names = ['importance', 'word']
fullset_feature_importance_df.columns = importance_column_names
feature_importance_df.columns = importance_column_names

In [174]:
# Ten rows with the largest coefficients from the full-data model.
(
    fullset_feature_importance_df
    .sort_values(by='importance', ascending=False)
    .head(n=10)
)

Unnamed: 0,importance,word
930,4.587068,wanting
549,4.28953,mom
355,3.994833,girlfriend
558,3.325621,moving
52,3.077328,arsehole
105,3.065627,boyfriend
561,2.966193,mum
306,2.96143,father
852,2.945551,telling
110,2.871788,broke


In [103]:
# Ten rows with the largest coefficients from the train-split model.
(
    feature_importance_df
    .sort_values(by='importance', ascending=False)
    .head(n=10)
)

Unnamed: 0,importance,word
930,4.047134,wanting
549,3.773767,mom
355,3.453443,girlfriend
558,2.903757,moving
852,2.698402,telling
52,2.671698,arsehole
105,2.641415,boyfriend
306,2.610589,father
561,2.532691,mum
110,2.456468,broke


In [176]:
# Column-wise totals of every feature, as single-column frames indexed by
# feature name: once over all rows and once over the training rows only.
fullset_sum_df = X.sum(axis=0).to_frame(name='word_counts')
sum_df = X_train.sum(axis=0).to_frame(name='word_counts')

In [179]:
# Move the feature names out of the index into a regular 'index' column.
# NOTE(review): `y` rebinds the name that held the target labels for the
# train/test split, so the earlier fitting and cross-validation cells cannot be
# re-run after this one without restarting. Later cells read `x` and `y`, so
# the names are kept here; consider renaming them together.
y = fullset_sum_df.reset_index(drop=False)
x = sum_df.reset_index(drop=False)

Unnamed: 0,index,word_counts
0,sentiment,-1656.678
1,char_count,0.0
2,word_count,3.183231e-12
3,able,200.0
4,abroad,100.0


In [181]:
# Preview the first five training-split totals.
x.head(n=5)

Unnamed: 0,index,word_counts
0,sentiment,-862.635
1,char_count,15.755457
2,word_count,17.663536
3,able,105.0
4,abroad,54.0


In [182]:
# Attach the per-feature totals to the matching importance table. Assignment
# aligns on the row index, which both frames got from reset_index().
for importance_table, totals_table in (
    (fullset_feature_importance_df, y),
    (feature_importance_df, x),
):
    importance_table['word_counts'] = totals_table['word_counts']

In [184]:
# Twenty rows with the largest totals over the full dataset.
(
    fullset_feature_importance_df
    .sort_values(by='word_counts', ascending=False)
    .head(n=20)
)

Unnamed: 0,importance,word,word_counts
339,1.310118,friend,2994.0
253,-3.600541,doe,2213.0
930,4.587068,wanting,2200.0
549,4.28953,mom,1800.0
947,-3.536771,whats,1661.0
928,-0.394902,want,1588.0
923,-1.739476,wa,1509.0
618,-3.202969,people,1492.0
355,3.994833,girlfriend,1388.0
258,0.742538,dont,1388.0


In [160]:
# Twenty rows with the largest totals over the training split.
(
    feature_importance_df
    .sort_values(by='word_counts', ascending=False)
    .head(n=20)
)

Unnamed: 0,importance,word,word_counts
339,1.541761,friend,1492.0
930,4.047134,wanting,1114.0
253,-3.331638,doe,1108.0
549,3.773767,mom,953.0
947,-3.208632,whats,844.0
928,-0.210519,want,777.0
923,-1.514515,wa,773.0
618,-2.820169,people,712.0
502,-3.304709,like,674.0
258,0.70664,dont,670.0


In [187]:
# Write both importance tables to CSV files, leaving out the row index.
fullset_feature_importance_df.to_csv(
    path_or_buf="./datasets/fullset_feature_importance_df" + '.csv',
    index=False,
)
feature_importance_df.to_csv(
    path_or_buf="./datasets/feature_importance_df" + '.csv',
    index=False,
)