## modeling

Use different models to find out the best model for tell which subreddit the post belong to

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
#from sklearn import metrics
#from textblob import TextBlob, Word

#%matplotlib inline 

In [79]:
# import cleaned comments data from csv
df_comments = pd.read_csv('./Data/comments.csv', index_col=0)
df_comments.head()

Unnamed: 0,body,is_college
0,There’s 200 seat in that class rn and literall...,1
1,"I meant whatever your intended major is, try t...",1
2,Dude did you even apply to any safeties/target...,1
3,that is SO awful :(( what in the world?? I jus...,1
4,I have to wait till 9 pm for when my mom get h...,1


### Set X and y

In [81]:
X = df_comments['body']
y = df_comments['is_college']

y.value_counts(normalize=True)




0    0.506703
1    0.493297
Name: is_college, dtype: float64


The classes are balanced, each is approx 50%

Split data into train set and test set

In [82]:
#default test size

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [83]:
# edit stop words
from sklearn.feature_extraction import text

text.ENGLISH_STOP_WORDS

add_stop_words = ['did', 'doe', 'don','just', 'doesn', 'getting', 'going', 'got', 'ha', 'isn', 'wa', 've', 'll']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [84]:
stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### Model:  CountVectorizer and LogisticRegression
                        

In [85]:
# using standard stop words = "english"
# using CountVectorizer and LogisticRegression

cvec = CountVectorizer(stop_words = "english")
                       
lr = LogisticRegressionCV(solver = 'liblinear')

pipe = Pipeline([
    ('cvec', cvec),
    ('lr', lr)
])

In [86]:
# baseline Logistic model

pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.9587345254470426
test score: 0.875


### model: CountVectorizer and LogisticRegression
with additional stop words

In [87]:
# using additional stop words
# using CountVectorizer and LogisticRegression

cvec = CountVectorizer(stop_words = stop_words)

lr = LogisticRegressionCV(solver = 'liblinear') 

pipe = Pipeline([
    ('cvec', cvec),
    ('lr', lr)
])

In [88]:
pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.9679046309032554
test score: 0.875


### Model: CountVectorizer and RandomForestClassifier

In [94]:
# using standard stop words = "english"
# using CountVectorizer and random forest

cvec = CountVectorizer( stop_words = stop_words, 
                         max_features = 10000)
rf = RandomForestClassifier(random_state = 42)

pipe = Pipeline([
    ('cvec', cvec),
    ('rf', rf)
])

pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.9917469050894085
test score: 0.8255494505494505


### Model: TfidfVectorizer and LogisticRegressionCV

In [97]:
# tfidf using "english" stop words, 

tfidf = TfidfVectorizer( stop_words = stop_words, 
                     max_features = 10000)

lr = LogisticRegressionCV(solver = 'liblinear') 

pipe = Pipeline([
    ('tf', tfidf),
    ('lr', lr)
])

In [98]:
pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.974782209995415
test score: 0.8873626373626373


### Model：TfidfVectorizer with RandomForestClassifier

In [99]:
tfidf = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000)

rf = RandomForestClassifier(random_state = 42)

pipe = Pipeline([
    ('tf', tfidf),
    ('rf', rf)
])

pipe.fit(X_train, y_train)
print('train score:', pipe.score(X_train, y_train))
print('test score:', pipe.score(X_test, y_test))

train score: 0.9917469050894085
test score: 0.8214285714285714


### Model: CountVectorizer and MultinomialNB

In [100]:
# CountVectorizer and MultinomialNB

pipe_cv_mnb = Pipeline([
    ('cv', CountVectorizer(stop_words = stop_words)),
    ('mnb', MultinomialNB())
])

pipe_cv_mnb.fit(X_train, y_train)
pipe_cv_mnb.score(X_train, y_train), pipe_cv_mnb.score(X_test, y_test)

(0.9504814305364512, 0.9107142857142857)

### Model: TfidfVectorizer and MultinomialNB

In [103]:
pipe_tf_mnb = Pipeline([
    ('tf', TfidfVectorizer(stop_words = stop_words)),
    ('mnb', MultinomialNB())
])

pipe_tf_mnb.fit(X_train, y_train)
print(pipe_tf_mnb.score(X_train, y_train), pipe_tf_mnb.score(X_test, y_test))


0.9637780834479597 0.9107142857142857


### Model compare

CountVectorizer and LogisticRegression
train score: 0.9679046309032554test score: 0.875

CountVectorizer and RandomForestClassifie
train score: 0.9917469050894085test score: 0.8255494505494505

TfidfVectorizer and LogisticRegressionCV
train score: 0.974782209995415    test score: 0.8873626373626373

TfidfVectorizer with RandomForestClassifier
train score: 0.9917469050894085   test score: 0.8214285714285714

CountVectorizer and MultinomialNB
(0.9504814305364512, 0.9107142857142857)

TfidfVectorizer and MultinomialNB
0.9637780834479597 0.9107142857142857

**models wiht MultinomialNB perform best on test dataset**
