# Preprocessing and Hyperparameter Tuning

This notebook prepares our dataset for model hyperparameter tuning. After splitting data into training and testing sets, we clean our text for count vectorization and TF-IDF transformations. Using GridSearch, we determine which models and hyperparameters will be best for classification. 

In [103]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text  import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from nltk import RegexpTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords

### Import data and train-test-split

In [65]:
df = pd.read_csv('../data/data_final.csv')

In [66]:
df.head()

Unnamed: 0,text,target
0,/r/BravoRealHousewives daily OT thread. Today ...,1
1,The Real Housewives of New Jersey S09E07 - Bru...,1
2,If we could pool our money and hire MKE to do ...,1
3,Gotta pay for that wedding somehow but holy Fa...,1
4,RHONJ Season 9 Midseason Trailer,1


In [67]:
df.tail()

Unnamed: 0,text,target
3599,His fresh fade has evolved.,0
3600,Ron Baker The Virginity Taker,0
3601,Ahhh the infamous karma whore. Probably hops i...,0
3602,Fuck Pacers have McDermott... RIP raptors,0
3603,I swear to God --- That'd be epic af!,0


In [91]:
df['target'].value_counts(normalize=True)

1    0.514151
0    0.485849
Name: target, dtype: float64

In [72]:
X = df[['text']]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

### Instatiate lemmatizer, tokenizer, list of stop words, and a function to clean our text data.

In [69]:
#prepare CountVectorizer
#instatiate lemmatizer and tokenizer
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer('\w+')

#create set of stopwords from sklearn and add more words
stops = set(stopwords.words('english'))
more_stops = ['link','xb','amp','r']
for w in more_stops:
    stops.add(w)

#function to clean text
def to_words(raw_text):
    #remove links
    raw_text = re.sub('http\S+', '', raw_text)
    raw_text = re.sub('www\S+', '', raw_text)
    #remove numbers
    raw_text = re.sub('\d+', '', raw_text)
    #tokenize
    words = tokenizer.tokenize(raw_text.lower())
    #remove stop words
    meaningful_words = [lemmatizer.lemmatize(w) for w in words if not w in stops]
    
    return (" ".join(meaningful_words))
#use our to_words function to create a list of texts for our training and testing set

# Initialize empty lists to hold the clean texts.
clean_train_text = []
clean_test_text = []

# Append clean texts to list.
for text in X_train['text']:
    clean_train_text.append(to_words(text))
for text in X_test['text']:
    clean_test_text.append(to_words(text))

In [75]:
clean_test_text[:5]

['rhoa official intro new taglines released',
 'southern charm lady',
 'nikola jokic flex felipe eichenberger denver nugget head strength coach',
 'totally',
 'housewife moment life chill part maybe used drama thursday last met hw moment long interestingly explained got thinking though hw worthy moment last year']

In [76]:
clean_train_text[:5]

['jokic realize swaggy p team right',
 'bravorealhousewives daily ot thread today november daily thread topic discussion',
 'nba shoud introduce hypermax contract basically instead measly cap supermax hypermax would team cap mean player could potentially receive million year nba current million cap would team always gonna front office either desperate enough dumb enough shell money player even mentioned sentence max contract',
 'told someone attended party like sign outside house establishment arriving guest consent form sometimes non disclosure agreement given guest completed enter party venue',
 'umm want emily back know anti tamra']

### Tune models with CountVectorizer dataframe
- Logistic Regression, Random Forest, AdaBoost, Gradient Boost, and NaiveBayes Multinomial

In [80]:
cv = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 3000, ngram_range=(1,3)) 

train_cv = cv.fit_transform(clean_train_text)
test_cv = cv.transform(clean_test_text)

# Numpy arrays are easy to work with, so convert the result to an array
train_cv = train_cv.toarray()
test_cv = test_cv.toarray()
X_train_cv = pd.DataFrame(train_cv,columns=cv.get_feature_names(),index=y_train)
X_test_cv = pd.DataFrame(test_cv,columns=cv.get_feature_names(),index=y_test)

In [81]:
X_train_cv.head()

Unnamed: 0_level_0,aaron,aaron fox,ability,able,absolute,absolutely,abuse,accent,accident,accidentally,...,yet,yolanda,york,york knicks,young,young player,youtube,yr,zach,zero
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
lr = LogisticRegression()
lr_params = {
    'penalty': ['l2','l1'],
    'C': [.1, .5, .7, .9, .95, .99,1],
}

lr_gs = GridSearchCV(lr, lr_params, cv=5)
lr_gs.fit(X_train_cv,y_train)
print('LogisticRegression best score:', lr_gs.best_score_)
print('LogisticRegression test score:', lr_gs.score(X_test_cv,y_test))
print(lr_gs.best_estimator_)

train score 0.9499623777276147
test score 0.8692220969560316
LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [21]:
rf = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators' : [50,75,100,125,150],
    'max_depth' : [None,5,10,15],
}


rf_gs = GridSearchCV(rf, rf_params, cv=5)
rf_model = rf_gs.fit(X_train_cv,y_train)
print('RandomForest best score:', rf_gs.best_score_)
print('RandomForest test score:', rf_gs.score(X_test_cv,y_test))
print(rf_gs.best_estimator_)

train score 0.7840481565086531
best score 0.7791572610985703
test score 0.7463359639233371
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [27]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'base_estimator__max_depth' : [1,2,3],
    'n_estimators' : [40,50,60],
}
ada_gs = GridSearchCV(ada, param_grid=ada_params, cv = 5)

ada_gs.fit(X_train_cv,y_train)
print('AdaBoost best score:', ada_gs.best_score_)
print('AdaBoost test score:', ada_gs.score(X_test_cv,y_test))
print(ada_gs.best_params_)

train score 0.8856282919488337
best score 0.8250564334085779
test score 0.798196166854566
{'base_estimator__max_depth': 3, 'n_estimators': 40}


In [29]:
gb = GradientBoostingClassifier()
gb_params = {
    'max_depth':[2,3,4],
    'learning_rate':[.1,.5,.9],
    'n_estimators':[90,100,110]
}
gb_gs = GridSearchCV(gb,param_grid=gb_params, cv=3)
gb_gs.fit(X_train_cv,y_train)
print('GradientBoost best score:', gb_gs.best_score_)
print('GradientBoost test score:', gb_gs.score(X_test_cv,y_test))
print(ada_gs.best_params_)

train score 0.9322799097065463
best score 0.8506395786305493
test score 0.8523111612175873
{'base_estimator__max_depth': 3, 'n_estimators': 60}


In [41]:
nb = MultinomialNB()
nb.fit(X_train_cv,y_train)
print('NaiveBayes Multinomial train score:',nb.score(X_train_cv,y_train))
print('NaiveBayes Multinomial CV score:',cross_val_score(nb,X_train_cv, y_train).mean())
print('NaiveBayes Multinomial test score:',nb.score(X_test_cv,y_test))


0.9025583145221971
0.8737316798196166
0.8927794064970609


### Tune models with CountVectorizer dataframe

In [74]:
tv = TfidfVectorizer(analyzer = "word",
                     tokenizer = None,
                     preprocessor = None,
                     stop_words = None, 
                     max_features = 3000, ngram_range=(1,3))

train_tv = tv.fit_transform(clean_train_text)
test_tv = tv.transform(clean_test_text)

# Numpy arrays are easy to work with, so convert the result to an array
train_tv = train_tv.toarray()
test_tv = test_tv.toarray()
X_train_tv = pd.DataFrame(train_tv,columns=tv.get_feature_names())
X_test_tv = pd.DataFrame(test_tv,columns=tv.get_feature_names())

In [20]:
lr = LogisticRegression()

lr_params = {
    'penalty': ['l2','l1'],
    'C': [.1, .5, .7, .9, .95, .99,1],
}

lr_gs = GridSearchCV(lr, lr_params, cv=5)
lr_gs.fit(X_train_tv,y_train)
print('LogisticRegression best score:', lr_gs.best_score_(X_train_tv,y_train))
print('LogisticRegression test score:', lr_gs.score(X_test_tv,y_test))
print(lr_gs.best_estimator_)

train score 0.9371708051166291
test score 0.8782412626832018
LogisticRegression(C=0.9, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [22]:
rf = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators' : [50,75,100,125,150],
    'max_depth' : [None,5,10,15],
}


rf_gs = GridSearchCV(rf, rf_params, cv=5)
rf_gs.fit(X_train_tv,y_train)
print('RandomForset best score:', rf_gs.best_score_)
print('RandomForest test score:', rf_gs.score(X_test_tv,y_test))
print(rf_gs.best_estimator_)

train score 0.7840481565086531
best score 0.7746425884123401
test score 0.7497181510710259
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=75, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


In [28]:
ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'base_estimator__max_depth' : [1,2,3],
    'n_estimators' : [40,50,60],
}
ada_gs = GridSearchCV(ada, param_grid=ada_params, cv = 5)

ada_gs.fit(X_train_tv,y_train)
print('AdaBoost best score:', ada_gs.best_score_)
print('AdaBoost test score:', ada_gs.score(X_test_tv,y_test))
print(ada_gs.best_params_)

train score 0.9341610233258089
best score 0.8227990970654627
test score 0.8060879368658399
{'base_estimator__max_depth': 3, 'n_estimators': 60}


In [30]:
gb = GradientBoostingClassifier()
gb_params = {
    'max_depth':[2,3,4],
    'learning_rate':[.1,.5,.9],
    'n_estimators':[90,100,110]
}
gb_gs = GridSearchCV(gb,param_grid=gb_params, cv=3)
gb_gs.fit(X_train_tv,y_train)
print('Gradient Boost best score:', gb_gs.best_score_)
print('Gradient Boost test score:', gb_gs.score(X_test_tv,y_test))
print(ada_gs.best_params_)

train score 0.9721595184349134
best score 0.8288186606471031
test score 0.8297632468996617
{'base_estimator__max_depth': 3, 'n_estimators': 60}


In [112]:
nb = MultinomialNB()
nb.fit(X_train_tv,y_train)
print('NaiveBayes Multinomial train score:',nb.score(X_train_tv,y_train))
print('NaiveBayes Multinomial CV score:',cross_val_score(nb,X_train_tv, y_train).mean())
print('NaiveBayes Multinomial test score:',nb.score(X_test_tv,y_test))

NaiveBayes Multinomial train score: 0.9130595634480208
NaiveBayes Multinomial CV score: 0.8879046996133703
NaiveBayes Multinomial test score: 0.8734739178690344
