In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def modeling_parameters(model):
    parameters = model.get_params()
    
    param_dict = { 'model' + '__' + key : [value]\
              for key,value in zip(parameters.keys(),parameters.values())
                 }
    return param_dict

def benchmark_model(gridsearch,X_train,X_test,y_train,y_test):
    
    print()
    
    print(f'Baseline Score:\n{y_test.value_counts(normalize=True)}')
    print()
    print(f'GridSearch Best Score: {gridsearch.best_score_}')
    print()
    print(f'GridSearch Best Parameters:')
    display(gridsearch.best_params_)
    print()
    print(f'GridSearch Training Score: {gridsearch.score(X_train, y_train)}')
    print()
    print(f'GridSearch Testing Score: {gridsearch.score(X_test, y_test)}')

In [3]:
df = pd.read_csv('data/modeling_aapl.csv')
df.set_index('Unnamed: 0',inplace=True)
df.index.rename('adjusted_date',inplace=True)

In [4]:
df['target'] = [0 if i <= 0 else 1 for i in df['Close']] 

In [5]:
X = df['text']
y = df['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=1)

In [7]:
y_test.value_counts(normalize=True)

1    0.590909
0    0.409091
Name: target, dtype: float64

In [8]:
model = RandomForestClassifier()

In [9]:
modeling_parameters(model)

{'model__bootstrap': [True],
 'model__ccp_alpha': [0.0],
 'model__class_weight': [None],
 'model__criterion': ['gini'],
 'model__max_depth': [None],
 'model__max_features': ['auto'],
 'model__max_leaf_nodes': [None],
 'model__max_samples': [None],
 'model__min_impurity_decrease': [0.0],
 'model__min_impurity_split': [None],
 'model__min_samples_leaf': [1],
 'model__min_samples_split': [2],
 'model__min_weight_fraction_leaf': [0.0],
 'model__n_estimators': [100],
 'model__n_jobs': [None],
 'model__oob_score': [False],
 'model__random_state': [None],
 'model__verbose': [0],
 'model__warm_start': [False]}

In [10]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('model', model)
])

pipe_params = {
    'cvec__max_features': [2_000, 3_000, 4_000, 5_000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'model__bootstrap': [True],
 'model__ccp_alpha': [0.0],
 'model__class_weight': [None],
 'model__criterion': ['gini'],
 'model__max_depth': [None],
 'model__max_features': ['auto'],
 'model__max_leaf_nodes': [None],
 'model__max_samples': [None],
 'model__min_impurity_decrease': [0.0],
 'model__min_impurity_split': [None],
 'model__min_samples_leaf': [1],
 'model__min_samples_split': [2],
 'model__min_weight_fraction_leaf': [0.0],
 'model__n_estimators': [100],
 'model__n_jobs': [None],
 'model__oob_score': [False],
 'model__random_state': [None],
 'model__verbose': [0],
 'model__warm_start': [False]
}

gs = GridSearchCV(pipe, # what object are we optimizing?
                  param_grid=pipe_params, # what parameters values are we searching?
                  cv=5,
                 n_jobs=2,
                 verbose=2) # 5-fold cross-validation.

In [13]:
gs.fit(X_train, y_train);

In [12]:
benchmark_model(gs,X_train,X_test,y_train,y_test)

Baseline Score:
1    0.590909
0    0.409091
Name: target, dtype: float64

GridSearch Best Score: 0.6730769230769231

GridSearch Best Parameters:


{'cvec__max_df': 0.95,
 'cvec__max_features': 2000,
 'cvec__min_df': 3,
 'cvec__ngram_range': (1, 1),
 'model__bootstrap': True,
 'model__ccp_alpha': 0.0,
 'model__class_weight': None,
 'model__criterion': 'gini',
 'model__max_depth': None,
 'model__max_features': 'auto',
 'model__max_leaf_nodes': None,
 'model__max_samples': None,
 'model__min_impurity_decrease': 0.0,
 'model__min_impurity_split': None,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 100,
 'model__n_jobs': None,
 'model__oob_score': False,
 'model__random_state': None,
 'model__verbose': 0,
 'model__warm_start': False}


GridSearch Training Score: 1.0

GridSearch Testing Score: 0.7272727272727273


In [18]:
gs.estimator

Pipeline(steps=[('cvec', CountVectorizer()),
                ('model', RandomForestClassifier())])