# Modeling Rd. 3
---
This round I am using the subsets of the original data set that yielded the best results from the previous step of modeling. Specifically, the Logistic Regression performed on the 'heavy class' subset, as well as, the Logistic Regression performed on the 'upper class' subset. These two datasets will be imported and run through ridge regularization to reduce the overfitting that is exhibited in the results.

## Model Tuning

## Imports

In [1]:
# Data Analysis
import pandas as pd
import numpy as np

# Text Processing
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
#from sklearn.linear_model import LassoClassifier, LassoClassifierCV


import pickle
import warnings
warnings.filterwarnings("ignore")

### Load in data

In [28]:
df = pd.read_csv('../data/heavy_sample.csv')
df.head()

Unnamed: 0,type,posts,no. of. words,class
0,INFJ,enfp intj moments sportscenter plays...,344,0
1,INTP,good course which know thats bles...,215,3
2,INTJ,dear intp enjoyed conversation other e...,611,2
3,INTJ,science perfect scientist claims tha...,189,2
4,INFJ,cant draw nails haha those were done pr...,775,0


In [9]:
upper_df = pd.read_csv('../data/upper_sample.csv')
upper_df.head()

Unnamed: 0,type,posts,no. of. words,class
0,ENTP,finding lack these posts very alarming ...,639,1
1,ENFP,doesnt want trip without staying behin...,305,0
2,ISFP,they paint without numbers guess istp ...,492,2
3,ENFP,enfps posted this thread philosophy board...,820,0
4,ISTP,from what read about enneagram thoug...,849,3


## Normal Logistic regression
---
### Heavy Sample

In [4]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(df['posts'])

In [35]:
# Save to file in the current working directory
pkl_filename = "../data/pickle_vectorizer.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(cvec, file)

In [5]:
# Defining y (target feature)
y = df['class']

In [6]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(4460, 79517) (4460,) (1115, 79517) (1115,)


In [7]:
# Instantiate the model
logreg = LogisticRegression()

# Fit the model on the training data
logreg.fit(X_train, y_train)

# Make predictions
preds = logreg.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.71      0.68      0.70       287
           1       0.72      0.78      0.75       356
           2       0.72      0.65      0.68       215
           3       0.70      0.70      0.70       257

    accuracy                           0.71      1115
   macro avg       0.71      0.71      0.71      1115
weighted avg       0.71      0.71      0.71      1115



In [8]:
logreg_train = logreg.score(X_train, y_train)

logreg_test = logreg.score(X_test, y_test)

In [9]:
print(f'Training Accuracy:  {logreg_train}')
print(f'Testing Accuracy:  {logreg_test}')

Training Accuracy:  0.862780269058296
Testing Accuracy:  0.7139013452914799


In [33]:
X_test.shape

(1115, 79517)

## Normal Logistic Regression
---
### Upper Sample

In [10]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(upper_df['posts'])

# Defining y (target feature)
y = upper_df['class']


In [11]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(1515, 40908) (1515,) (379, 40908) (379,)


In [12]:
# Instantiate the model
logreg = LogisticRegression()

# Fit the model on the training data
logreg.fit(X_train, y_train)

# Make predictions
preds = logreg.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.73      0.89      0.80       127
           1       0.82      0.80      0.81       133
           2       0.74      0.55      0.63        53
           3       0.80      0.68      0.74        66

    accuracy                           0.78       379
   macro avg       0.78      0.73      0.75       379
weighted avg       0.78      0.78      0.77       379



In [13]:
logreg_train = logreg.score(X_train, y_train)

logreg_test = logreg.score(X_test, y_test)

In [14]:
print(f'Training Accuracy:  {logreg_train}')
print(f'Testing Accuracy:  {logreg_test}')

Training Accuracy:  1.0
Testing Accuracy:  0.7757255936675461


## Ridge Regularization
---
### Heavy Sample

In [None]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(df['posts'])

# Defining y (target feature)
y = df['class']

In [None]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

In [22]:
# Instantiate.

ridge_model = RidgeClassifier(900) #played around with alpha manually for a bit to gauge where it wanted to settle at

# Fit.

ridge_model.fit(X_train, y_train)

# Make predictions
preds = ridge_model.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.78      0.64      0.70       287
           1       0.65      0.87      0.74       356
           2       0.78      0.59      0.67       215
           3       0.74      0.68      0.71       257

    accuracy                           0.71      1115
   macro avg       0.74      0.69      0.71      1115
weighted avg       0.73      0.71      0.71      1115



In [23]:
ridge_train = ridge_model.score(X_train, y_train)
ridge_test = ridge_model.score(X_test, y_test)

In [24]:
print(f'Training Accuracy:  {ridge_train}')
print(f'Testing Accuracy:  {ridge_test}')

Training Accuracy:  0.8798206278026905
Testing Accuracy:  0.7130044843049327


### RidgeClassifierCV
---
Finds best value of alpha and runs the model

In [13]:
%%time
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.

r_alphas = np.logspace(0,5,100)

# Cross-validate over our list of ridge alphas.

ridge_cv = RidgeClassifierCV(alphas=r_alphas, scoring ='r2', cv=5)

# Fit model using best ridge alpha!

ridge_cv.fit(X_train, y_train)



CPU times: user 1h 2min 43s, sys: 6min 35s, total: 1h 9min 19s
Wall time: 43min 16s


RidgeClassifierCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.1...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
                  cv=5, scoring='r2')

In [14]:
# Here is the optimal value of alpha

ridge_cv.alpha_

2154.4346900318847

In [19]:
print(ridge_cv.score(X_train, y_train))
print(ridge_cv.score(X_test, y_test))

0.8271300448430493
0.7165919282511211


In [31]:
# Saving best model to deploy on streamlit app

pkl_filename = "../data/pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(ridge_cv, file)

In [29]:
# Making sure I can reload model

with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [30]:
# Ensuring model yields same results

print(pickle_model.score(X_train, y_train))
print(pickle_model.score(X_test, y_test))

0.8271300448430493
0.7165919282511211


## Ridge Regularization
---
### Upper Sample

In [15]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(upper_df['posts'])

# Defining y (target feature)
y = upper_df['class']


In [16]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(1515, 40908) (1515,) (379, 40908) (379,)


In [17]:
# Instantiate.

ridge_model = RidgeClassifier(900)

# Fit.

ridge_model.fit(X_train, y_train)

# Make predictions
preds = ridge_model.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.65      0.90      0.75       127
           1       0.80      0.80      0.80       133
           2       0.85      0.42      0.56        53
           3       0.89      0.59      0.71        66

    accuracy                           0.74       379
   macro avg       0.79      0.68      0.70       379
weighted avg       0.77      0.74      0.73       379



In [18]:
ridge_train = ridge_model.score(X_train, y_train)
ridge_test = ridge_model.score(X_test, y_test)

In [19]:
print(f'Training Accuracy:  {ridge_train}')
print(f'Testing Accuracy:  {ridge_test}')

Training Accuracy:  0.899009900990099
Testing Accuracy:  0.741424802110818


### RidgeClassifierCV
---


In [20]:
%%time
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.

r_alphas = np.logspace(0,5,100)

# Cross-validate over our list of ridge alphas.

ridge_cv = RidgeClassifierCV(alphas=r_alphas, scoring ='r2', cv=5)

# Fit model using best ridge alpha!

ridge_cv.fit(X_train, y_train)



CPU times: user 8min 9s, sys: 53.7 s, total: 9min 3s
Wall time: 6min 54s


RidgeClassifierCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.1...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
                  cv=5, scoring='r2')

In [21]:
# Here is the optimal value of alpha

ridge_cv.alpha_

599.4842503189409

In [22]:
print(ridge_cv.score(X_train, y_train))
print(ridge_cv.score(X_test, y_test))

0.9174917491749175
0.7440633245382586
