# Modeling Rd. 2
---
This round I wanted to subset my data into four distinct categories ordered by number of posts.

## Imports

In [26]:
# Data Analysis
import pandas as pd

# Text Processing
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

### Load in the data

In [27]:
df = pd.read_csv('../data/clean_mbti_df.csv')
df.head()

Unnamed: 0,type,posts,no. of. words
0,INFJ,enfp intj moments sportscenter plays...,344
1,ENTP,finding lack these posts very alarming ...,639
2,INTP,good course which know thats bles...,215
3,INTJ,dear intp enjoyed conversation other e...,611
4,ENTJ,youre fired thats another silly misconcepti...,315


In [28]:
df['type'].value_counts(normalize=True)

INFP    0.209578
INFJ    0.169867
INTP    0.151849
INTJ    0.127193
ENTP    0.078947
ENFP    0.075273
ISTP    0.038762
ISFP    0.031532
ENTJ    0.026671
ISTJ    0.023945
ENFJ    0.021219
ISFJ    0.019440
ESTP    0.010550
ESFP    0.005571
ESFJ    0.004979
ESTJ    0.004623
Name: type, dtype: float64

As you can see above, nearly half of our data is represented in the top four classes starting with the most prevalent class, 'INFP'. I plan to run a another round of modeling splitting the types category into groups of four in descending order of number of posts. I will split the categories into a heavy class, upper class, lower class, and finally a light class to extract more insight from this dataset.

As I said before, I chose not to impute these values to have balanced classes so this will serve as a work around.

### Heavy Class Modeling
---
Heavy Class will include the top four categories with most the data ('INFP', 'INFJ', 'INTP', 'INTJ')

In [29]:
# Use this bit of code to prevent long error messages, but be mindful that you may not see a message worth looking into when debugging code

import warnings
warnings.filterwarnings("ignore")

In [30]:
# Subsetting the heavy class

heavy_class = ['INFP','INFJ','INTP', 'INTJ']

heavy_sample = df[df['type'].isin(heavy_class)]

print(heavy_sample.type.unique())

print(heavy_sample.shape)

['INFJ' 'INTP' 'INTJ' 'INFP']
(5555, 3)


In [31]:
# Binarizing MBTI personality types(target feature) using LabelEncoder

encoder = LabelEncoder()
heavy_sample['class'] = encoder.fit_transform(heavy_sample['type'])

# Defining y (target feature)
y = heavy_sample['class']

In [32]:
# Making sure that label encoding successully binarized type column

heavy_sample.head()

Unnamed: 0,type,posts,no. of. words,class
0,INFJ,enfp intj moments sportscenter plays...,344,0
2,INTP,good course which know thats bles...,215,3
3,INTJ,dear intp enjoyed conversation other e...,611,2
5,INTJ,science perfect scientist claims tha...,189,2
6,INFJ,cant draw nails haha those were done pr...,775,0


In [19]:
print(heavy_sample['class'].unique())

[0 3 2 1]


Great! Now there are 4 target features!

In [51]:
heavy_sample['type'].value_counts(normalize=True)

INFP    0.318924
INFJ    0.257758
INTP    0.230493
INTJ    0.192825
Name: type, dtype: float64

In [35]:
# pre-vectorized shape
pre_vect = heavy_sample.shape

In [52]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(heavy_sample['posts'])

In [53]:
# post-vectorized shape
X.shape

(5575, 79517)

In [54]:
import warnings
warnings.filterwarnings("ignore")

In [55]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(4460, 79517) (4460,) (1115, 79517) (1115,)


In [56]:
# Instantiate the model
logreg = LogisticRegression()

# Fit the model on the training data
logreg.fit(X_train, y_train)

# Make predictions
preds = logreg.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.71      0.68      0.70       287
           1       0.72      0.78      0.75       356
           2       0.72      0.65      0.68       215
           3       0.70      0.70      0.70       257

    accuracy                           0.71      1115
   macro avg       0.71      0.71      0.71      1115
weighted avg       0.71      0.71      0.71      1115



In [57]:
logreg_train = logreg.score(X_train, y_train)

logreg_test = logreg.score(X_test, y_test)

In [58]:
print(f'Training Accuracy:  {logreg_train}')
print(f'Testing Accuracy:  {logreg_test}')

Training Accuracy:  0.862780269058296
Testing Accuracy:  0.7139013452914799


In [59]:
# Instantiate the model
nb = MultinomialNB()

# Fit the model on the training data
nb.fit(X_train, y_train)

# Make predictions
preds = nb.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.64      0.55      0.59       287
           1       0.57      0.86      0.69       356
           2       0.74      0.33      0.45       215
           3       0.66      0.60      0.63       257

    accuracy                           0.62      1115
   macro avg       0.65      0.59      0.59      1115
weighted avg       0.64      0.62      0.60      1115



In [60]:
nb_train = nb.score(X_train, y_train)
nb_test = nb.score(X_test, y_test)

In [62]:
print(f'Training Accuracy:  {nb_train}')
print(f'Testing Accuracy:  {nb_test}')

Training Accuracy:  0.9394618834080718
Testing Accuracy:  0.6188340807174888


In [63]:
# Instantiate the model
knn = KNeighborsClassifier()

# Fit the model on the training data
knn.fit(X_train, y_train)

# Make predictions
preds = knn.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.33      0.47      0.38       287
           1       0.43      0.50      0.46       356
           2       0.31      0.18      0.22       215
           3       0.37      0.24      0.29       257

    accuracy                           0.37      1115
   macro avg       0.36      0.35      0.34      1115
weighted avg       0.37      0.37      0.36      1115



In [64]:
knn_train = knn.score(X_train, y_train)
knn_test = knn.score(X_test, y_test)

In [65]:
print(f'Training Accuracy:  {knn_train}')
print(f'Testing Accuracy:  {knn_test}')

Training Accuracy:  0.5728699551569507
Testing Accuracy:  0.368609865470852


In [66]:
# Instantiate the model
rf = RandomForestClassifier()

# Fit the model on the training data
rf.fit(X_train, y_train)

# Make predictions
preds = rf.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.67      0.46      0.54       287
           1       0.49      0.90      0.64       356
           2       0.79      0.36      0.50       215
           3       0.70      0.45      0.55       257

    accuracy                           0.58      1115
   macro avg       0.66      0.54      0.56      1115
weighted avg       0.64      0.58      0.56      1115



In [67]:
rf_train = rf.score(X_train, y_train)
rf_test = rf.score(X_test, y_test)

In [68]:
print(f'Training Accuracy:  {rf_train}')
print(f'Testing Accuracy:  {rf_test}')

Training Accuracy:  1.0
Testing Accuracy:  0.579372197309417


### saving heavy sample for next model tuning

In [69]:
heavy_sample.to_csv('../data/heavy_sample.csv', index = False)

### Medium Class Modeling

In [None]:
ENTP     685
ENFP     675
ISTP     337
ISFP     271

In [18]:
medium_class = ['ENTP','ENFP','ISTP','ISFP']

In [19]:
medium_sample = df[df['type'].isin(medium_class)]

In [20]:
medium_sample.type.unique()

array(['ENTP', 'ENFP', 'ISFP', 'ISTP'], dtype=object)

In [21]:
medium_sample.shape

(1901, 3)

In [22]:
medium_sample['type'].value_counts(normalize=True)

ENTP    0.350868
ENFP    0.336139
ISTP    0.172541
ISFP    0.140452
Name: type, dtype: float64

In [23]:
# Binarizing MBTI personality types(target feature) using LabelEncoder

encoder = LabelEncoder()
medium_sample['class'] = encoder.fit_transform(medium_sample['type'])

# Defining y (target feature)
y = medium_sample['class']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [24]:
# Making sure that label encoding successully binarized type column

print(len(heavy_sample['class'].unique()))
print(len(heavy_sample['class'].unique()))


4
4


In [25]:
medium_sample.head()

Unnamed: 0,type,posts,no. of. words,class
1,ENTP,finding lack these posts very alarming eo...,803,1
25,ENFP,doesnt want trip without staying behin...,389,0
26,ISFP,they paint without numbers guess istp ...,607,2
37,ENFP,enfps eostokendot posted this thread phil...,1063,0
39,ISTP,eostokendot from what read about enneag...,1070,3


In [26]:
# Preparing posts for model by vectorzing and filtering stop-words

cvec = CountVectorizer(stop_words='english')

X = cvec.fit_transform(medium_sample['posts'])

In [27]:
# post-vectorized shape
X.shape

(1901, 40911)

In [28]:
# Train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, stratify=y, random_state=42)

print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(1520, 40911) (1520,) (381, 40911) (381,)


In [29]:
# Instantiate the model
logreg = LogisticRegression()

# Fit the model on the training data
logreg.fit(X_train, y_train)

# Make predictions
preds = logreg.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.76      0.79      0.77       128
           1       0.76      0.81      0.79       134
           2       0.82      0.60      0.70        53
           3       0.80      0.80      0.80        66

    accuracy                           0.77       381
   macro avg       0.79      0.75      0.76       381
weighted avg       0.78      0.77      0.77       381



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
logreg_train = logreg.score(X_train, y_train)

logreg_test = logreg.score(X_test, y_test)

In [31]:
print(f'Training Accuracy:  {logreg_train}')
print(f'Testing Accuracy:  {logreg_test}')

Training Accuracy:  0.9480263157894737
Testing Accuracy:  0.7742782152230971


In [32]:
# Instantiate the model
nb = MultinomialNB()

# Fit the model on the training data
nb.fit(X_train, y_train)

# Make predictions
preds = nb.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.58      0.78      0.67       128
           1       0.58      0.80      0.67       134
           2       0.67      0.04      0.07        53
           3       0.77      0.26      0.39        66

    accuracy                           0.59       381
   macro avg       0.65      0.47      0.45       381
weighted avg       0.63      0.59      0.54       381



In [33]:
nb_train = nb.score(X_train, y_train)
nb_test = nb.score(X_test, y_test)

In [34]:
print(f'Training Accuracy:  {nb_train}')
print(f'Testing Accuracy:  {nb_test}')

Training Accuracy:  0.9006578947368421
Testing Accuracy:  0.5931758530183727


In [35]:
# Instantiate the model
knn = KNeighborsClassifier()

# Fit the model on the training data
knn.fit(X_train, y_train)

# Make predictions
preds = knn.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.41      0.71      0.52       128
           1       0.50      0.38      0.43       134
           2       0.20      0.06      0.09        53
           3       0.40      0.26      0.31        66

    accuracy                           0.43       381
   macro avg       0.38      0.35      0.34       381
weighted avg       0.41      0.43      0.39       381



In [36]:
knn_train = knn.score(X_train, y_train)
knn_test = knn.score(X_test, y_test)

In [37]:
print(f'Training Accuracy:  {knn_train}')
print(f'Testing Accuracy:  {knn_test}')

Training Accuracy:  0.6118421052631579
Testing Accuracy:  0.4251968503937008


In [38]:
# Instantiate the model
rf = RandomForestClassifier()

# Fit the model on the training data
rf.fit(X_train, y_train)

# Make predictions
preds = rf.predict(X_test)

# Model evaluation
report = classification_report(y_test, preds)

print(report)

              precision    recall  f1-score   support

           0       0.58      0.77      0.66       128
           1       0.55      0.77      0.64       134
           2       0.90      0.17      0.29        53
           3       0.94      0.24      0.39        66

    accuracy                           0.59       381
   macro avg       0.74      0.49      0.49       381
weighted avg       0.68      0.59      0.56       381



In [39]:
rf_train = rf.score(X_train, y_train)
rf_test = rf.score(X_test, y_test)

In [40]:
print(f'Training Accuracy:  {rf_train}')
print(f'Testing Accuracy:  {rf_test}')

Training Accuracy:  1.0
Testing Accuracy:  0.5931758530183727
