## Splitting the target

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

In [3]:
data = pd.read_csv(r'../data/personalities_cleaned.csv')

In [4]:
data2 = data.copy()

In [5]:
data2['I-E'] = data2['type'].astype(str).str[0]
data2['N-S'] = data2['type'].astype(str).str[1]
data2['T-F'] = data2['type'].astype(str).str[2]
data2['J-P'] = data2['type'].astype(str).str[3]


In [6]:
data2 = data2.drop(columns=['Unnamed: 0', 'posts', 'text_processed'])

In [7]:
data2.isnull().sum()


type          0
text_ready    1
I-E           0
N-S           0
T-F           0
J-P           0
dtype: int64

In [8]:
df1 = data2[data2.isna().any(axis=1)]
df1

Unnamed: 0,type,text_ready,I-E,N-S,T-F,J-P
3559,INFP,,I,N,F,P


In [9]:
data2.drop(data2.index[3559], inplace=True)

In [10]:
data2.isnull().sum()

type          0
text_ready    0
I-E           0
N-S           0
T-F           0
J-P           0
dtype: int64

In [11]:
data2.head()

Unnamed: 0,type,text_ready,I-E,N-S,T-F,J-P
0,INFJ,intj moment sportscent top ten play prank ha l...,I,N,F,J
1,ENTP,find lack post veri alarm sex bore posit often...,E,N,T,P
2,INTP,good one cours say know bless cur doe absolut ...,I,N,T,P
3,INTJ,dear intp enjoy convers day esoter gab natur u...,I,N,T,J
4,ENTJ,fire anoth silli misconcept approach logic go ...,E,N,T,J


In [12]:
# learning with the model

In [13]:
# defining learnings for each 
x = data2['text_ready']

y_IE = data2['I-E']
y_NS = data2['N-S']
y_TF = data2['T-F']
y_JP = data2['J-P']

In [14]:
# for vector functions
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer(ngram_range=(2, 2)).fit(x) 
X = vector.transform(x)

In [15]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

svd = TruncatedSVD(n_components = 100)
reduced = svd.fit_transform(X)

scaler = MinMaxScaler()
scaled = scaler.fit_transform(reduced)

#### Optimizing Parameters

In [16]:
y_IE.value_counts()

I    6675
E    1999
Name: I-E, dtype: int64

In [17]:
from sklearn.neighbors import KNeighborsClassifier


for x in range(1, 26, 2):
    x_train, x_test, y_train, y_test = train_test_split(reduced, y_IE, test_size=0.2)
    knn = KNeighborsClassifier(n_neighbors = x)
    clf = knn.fit(x_train, y_train)
    predknn = clf.predict(x_test)
    print("I-E Accuracy for ", x, " neigbours.")
    print('Label I-E test score is :',accuracy_score(y_test, predknn))
    print("Confusion Matrix for KNeighbors:")
    print(confusion_matrix(y_test, predknn))
    #print("Score:",round(accuracy_score(predknn, y_test)*100,2))

#print("*" * 100)

I-E Accuracy for  1  neigbours.
Label I-E test score is : 0.6610951008645534
Confusion Matrix for KNeighbors:
[[  88  293]
 [ 295 1059]]
I-E Accuracy for  3  neigbours.
Label I-E test score is : 0.7066282420749279
Confusion Matrix for KNeighbors:
[[  64  373]
 [ 136 1162]]
I-E Accuracy for  5  neigbours.
Label I-E test score is : 0.7360230547550433
Confusion Matrix for KNeighbors:
[[  45  352]
 [ 106 1232]]
I-E Accuracy for  7  neigbours.
Label I-E test score is : 0.7636887608069164
Confusion Matrix for KNeighbors:
[[  17  363]
 [  47 1308]]
I-E Accuracy for  9  neigbours.
Label I-E test score is : 0.7688760806916427
Confusion Matrix for KNeighbors:
[[  18  357]
 [  44 1316]]
I-E Accuracy for  11  neigbours.
Label I-E test score is : 0.7475504322766571
Confusion Matrix for KNeighbors:
[[  14  403]
 [  35 1283]]
I-E Accuracy for  13  neigbours.
Label I-E test score is : 0.7734870317002882
Confusion Matrix for KNeighbors:
[[   7  375]
 [  18 1335]]
I-E Accuracy for  15  neigbours.
Label 

In [18]:
from sklearn.neighbors import KNeighborsClassifier


for x in range(1, 26, 2):
    x_train, x_test, y_train, y_test = train_test_split(scaled, y_IE, test_size=0.2)
    knn = KNeighborsClassifier(n_neighbors = x)
    clf = knn.fit(x_train, y_train)
    predknn = clf.predict(x_test)
    print("I-E Accuracy for ", x, " neigbours.")
    print('Label I-E test score is :',accuracy_score(y_test, predknn))
    print("Confusion Matrix for KNeighbors:")
    print(confusion_matrix(y_test, predknn))

I-E Accuracy for  1  neigbours.
Label I-E test score is : 0.6570605187319885
Confusion Matrix for KNeighbors:
[[  91  308]
 [ 287 1049]]
I-E Accuracy for  3  neigbours.
Label I-E test score is : 0.7066282420749279
Confusion Matrix for KNeighbors:
[[  63  356]
 [ 153 1163]]
I-E Accuracy for  5  neigbours.
Label I-E test score is : 0.729106628242075
Confusion Matrix for KNeighbors:
[[  38  374]
 [  96 1227]]
I-E Accuracy for  7  neigbours.
Label I-E test score is : 0.7596541786743516
Confusion Matrix for KNeighbors:
[[  25  367]
 [  50 1293]]
I-E Accuracy for  9  neigbours.
Label I-E test score is : 0.7613832853025937
Confusion Matrix for KNeighbors:
[[  17  382]
 [  32 1304]]
I-E Accuracy for  11  neigbours.
Label I-E test score is : 0.7538904899135447
Confusion Matrix for KNeighbors:
[[  19  394]
 [  33 1289]]
I-E Accuracy for  13  neigbours.
Label I-E test score is : 0.768299711815562
Confusion Matrix for KNeighbors:
[[  10  381]
 [  21 1323]]
I-E Accuracy for  15  neigbours.
Label I-

In [24]:
from sklearn.ensemble import RandomForestClassifier


for x in range(1, 16):
    rfn = RandomForestClassifier(max_depth= x, random_state=0)

    # NS
    x_train, x_test, y_train, y_test = train_test_split(reduced, y_NS, test_size=0.2)
    rfn.fit(x_train, y_train)
#    ieb_train = knn.score(x_train,y_train)
#    ieb_test = knn.score (x_train,y_train)
    predrfn = rfn.predict(x_test)
    print("I-E Accuracy for ", x, " max_depth.")
#    print('Label I-E train score is :',ieb_train)
    print(confusion_matrix(y_test,predrfn))
    print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
    #print("Score:",round(accuracy_score(predknn, y_test)*100,2))



I-E Accuracy for  1  max_depth.
[[1492    0]
 [ 243    0]]
Score: 85.99
I-E Accuracy for  2  max_depth.
[[1510    0]
 [ 225    0]]
Score: 87.03
I-E Accuracy for  3  max_depth.
[[1503    0]
 [ 232    0]]
Score: 86.63
I-E Accuracy for  4  max_depth.
[[1487    0]
 [ 248    0]]
Score: 85.71
I-E Accuracy for  5  max_depth.
[[1502    0]
 [ 233    0]]
Score: 86.57
I-E Accuracy for  6  max_depth.
[[1499    0]
 [ 236    0]]
Score: 86.4
I-E Accuracy for  7  max_depth.
[[1498    0]
 [ 237    0]]
Score: 86.34
I-E Accuracy for  8  max_depth.
[[1525    0]
 [ 210    0]]
Score: 87.9
I-E Accuracy for  9  max_depth.
[[1511    0]
 [ 224    0]]
Score: 87.09
I-E Accuracy for  10  max_depth.
[[1490    0]
 [ 244    1]]
Score: 85.94
I-E Accuracy for  11  max_depth.
[[1504    2]
 [ 228    1]]
Score: 86.74
I-E Accuracy for  12  max_depth.
[[1470    3]
 [ 262    0]]
Score: 84.73
I-E Accuracy for  13  max_depth.
[[1494    1]
 [ 240    0]]
Score: 86.11
I-E Accuracy for  14  max_depth.
[[1471    5]
 [ 257    2]]
Sc

In [23]:
from sklearn.ensemble import RandomForestClassifier


for x in range(1, 16):
    rfn = RandomForestClassifier(max_depth= x, random_state=0)

    # NS
    x_train, x_test, y_train, y_test = train_test_split(scaled, y_NS, test_size=0.2)
    rfn.fit(x_train, y_train)
#    ieb_train = knn.score(x_train,y_train)
#    ieb_test = knn.score (x_train,y_train)
    predrfn = rfn.predict(x_test)
    print("I-E Accuracy for ", x, " max_depth.")
#    print('Label I-E train score is :',ieb_train)
    print(confusion_matrix(y_test,predrfn))
    print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
    #print("Score:",round(accuracy_score(predknn, y_test)*100,2))

I-E Accuracy for  1  max_depth.
[[1490    0]
 [ 245    0]]
Score: 85.88
I-E Accuracy for  2  max_depth.
[[1500    0]
 [ 235    0]]
Score: 86.46
I-E Accuracy for  3  max_depth.
[[1502    0]
 [ 233    0]]
Score: 86.57
I-E Accuracy for  4  max_depth.
[[1503    0]
 [ 232    0]]
Score: 86.63
I-E Accuracy for  5  max_depth.
[[1478    0]
 [ 257    0]]
Score: 85.19
I-E Accuracy for  6  max_depth.
[[1503    0]
 [ 232    0]]
Score: 86.63
I-E Accuracy for  7  max_depth.
[[1497    0]
 [ 238    0]]
Score: 86.28
I-E Accuracy for  8  max_depth.
[[1507    0]
 [ 228    0]]
Score: 86.86
I-E Accuracy for  9  max_depth.
[[1510    1]
 [ 224    0]]
Score: 87.03
I-E Accuracy for  10  max_depth.
[[1492    0]
 [ 243    0]]
Score: 85.99
I-E Accuracy for  11  max_depth.
[[1484    1]
 [ 250    0]]
Score: 85.53
I-E Accuracy for  12  max_depth.
[[1493    2]
 [ 240    0]]
Score: 86.05
I-E Accuracy for  13  max_depth.
[[1505    2]
 [ 227    1]]
Score: 86.8
I-E Accuracy for  14  max_depth.
[[1516    1]
 [ 216    2]]
S

#### MultinomialNB

In [23]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)


# IE
x_train, x_test, y_train, y_test = train_test_split(scaled, y_IE, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
ieb_train = mnb.score (x_train,y_train)
ieb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(scaled, y_NS, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
nsb_train = mnb.score (x_train,y_train)
nsb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(scaled, y_TF, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
tfb_train = mnb.score (x_train,y_train)
tfb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(scaled, y_JP, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
jpb_train = mnb.score (x_train,y_train)
jpb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

I-E RESULTS
Label I-E train score is : 0.7685545467646635
Label I-E test score is : 0.7685545467646635
Confusion Matrix for Multinomial Naive Bayes:
[[   0  393]
 [   0 1342]]
Score: 77.35
Classification Report:               precision    recall  f1-score   support

           E       0.00      0.00      0.00       393
           I       0.77      1.00      0.87      1342

    accuracy                           0.77      1735
   macro avg       0.39      0.50      0.44      1735
weighted avg       0.60      0.77      0.67      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 0.8617956477878657
Label N-S test score is : 0.8617956477878657
Confusion Matrix for Multinomial Naive Bayes:
[[1497    0]
 [ 238    0]]
Score: 86.28
Classification Report:               precision    recall  f1-score   support

           N       0.86      1.00      0.93      1497
           S       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


T-F Results
Label T-F train score is : 0.5379737714368065
Label T-F test score is : 0.5379737714368065
Confusion Matrix for Multinomial Naive Bayes:
[[960   0]
 [775   0]]
Score: 55.33
Classification Report:               precision    recall  f1-score   support

           F       0.55      1.00      0.71       960
           T       0.00      0.00      0.00       775

    accuracy                           0.55      1735
   macro avg       0.28      0.50      0.36      1735
weighted avg       0.31      0.55      0.39      1735

****************************************************************************************************
J-P Results
Label J-P train score is : 0.603112840466926
Label J-P test score is : 0.603112840466926
Confusion Matrix for Multinomial Naive Bayes:
[[   0  680]
 [   0 1055]]
Score: 60.81
Classification Report:               precision    recall  f1-score   support

           J       0.00      0.00      0.00       680
           P       0.61      1.00      0.76  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)


# IE
x_train, x_test, y_train, y_test = train_test_split(reduced, y_IE, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
ieb_train = dt.score (x_train,y_train)
ieb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(reduced, y_NS, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
nsb_train = dt.score (x_train,y_train)
nsb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(reduced, y_TF, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
tfb_train = dt.score (x_train,y_train)
tfb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(reduced, y_JP, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
jpb_train = dt.score (x_train,y_train)
jpb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

I-E RESULTS
Label I-E train score is : 1.0
Label I-E test score is : 1.0
Confusion Matrix for Decision Tree:
[[ 121  272]
 [ 333 1009]]
Score: 65.13
Classification Report:               precision    recall  f1-score   support

           E       0.27      0.31      0.29       393
           I       0.79      0.75      0.77      1342

    accuracy                           0.65      1735
   macro avg       0.53      0.53      0.53      1735
weighted avg       0.67      0.65      0.66      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 1.0
Label N-S test score is : 1.0
Confusion Matrix for Decision Tree:
[[1239  258]
 [ 196   42]]
Score: 73.83
Classification Report:               precision    recall  f1-score   support

           N       0.86      0.83      0.85      1497
           S       0.14      0.18      0.16       238

    accuracy                           0.74      1735
   macro a

In [25]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)


# IE
x_train, x_test, y_train, y_test = train_test_split(scaled, y_IE, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
ieb_train = dt.score (x_train,y_train)
ieb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(scaled, y_NS, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
nsb_train = dt.score (x_train,y_train)
nsb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(scaled, y_TF, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
tfb_train = dt.score (x_train,y_train)
tfb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(scaled, y_JP, test_size=0.2, random_state=10)
dt.fit(x_train, y_train)
jpb_train = dt.score (x_train,y_train)
jpb_test = dt.score (x_train,y_train)
preddt = dt.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))
print("*" * 100)

I-E RESULTS
Label I-E train score is : 1.0
Label I-E test score is : 1.0
Confusion Matrix for Decision Tree:
[[116 277]
 [357 985]]
Score: 63.46
Classification Report:               precision    recall  f1-score   support

           E       0.25      0.30      0.27       393
           I       0.78      0.73      0.76      1342

    accuracy                           0.63      1735
   macro avg       0.51      0.51      0.51      1735
weighted avg       0.66      0.63      0.65      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 1.0
Label N-S test score is : 1.0
Confusion Matrix for Decision Tree:
[[1253  244]
 [ 198   40]]
Score: 74.52
Classification Report:               precision    recall  f1-score   support

           N       0.86      0.84      0.85      1497
           S       0.14      0.17      0.15       238

    accuracy                           0.75      1735
   macro avg  

#### K Nearest Neighbour

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=15)



# IE
x_train, x_test, y_train, y_test = train_test_split(reduced, y_IE, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
ieb_train = knn.score (x_train,y_train)
ieb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(reduced, y_NS, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
nsb_train = knn.score (x_train,y_train)
nsb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(reduced, y_TF, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
tfb_train = knn.score (x_train,y_train)
tfb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(reduced, y_JP, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
jpb_train = knn.score (x_train,y_train)
jpb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

I-E RESULTS
Label I-E train score is : 0.7731661622712206
Label I-E test score is : 0.7731661622712206
Confusion Matrix for Multinomial Naive Bayes:
[[   8  385]
 [  14 1328]]
Score: 77.0
Classification Report:               precision    recall  f1-score   support

           E       0.36      0.02      0.04       393
           I       0.78      0.99      0.87      1342

    accuracy                           0.77      1735
   macro avg       0.57      0.50      0.45      1735
weighted avg       0.68      0.77      0.68      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 0.8622279867416054
Label N-S test score is : 0.8622279867416054
Confusion Matrix for Multinomial Naive Bayes:
[[1496    1]
 [ 238    0]]
Score: 86.22
Classification Report:               precision    recall  f1-score   support

           N       0.86      1.00      0.93      1497
           S       0.00      0.00      0

In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)



# IE
x_train, x_test, y_train, y_test = train_test_split(reduced, y_IE, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
ieb_train = knn.score (x_train,y_train)
ieb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(reduced, y_NS, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
nsb_train = knn.score (x_train,y_train)
nsb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(reduced, y_TF, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
tfb_train = knn.score (x_train,y_train)
tfb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(reduced, y_JP, test_size=0.2, random_state=10)
knn.fit(x_train, y_train)
jpb_train = knn.score (x_train,y_train)
jpb_test = knn.score (x_train,y_train)
predknn = knn.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predknn))
print("Score:",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:",classification_report(y_test,predknn))
print("*" * 100)

I-E RESULTS
Label I-E train score is : 0.788586251621271
Label I-E test score is : 0.788586251621271
Confusion Matrix for Multinomial Naive Bayes:
[[  27  366]
 [  73 1269]]
Score: 74.7
Classification Report:               precision    recall  f1-score   support

           E       0.27      0.07      0.11       393
           I       0.78      0.95      0.85      1342

    accuracy                           0.75      1735
   macro avg       0.52      0.51      0.48      1735
weighted avg       0.66      0.75      0.68      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 0.8642455685257242
Label N-S test score is : 0.8642455685257242
Confusion Matrix for Multinomial Naive Bayes:
[[1479   18]
 [ 234    4]]
Score: 85.48
Classification Report:               precision    recall  f1-score   support

           N       0.86      0.99      0.92      1497
           S       0.18      0.02      0.0

#### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
import joblib

rfn = RandomForestClassifier(max_depth=14, random_state=0)


# IE
x_train, x_test, y_train, y_test = train_test_split(scaled, y_IE, test_size=0.2, random_state=10)
rfn.fit(x_train, y_train)
model = rfn.fit(x_train, y_train)
ieb_train = rfn.score (x_train,y_train)
ieb_test = rfn.score (x_train,y_train)
predrfn = rfn.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,predrfn))
print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
print("Classification Report:",classification_report(y_test,predrfn))
filename = 'RF_IE.sav'
joblib.dump(model, filename)
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(scaled, y_NS, test_size=0.2, random_state=10)
rfn.fit(x_train, y_train)
model = rfn.fit(x_train, y_train)
nsb_train = rfn.score (x_train,y_train)
nsb_test = rfn.score (x_train,y_train)
predrfn = rfn.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,predrfn))
print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
print("Classification Report:",classification_report(y_test,predrfn))
filename = 'RF_NS.sav'
joblib.dump(model, filename)
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(scaled, y_TF, test_size=0.2, random_state=10)
rfn.fit(x_train, y_train)
model = rfn.fit(x_train, y_train)
tfb_train = rfn.score (x_train,y_train)
tfb_test = rfn.score (x_train,y_train)
predrfn = rfn.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,predrfn))
print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
print("Classification Report:",classification_report(y_test,predrfn))
filename = 'RF_TF.sav'
joblib.dump(model, filename)
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(scaled, y_JP, test_size=0.2, random_state=10)
rfn.fit(x_train, y_train)
model = rfn.fit(x_train, y_train)
jpb_train = rfn.score (x_train,y_train)
jpb_test = rfn.score (x_train,y_train)
predrfn = rfn.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,predrfn))
print("Score:",round(accuracy_score(y_test,predrfn)*100,2))
print("Classification Report:",classification_report(y_test,predrfn))
filename = 'RF_JP.sav'
joblib.dump(model, filename)
print("*" * 100)

I-E RESULTS
Label I-E train score is : 0.9136763222366335
Label I-E test score is : 0.9136763222366335
Confusion Matrix for Random Forest:
[[   2  391]
 [  10 1332]]
Score: 76.89
Classification Report:               precision    recall  f1-score   support

           E       0.17      0.01      0.01       393
           I       0.77      0.99      0.87      1342

    accuracy                           0.77      1735
   macro avg       0.47      0.50      0.44      1735
weighted avg       0.64      0.77      0.67      1735

****************************************************************************************************
N-S RESULTS
Label N-S train score is : 0.8962386511024644
Label N-S test score is : 0.8962386511024644
Confusion Matrix for Random Forest:
[[1495    2]
 [ 238    0]]
Score: 86.17
Classification Report:               precision    recall  f1-score   support

           N       0.86      1.00      0.93      1497
           S       0.00      0.00      0.00       238

    

#### Neural Network

In [33]:
import pandas as pd
import numpy as np
import random
import re

import keras
from keras import models
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score

In [34]:
data_neural = data2.copy()

data_neural['I-E'] = data_neural['I-E'].astype('category')
data_neural['N-S'] = data_neural['N-S'].astype('category')
data_neural['T-F'] = data_neural['T-F'].astype('category')
data_neural['J-P'] = data_neural['J-P'].astype('category')

cat_columns = data_neural.select_dtypes(['category']).columns

data_neural[cat_columns] = data_neural[cat_columns].apply(lambda x: x.cat.codes)

data_neural

Unnamed: 0,type,text_ready,I-E,N-S,T-F,J-P
0,INFJ,intj moment sportscent top ten play prank ha l...,1,0,0,0
1,ENTP,find lack post veri alarm sex bore posit often...,0,0,1,1
2,INTP,good one cours say know bless cur doe absolut ...,1,0,1,1
3,INTJ,dear intp enjoy convers day esoter gab natur u...,1,0,1,0
4,ENTJ,fire anoth silli misconcept approach logic go ...,0,0,1,0
...,...,...,...,...,...,...
8670,ISFP,becaus alway think cat fi dom reason websit be...,1,1,0,1
8671,ENFP,thi thread alreadi exist someplac el doe heck ...,0,0,0,1
8672,INTP,mani question thing would take purpl pill pick...,1,0,1,1
8673,INFP,veri conflict right come want child honestli m...,1,0,0,1


In [35]:

x = data_neural['text_ready']

y_IE = data_neural['I-E']
y_NS = data_neural['N-S']
y_TF = data_neural['T-F']
y_JP = data_neural['J-P']

from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer(ngram_range=(2, 2)).fit(x) 
X = vector.transform(x)

In [36]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=256, kernel_initializer='uniform', activation='relu', input_dim=1986297))
model.add(tf.keras.layers.Dense(units=128, kernel_initializer='uniform', activation='relu'))
model.add(tf.keras.layers.Dense(units=64, kernel_initializer='uniform', activation='relu'))
model.add(tf.keras.layers.Dense(activation = 'sigmoid', units = 1, kernel_initializer = 'uniform')) # output layer has number of outputs not number of neurons

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
# calculating loss, model is always trying to minimize loss, adam is the default optimize


# model.fit(X_train, y_train, epochs = 10, batch_size= 25)

# val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
# print(f'Test loss: {val_loss}')
# print(f'Test accuracy: {val_acc}')

In [37]:
X_train.shape

(6939, 1986297)

In [38]:


# IE
X_train, X_test, y_train, y_test = train_test_split(X, y_IE, test_size=0.2, random_state=10)
model.fit(X_train, y_train, epochs = 10, batch_size= 25)
predmodel = model.predict(x_test)
print("I-E RESULTS")
# print("Confusion Matrix for Multinomial Naive Bayes:")
# print(confusion_matrix(y_test,predmodel))
# print("Score:",round(accuracy_score(y_test,predmodel)*100,2))
# print("Classification Report:",classification_report(y_test,predmodel))
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {val_loss}')
print(f'Test accuracy: {val_acc}')
print("*" * 100)

# NS
X_train, X_test, y_train, y_test = train_test_split(X, y_NS, test_size=0.2, random_state=10)
model.fit(X_train, y_train, epochs = 10, batch_size= 25)
predmodel = model.predict(x_test)
print("N-S RESULTS")
# print("Confusion Matrix for Multinomial Naive Bayes:")
# print(confusion_matrix(y_test,predmodel))
# print("Score:",round(accuracy_score(y_test,predmodel)*100,2))
# print("Classification Report:",classification_report(y_test,predmodel))
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {val_loss}')
print(f'Test accuracy: {val_acc}')
print("*" * 100)

# TF
X_train, X_test, y_train, y_test = train_test_split(X, y_TF, test_size=0.2, random_state=10)
model.fit(X_train, y_train, epochs = 10, batch_size= 25)
predmodel = model.predict(x_test)
print("T-F Results")
# print("Confusion Matrix for Multinomial Naive Bayes:")
# print(confusion_matrix(y_test,predmodel))
# print("Score:",round(accuracy_score(y_test,predmodel)*100,2))
# print("Classification Report:",classification_report(y_test,predmodel))
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {val_loss}')
print(f'Test accuracy: {val_acc}')
print("*" * 100)

# JP
X_train, X_test, y_train, y_test = train_test_split(X, y_JP, test_size=0.2, random_state=10)
model.fit(X_train, y_train, epochs = 10, batch_size= 25)
predmodel = model.predict(x_test)
print("J-P Results")
# print("Confusion Matrix for Multinomial Naive Bayes:")
# print(confusion_matrix(y_test,predmodel))
# print("Score:",round(accuracy_score(y_test,predmodel)*100,2))
# print("Classification Report:",classification_report(y_test,predmodel))
val_loss, val_acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {val_loss}')
print(f'Test accuracy: {val_acc}')
print("*" * 100)

Epoch 1/10


InvalidArgumentError:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 241, in __call__
    return func(device, token, args)

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 130, in __call__
    ret = self._func(*args)

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 309, in wrapper
    return func(*args, **kwargs)

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 513, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 513, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 512, in slice_array
    contiguous=contiguous)

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_utils.py", line 391, in slice_arrays
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_utils.py", line 391, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_utils.py", line 391, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_1612]

Function call stack:
train_function


In [None]:
XG BOOST

In [None]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)


# IE
x_train, x_test, y_train, y_test = train_test_split(X, y_IE, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
ieb_train = mnb.score (x_train,y_train)
ieb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("I-E RESULTS")
print('Label I-E train score is :',ieb_train)
print('Label I-E test score is :',ieb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# NS
x_train, x_test, y_train, y_test = train_test_split(X, y_NS, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
nsb_train = mnb.score (x_train,y_train)
nsb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("N-S RESULTS")
print('Label N-S train score is :',nsb_train)
print('Label N-S test score is :',nsb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# TF
x_train, x_test, y_train, y_test = train_test_split(X, y_TF, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
tfb_train = mnb.score (x_train,y_train)
tfb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("T-F Results")
print('Label T-F train score is :',tfb_train)
print('Label T-F test score is :',tfb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)

# JP
x_train, x_test, y_train, y_test = train_test_split(X, y_JP, test_size=0.2, random_state=10)
mnb.fit(x_train, y_train)
jpb_train = mnb.score (x_train,y_train)
jpb_test = mnb.score (x_train,y_train)
predmnb = mnb.predict(x_test)
print("J-P Results")
print('Label J-P train score is :',jpb_train)
print('Label J-P test score is :',jpb_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))
print("*" * 100)