## General NLP

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk import wordnet
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

In [12]:
data = pd.read_csv(r'../data/mbti_1.csv')
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [13]:
def processed(x):
    pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    cleaned = re.sub(pattern, ' ', str(x)) 
    cleaned = re.sub(r"[^A-Za-z0-9]+", " ", cleaned)
    cleaned = re.sub(" \d+", " ", cleaned)
    cleaned = cleaned.lower().strip()

    pattern2 = '\w{1,}'
    bag = re.findall(pattern2, cleaned)

    porter = PorterStemmer()
    porter_bag = [porter.stem(word) for word in bag]

    lemmatizer = wordnet.WordNetLemmatizer()
    bag_of_words = [lemmatizer.lemmatize(word) for word in porter_bag]

    stopwords_e = stopwords.words('english')
    bag_of_words = [word for word in bag_of_words if word not in stopwords_e]

    return bag_of_words

In [14]:
text_processed = [processed(i) for i in data.posts]

In [15]:
data['text_processed'] = text_processed

In [16]:
data.head()

Unnamed: 0,type,posts,text_processed
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[intj, moment, sportscent, top, ten, play, pra..."
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, veri, alarm, sex, bore, pos..."
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, doe,..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log..."


In [25]:
data['text_ready'] = data.text_processed.apply(' '.join)

In [26]:
data.head()

Unnamed: 0,type,posts,text_processed,text_ready
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"[intj, moment, sportscent, top, ten, play, pra...",intj moment sportscent top ten play prank ha l...
1,ENTP,'I'm finding the lack of me in these posts ver...,"[find, lack, post, veri, alarm, sex, bore, pos...",find lack post veri alarm sex bore posit often...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"[good, one, cours, say, know, bless, cur, doe,...",good one cours say know bless cur doe absolut ...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","[dear, intp, enjoy, convers, day, esoter, gab,...",dear intp enjoy convers day esoter gab natur u...
4,ENTJ,'You're fired.|||That's another silly misconce...,"[fire, anoth, silli, misconcept, approach, log...",fire anoth silli misconcept approach logic go ...


In [46]:
data.to_csv(r'../data/personalities_cleaned.csv')

In [29]:
x = data['text_ready']
y = data['type']

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer(ngram_range=(2, 2)).fit(x) 
X = vector.transform(x)

In [31]:
print("Shape of the sparse matrix: ", X.shape)
print("Non-Zero occurences: ", X.nnz)

density = (X.nnz/(X.shape[0]*X.shape[1]))*100
print("Density of the matrix = ", density)

Shape of the sparse matrix:  (8675, 1986297)
Non-Zero occurences:  5339470
Density of the matrix =  0.03098735307727465


In [34]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [35]:
x_train.shape

(6940, 1986297)

In [36]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)

predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))

Confusion Matrix for Multinomial Naive Bayes:
[[  0   0   0   0   0   0   0   0   8  22   0   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0  14 116   1   1   0   0   0   0]
 [  0   0   0   0   0   0   0   0  12  41   1   1   0   0   0   0]
 [  0   1   0   0   0   0   0   0  29  88   0   6   0   0   0   0]
 [  0   0   0   0   0   0   0   0   2   7   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   3   6   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   4  13   0   0   0   0   0   0]
 [  0   2   0   0   0   0   0   0 118 173   0   2   0   0   0   0]
 [  0   1   0   1   0   0   0   0  16 361   2   1   0   0   0   0]
 [  0   1   0   1   0   0   0   0  52 144  18  17   0   0   0   0]
 [  0   0   0   0   0   0   0   0  26 164   2  40   0   1   0   0]
 [  0   0   0   0   0   0   0   0   4  23   0   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0   6  53   0   0   0   0   0   0]
 [  0   0   0   

  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))

Confusion Matrix for Multinomial Naive Bayes:
[[  0   0   0   0   0   0   0   0   0  31   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 133   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  55   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 124   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  17   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1 294   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 382   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1 231   0   1   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 233   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  27   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  60   0   0   0   0   0   0]
 [  0   0   0   

  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
preddt = dt.predict(x_test)
print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,preddt))
print("Score:",round(accuracy_score(y_test,preddt)*100,2))
print("Classification Report:",classification_report(y_test,preddt))

Confusion Matrix for Decision Tree:
[[  6   2   0   1   0   1   0   1   4   7   3   2   0   3   0   1]
 [  4  58   3   9   0   2   0   0  16  12  12  11   1   2   3   0]
 [  0   4  10   9   0   0   0   0   5   5   4   8   1   2   3   4]
 [  0   5   2  56   1   0   0   0  15  13   8  12   3   5   4   0]
 [  0   1   0   1   0   0   0   0   1   0   0   5   1   0   0   0]
 [  0   1   1   0   0   0   0   0   3   0   1   3   0   0   0   0]
 [  0   0   1   0   0   0   0   0   2   3   0   0   0   2   1   0]
 [  0   0   1   1   0   0   0   4   3   1   4   1   0   2   0   0]
 [  3   9   2  13   0   1   0   1 162  38  26  22   8   5   2   3]
 [  9  14   7  15   2   2   1   3  31 228  17  21   7  15   4   6]
 [  3  14   9  11   0   5   1   0  24  14 110  21   0  10   7   4]
 [  1   4   1  13   2   0   0   1  21  15  26 140   2   1   1   5]
 [  1   0   0   1   1   0   0   0   4   2   2   1  10   2   3   0]
 [  1   4   0   1   1   2   0   2   5  11   4   2   2  18   5   2]
 [  0   1   0   3   0   1 

In [51]:
from sklearn.ensemble import RandomForestClassifier

rfn = RandomForestClassifier(max_depth=5, random_state=0)
rfn.fit(x_train,y_train)
predrf = rfn.predict(x_test)
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test,predknn))
print("Score: ",round(accuracy_score(y_test,predrf)*100,2))
print("Classification Report:")
print(classification_report(y_test,predrf))

Confusion Matrix for Random Forest Classifier:
[[  8   6   0   1   0   0   0   0   7   9   0   0   0   0   0   0]
 [  1  69   1   5   0   0   0   0  10  45   1   0   0   1   0   0]
 [  2   7  11   2   0   0   0   0   8  22   2   0   0   0   0   1]
 [  0  11   1  40   1   0   0   0  24  39   5   3   0   0   0   0]
 [  0   0   0   0   8   0   0   0   0   1   0   0   0   0   0   0]
 [  0   1   1   0   1   1   0   0   3   2   0   0   0   0   0   0]
 [  0   1   1   0   0   0   2   0   1   4   0   0   0   0   0   0]
 [  0   2   0   0   0   0   0   7   3   3   0   1   0   1   0   0]
 [  3  18   0   3   2   0   0   1 174  86   4   3   0   1   0   0]
 [  2  26   0   2   1   0   0   1  58 284   5   1   0   1   1   0]
 [  1  31   0   8   2   0   0   0  37  81  57  10   5   0   1   0]
 [  1  14   2   9   2   0   0   0  45  95  10  55   0   0   0   0]
 [  0   1   0   0   0   0   0   0   6   3   1   0  16   0   0   0]
 [  0   7   0   1   0   0   0   1  10  32   0   0   0   9   0   0]
 [  0   3   0  

  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train,y_train)
predknn = knn.predict(x_test)
print("Confusion Matrix for K Neighbors Classifier:")
print(confusion_matrix(y_test,predknn))
print("Score: ",round(accuracy_score(y_test,predknn)*100,2))
print("Classification Report:")
print(classification_report(y_test,predknn))

Confusion Matrix for K Neighbors Classifier:
[[  8   6   0   1   0   0   0   0   7   9   0   0   0   0   0   0]
 [  1  69   1   5   0   0   0   0  10  45   1   0   0   1   0   0]
 [  2   7  11   2   0   0   0   0   8  22   2   0   0   0   0   1]
 [  0  11   1  40   1   0   0   0  24  39   5   3   0   0   0   0]
 [  0   0   0   0   8   0   0   0   0   1   0   0   0   0   0   0]
 [  0   1   1   0   1   1   0   0   3   2   0   0   0   0   0   0]
 [  0   1   1   0   0   0   2   0   1   4   0   0   0   0   0   0]
 [  0   2   0   0   0   0   0   7   3   3   0   1   0   1   0   0]
 [  3  18   0   3   2   0   0   1 174  86   4   3   0   1   0   0]
 [  2  26   0   2   1   0   0   1  58 284   5   1   0   1   1   0]
 [  1  31   0   8   2   0   0   0  37  81  57  10   5   0   1   0]
 [  1  14   2   9   2   0   0   0  45  95  10  55   0   0   0   0]
 [  0   1   0   0   0   0   0   0   6   3   1   0  16   0   0   0]
 [  0   7   0   1   0   0   0   1  10  32   0   0   0   9   0   0]
 [  0   3   0   1

In [48]:
import sys
!{sys.executable} -m pip install --upgrade xgboost

Requirement already up-to-date: xgboost in /usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages (1.1.0)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [49]:
import xgboost


XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /usr/local/Cellar/jupyterlab/1.2.4/libexec/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [45]:
xgb = XGBClassifier()
xgb.fit(x_train,y_train)
predxgb = xgb.predict(x_test)
print("Confusion Matrix for XGBoost Classifier:")
print(confusion_matrix(y_test,predxgb))
print("Score: ",round(accuracy_score(y_test,predxgb)*100,2))
print("Classification Report:")
print(classification_report(y_test,predxgb))

NameError: name 'XGBClassifier' is not defined