In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
sns.set_style('darkgrid')

In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
df = pd.read_csv('dataFiles/toy_data.csv')
df.head()

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1
1,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0
2,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1
3,INTJ,'18/37 @.@|||Science is not perfect. No scien...,1,1,1,1
4,INFJ,"'No, I can't draw on my own nails (haha). Thos...",1,1,0,1


In [6]:
# processing the data
def preprocess_string(x_str, return_joined=True):
    '''Returns a cleaned string specifically from the MBTI dataset.
    If return_joined is True, the tokens are joined into a single string so
    that it can be passed into SciKit learn's frequency counter. Otherwise
    the tokens are returned as a list.'''

    # lower
    x_str = x_str.lower()

    # remove |||
    x_str = re.sub("[]|||[]", " ", x_str)

    # remove http links
    x_str = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x_str)

    # remove puncuation
    x_str = "".join([ci for ci in x_str if ci not in string.punctuation])

    # tokenise
    tokens = nltk.word_tokenize(x_str)

    # stem
    #porter = nltk.PorterStemmer()
    #stemmed_tokens = [porter.stem(token) for token in tokens]
    # lemmatize
    lemm = WordNetLemmatizer()
    stemmed_tokens = [lemm.lemmatize(token) for token in tokens]

    # remove stop words
    stopped_tokens = [ti for ti in stemmed_tokens if ti not in stopwords.words("english")]

    # remove MBTI types
    MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                  'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
    MBTI_types = [ti.lower() for ti in MBTI_types]

    final_tokens = [wi for wi in stopped_tokens if wi not in MBTI_types]

    if return_joined:
        return " ".join([ci for ci in final_tokens])

    return final_tokens

In [7]:
df['processed_post'] = df['posts'].apply(lambda x:preprocess_string(x,True))

In [8]:
df.head()

Unnamed: 0,type,posts,IE,NS,TF,JP,processed_post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1,moment sportscenter top ten play prank ha life...
1,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0,good one course say know thats blessing curse ...
2,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1,dear enjoyed conversation day esoteric gabbing...
3,INTJ,'18/37 @.@|||Science is not perfect. No scien...,1,1,1,1,1837 science perfect scientist claim scientifi...
4,INFJ,"'No, I can't draw on my own nails (haha). Thos...",1,1,0,1,cant draw nail haha done professional nail yes...


## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df['processed_post']
y = df['type']

y_IE = df['IE']
y_NS = df['NS']
y_TF = df['TF']
y_JP = df['JP']

In [11]:
X_train, X_test, y_ie_train, y_ie_test = train_test_split(X, y_IE, test_size=0.3, random_state = 101)
X_train, X_test, y_ns_train, y_ns_test = train_test_split(X, y_NS, test_size=0.3, random_state = 101)
X_train, X_test, y_tf_train, y_tf_test = train_test_split(X, y_TF, test_size=0.3, random_state = 101)
X_train, X_test, y_jp_train, y_jp_test = train_test_split(X, y_JP, test_size=0.3, random_state = 101)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

## Random Forrest

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rf_pipeline_1 = Pipeline([
    ('vector',CountVectorizer(ngram_range = (2,2), analyzer='char')),
    ('tfidf',TfidfTransformer()),
    ('Classifier',RandomForestClassifier(n_estimators=10))
])

In [31]:
rf_pipeline_1.fit(X_train, y_ie_train)

Pipeline(memory=None,
         steps=[('vector',
                 CountVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=N...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0

In [32]:
predictions_1 = rf_pipeline_1.predict(X_test)

In [18]:
print(classification_report(y_ie_test, predictions_1))

              precision    recall  f1-score   support

           0       1.00      0.54      0.70        37
           1       0.83      1.00      0.91        83

    accuracy                           0.86       120
   macro avg       0.92      0.77      0.80       120
weighted avg       0.88      0.86      0.84       120



In [33]:
print(classification_report(y_ie_test, predictions_1))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88        37
           1       0.91      1.00      0.95        83

    accuracy                           0.93       120
   macro avg       0.96      0.89      0.92       120
weighted avg       0.94      0.93      0.93       120



In [20]:
df_test = pd.read_csv('dataFiles/dataFile.csv')
df_test.head()

Unnamed: 0,type,posts,IE,NS,TF,JP,processed_post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1,moment sportscent top ten play prank ha lifech...
1,ENTP,'I'm finding the lack of me in these posts ver...,0,1,1,0,im find lack post veri alarm sex bore posit of...
2,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0,good one cours say know bless curs doe absolut...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1,dear enjoy convers day esoter gab natur univer...
4,ENTJ,'You're fired.|||That's another silly misconce...,0,1,1,1,fire anoth silli misconcept approach logic go ...


In [34]:
predictions = rf_pipeline_1.predict(df_test['processed_post'])

In [22]:
print(classification_report(df_test['IE'], predictions))

              precision    recall  f1-score   support

           0       0.34      0.05      0.08      1999
           1       0.77      0.97      0.86      6676

    accuracy                           0.76      8675
   macro avg       0.56      0.51      0.47      8675
weighted avg       0.67      0.76      0.68      8675



In [35]:
print(classification_report(df_test['IE'], predictions))

              precision    recall  f1-score   support

           0       0.24      0.41      0.30      1999
           1       0.77      0.60      0.68      6676

    accuracy                           0.56      8675
   macro avg       0.51      0.51      0.49      8675
weighted avg       0.65      0.56      0.59      8675



In [37]:
lable = [df_test['IE'], df_test['NS'], df_test['TF'], df_test['JP']]