In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
sns.set_style('darkgrid')

In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
df = pd.read_csv('dataFiles/toy_data.csv')
df.head()

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1
1,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0
2,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1
3,INTJ,'18/37 @.@|||Science is not perfect. No scien...,1,1,1,1
4,INFJ,"'No, I can't draw on my own nails (haha). Thos...",1,1,0,1


In [6]:
# processing the data
def preprocess_string(x_str, return_joined=True):
    '''Returns a cleaned string specifically from the MBTI dataset.
    If return_joined is True, the tokens are joined into a single string so
    that it can be passed into SciKit learn's frequency counter. Otherwise
    the tokens are returned as a list.'''

    # lower
    x_str = x_str.lower()

    # remove |||
    x_str = re.sub("[]|||[]", " ", x_str)

    # remove http links
    x_str = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', x_str)

    # remove puncuation
    x_str = "".join([ci for ci in x_str if ci not in string.punctuation])

    # tokenise
    tokens = nltk.word_tokenize(x_str)

    # stem
    #porter = nltk.PorterStemmer()
    #stemmed_tokens = [porter.stem(token) for token in tokens]
    # lemmatize
    lemm = WordNetLemmatizer()
    stemmed_tokens = [lemm.lemmatize(token) for token in tokens]

    # remove stop words
    stopped_tokens = [ti for ti in stemmed_tokens if ti not in stopwords.words("english")]

    # remove MBTI types
    MBTI_types = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                  'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
    MBTI_types = [ti.lower() for ti in MBTI_types]

    final_tokens = [wi for wi in stopped_tokens if wi not in MBTI_types]

    if return_joined:
        return " ".join([ci for ci in final_tokens])

    return final_tokens

In [7]:
df['processed_post'] = df['posts'].apply(lambda x:preprocess_string(x,True))

In [8]:
df.head()

Unnamed: 0,type,posts,IE,NS,TF,JP,processed_post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1,moment sportscenter top ten play prank ha life...
1,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0,good one course say know thats blessing curse ...
2,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1,dear enjoyed conversation day esoteric gabbing...
3,INTJ,'18/37 @.@|||Science is not perfect. No scien...,1,1,1,1,1837 science perfect scientist claim scientifi...
4,INFJ,"'No, I can't draw on my own nails (haha). Thos...",1,1,0,1,cant draw nail haha done professional nail yes...


## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = df['processed_post']
y = df['type']

y_IE = df['IE']
y_NS = df['NS']
y_TF = df['TF']
y_JP = df['JP']

In [11]:
X_train, X_test, y_ie_train, y_ie_test = train_test_split(X, y_IE, test_size=0.3, random_state = 101)
X_train, X_test, y_ns_train, y_ns_test = train_test_split(X, y_NS, test_size=0.3, random_state = 101)
X_train, X_test, y_tf_train, y_tf_test = train_test_split(X, y_TF, test_size=0.3, random_state = 101)
X_train, X_test, y_jp_train, y_jp_test = train_test_split(X, y_JP, test_size=0.3, random_state = 101)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score