In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import re
import pickle

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import pos_tag

import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm

from textblob import TextBlob

from sklearn.utils import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from mbti import run_models, preprocess

In [3]:
df = pd.read_csv('./data/cafe_clean_new.csv')

In [4]:
df.dropna(inplace=True)

In [5]:
train_df, test_df = train_test_split(df)

In [6]:
train_df.reset_index(inplace=True)
test_df.reset_index(inplace=True)

In [7]:
def tokenize_gens(dataframe, tokens_only=False):
    for i, post in enumerate(dataframe['joined_tokens']):
        tokens = gensim.utils.simple_preprocess(post)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, dataframe.loc[i, 't/f'])

train_corpus = list(tokenize_gens(train_df))
test_corpus = list(tokenize_gens(test_df))

In [8]:
print(train_corpus[:2])

[TaggedDocument(words=['last', 'christmas', 'celebrate', 'entire', 'universe', 'ever', 'movie', 'vendetta', 'valeries', 'letter', 'silver', 'lining', 'playbook', 'dinner', 'eggplant', 'scene', 'rain', 'send', 'lenovo', 'use', 'tapatalk', 'everything', 'make', 'perfect', 'sense', 'head', 'reality', 'doesnt', 'seem', 'catch', 'sent', 'lenovo', 'use', 'tapatalk', 'radiohead', 'placebo', 'depeche', 'mode', 'queen', 'stone', 'age', 'muse', 'eel', 'nick', 'cave', 'nirvana', 'sent', 'lenovo', 'use', 'tapatalk', 'send', 'lenovo', 'use', 'tapatalk', 'well', 'starter', 'free', 'love', 'personally', 'nothing', 'pardon', 'french', 'fuck', 'around', 'open', 'relationship', 'free', 'love', 'right', 'apology', 'enigmatic', 'case', 'whole', 'point', 'get', 'notice', 'let', 'alone', 'someone', 'propose', 'ever', 'even', 'person', 'would', 'soulmate', 'usually', 'conventional', 'way', 'believe', 'intention', 'take', 'consideration', 'seem', 'manipulative', 'surface', 'especially', 'try', 'help', 'someon

In [9]:
print(test_corpus[:2])

[TaggedDocument(words=['ive', 'get', 'skeleton', 'pun', 'mastoid', 'actually', 'theyre', 'little', 'hard', 'come', 'might', 'fib', 'ula', 'xiphoid', 'look', 'today', 'nothing', 'wrong', 'long', 'process', 'today', 'discover', 'favorite', 'garden', 'free', 'student', 'great', 'theyre', 'perfect', 'relax', 'there', 'quiet', 'place', 'full', 'swing', 'call', 'cathedral', 'sittapygmaea', 'thanks', 'ill', 'start', 'small', 'sound', 'good', 'also', 'may', 'mention', 'really', 'like', 'avatar', 'username', 'ive', 'miss', 'see', 'nutchatches', 'lot', 'since', 'move', 'wish', 'could', 'set', 'boundary', 'friend', 'like', 'big', 'pet', 'peeve', 'dont', 'like', 'call', 'stupid', 'even', 'joke', 'way', 'feel', 'like', 'id', 'get', 'call', 'want', 'post', 'regularly', 'keep', 'delete', 'get', 'frustrate', 'every', 'time', 'come', 'back', 'havent', 'write', 'lately', 'either', 'think', 'expression', 'form', 'know', 'suedeswede', 'thought', 'exactly', 'gender', 'feel', 'like', 'weird', 'ritual', 'som

In [10]:
model = gensim.models.doc2vec.Doc2Vec(dm=1, vector_size=50, min_count=2, epochs=40)

In [11]:
model.build_vocab(train_corpus)

In [12]:
print(f"Word 'feel' appeared {model.wv.get_vecattr('feel', 'count')} times in the training corpus.")
print(f"Word 'think' appeared {model.wv.get_vecattr('think', 'count')} times in the training corpus.")


Word 'feel' appeared 22469 times in the training corpus.
Word 'think' appeared 49119 times in the training corpus.


In [13]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [14]:
vector = model.infer_vector(['unaware', 'people', 'dont', 'confidence', 'often', 'picture'])
print(vector)

[-0.11344896 -0.4965007  -0.41206655 -0.13912763  0.17313938 -0.15619417
 -0.20826718  0.2086914  -0.4389367   0.27128014  0.13080177 -0.19327503
  0.1321911   0.03304035 -0.05019431 -0.13650183  0.12333461  0.15993096
 -0.2727517  -0.22526816  0.29956594  0.05439552 -0.13513777 -0.20643973
  0.00316308 -0.07599627  0.04502578  0.3710139   0.33218023 -0.20041418
  0.1851082   0.21061155 -0.16035871  0.22556521 -0.13261831  0.0472893
  0.36776415 -0.28573623  0.39763233 -0.32951465  0.01215485  0.01757574
  0.25727788 -0.03497041 -0.14688474  0.11000869 -0.01370976 -0.12458228
  0.18366544  0.0531315 ]


In [15]:
# for doc_id in range(len(train_corpus)):
#     inferred_vector = model.infer_vector(train_corpus[doc_id].words)
#     sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

In [18]:
lst = [('t', 0.5856667757034302), ('f', 0.3399638831615448)]


<function list.index(value, start=0, stop=9223372036854775807, /)>

In [20]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    print(sims)
    # rank = [docid for docid, sim in sims].index(doc_id)
    # ranks.append(rank)

    # second_ranks.append(sims[1])

ValueError: 0 is not in list

In [21]:
def vector_for_learning(model, posts):
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in posts])
    return targets, feature_vectors

In [22]:
y_train, X_train = vector_for_learning(model, train_corpus)
y_test, X_test = vector_for_learning(model, test_corpus)

clf = SGDClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Testing accuracy: {accuracy_score(y_test, y_pred)}')

Testing accuracy: 0.7570308898109728


In [23]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Testing accuracy: {accuracy_score(y_test, y_pred)}')

Testing accuracy: 0.7570308898109728


In [None]:
model_dbow = Doc2Vec(dm=1, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, alpha=0.025, min_alpha=0.001)
model_dbow.build_vocab([x for x in tqdm(train_corpus)])
train_corpus  = shuffle(train_corpus)
model_dbow.train(train_corpus, total_examples=len(train_corpus), epochs=30)
def vector_for_learning(model, input_docs):
    sents = input_docs
    targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, feature_vectors

In [None]:
y_train, X_train = vector_for_learning(model, train_corpus)
y_test, X_test = vector_for_learning(model, test_corpus)

clf = SGDClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'Testing accuracy: {accuracy_score(y_test, y_pred)}')