In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local Doc2Vec / Constants modules) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import logging

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from Doc2Vec import Doc2Vec, SVMSklearn, TSNESklearn, GensimSVMSklearn
from Constants import SENTIMENTS, TRAINING_DATA, TESTING_DATA



In [3]:
# Configure the root logger for timestamped INFO-level output, then create
# the logger used by the rest of the notebook.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

In [4]:
#####################
# Original review data
#####################
# Directory layout: data/reviews/{POS,NEG}/cv*.txt
base_dir = os.path.join('data', 'reviews')
pos_dir = os.path.join(base_dir, 'POS')
neg_dir = os.path.join(base_dir, 'NEG')

# glob() returns matches in arbitrary, filesystem-dependent order. Sorting
# keeps the document order (and therefore the row order of every embedding
# matrix built from these lists) stable across machines and re-runs.
# Files whose name starts with cv0..cv8 form the training split.
training_pos_files = sorted(glob(os.path.join(pos_dir, 'cv[0-8]*.txt')))
training_neg_files = sorted(glob(os.path.join(neg_dir, 'cv[0-8]*.txt')))

# Files whose name starts with cv9 form the held-out test split.
testing_pos_files = sorted(glob(os.path.join(pos_dir, 'cv9*.txt')))
testing_neg_files = sorted(glob(os.path.join(neg_dir, 'cv9*.txt')))

# The doc2vec training file list deliberately contains every review,
# including the cv9 files; the separate doc2vec testing list is left empty.
d2v_training_files = [
    *training_pos_files,
    *training_neg_files,
    *testing_pos_files,
    *testing_neg_files
]
d2v_testing_files = []

In [5]:
# ###########
# # IMDB data
# ###########
# Disabled alternative data source. Uncommenting this cell rebinds the same
# variable names as the review-data cell (training/testing file lists and the
# doc2vec file lists), so only one of the two cells should be active.
# base_dir = 'imdb'
# pos_dir = 'pos'
# neg_dir = 'neg'
# unsup_dir = 'unsup'

# train_dir = os.path.join(base_dir, 'train')
# train_pos_dir = os.path.join(train_dir, pos_dir)
# train_neg_dir = os.path.join(train_dir, neg_dir)

# NOTE(review): this line used 'train', which made the test split identical to
# the training split. Assumes the dataset has a 'test' directory next to
# 'train' — confirm against the data on disk.
# test_dir = os.path.join(base_dir, 'test')
# test_pos_dir = os.path.join(test_dir, pos_dir)
# test_neg_dir = os.path.join(test_dir, neg_dir)

# training_pos_files = glob(os.path.join(train_pos_dir, '*.txt'))
# training_neg_files = glob(os.path.join(train_neg_dir, '*.txt'))
# testing_pos_files = glob(os.path.join(test_pos_dir, '*.txt'))
# testing_neg_files = glob(os.path.join(test_neg_dir, '*.txt'))

# NOTE(review): train_dir already starts with base_dir, so base_dir is not
# joined a second time here (that produced 'imdb/imdb/train/unsup').
# unsup_files = glob(os.path.join(train_dir, unsup_dir, '*.txt'))

# d2v_training_files = [
#     *training_pos_files,
#     *training_neg_files,
#     *unsup_files
# ]
# d2v_testing_files = []

In [6]:
def _sentiment_labels(pos_files, neg_files):
    """Build a label array with one positive label per positive file, followed
    by one negative label per negative file."""
    positives = [SENTIMENTS.pos.review_label] * len(pos_files)
    negatives = [SENTIMENTS.neg.review_label] * len(neg_files)
    return np.array(positives + negatives)


# Labels follow the same positives-then-negatives order as the file lists.
y_train = _sentiment_labels(training_pos_files, training_neg_files)
y_test = _sentiment_labels(testing_pos_files, testing_neg_files)

In [7]:
# Keyword settings forwarded to the project's GensimSVMSklearn wrapper.
# NOTE(review): the d2v_* names presumably map onto gensim Doc2Vec options
# (where dm=0 would select PV-DBOW) — confirm in the Doc2Vec module.
d2v_settings = dict(
    d2v_epochs=100,
    d2v_infer_epochs=50,
    d2v_min_count=5,
    d2v_vector_size=50,
    d2v_window=5,
    d2v_dm=0,
    d2v_dm_concat=0,
    d2v_dbow_words=1,
)
gensim_sklearn = GensimSVMSklearn(d2v_training_files=d2v_training_files, **d2v_settings)

In [9]:
# Fit the pipeline on the labelled training reviews; files are passed
# positives first, then negatives, matching the order of y_train.
labelled_training_files = [*training_pos_files, *training_neg_files]
gensim_sklearn.train(labelled_training_files, y_train)

In [None]:
# Use the pipeline's 'doc2vec' step on its own to obtain the document vectors
# for both splits (positives first, then negatives).
doc2vec_step = gensim_sklearn.pipeline.named_steps['doc2vec']
pipeline_X_train = doc2vec_step.transform([*training_pos_files, *training_neg_files])
pipeline_X_test = doc2vec_step.transform([*testing_pos_files, *testing_neg_files])

In [None]:
# Score the fitted pipeline on the data it was trained on.
seen_files = [*training_pos_files, *training_neg_files]
gensim_sklearn.test(seen_files, y_train)

In [None]:
# Score the fitted pipeline on the held-out cv9 reviews.
heldout_files = [*testing_pos_files, *testing_neg_files]
gensim_sklearn.test(heldout_files, y_test)

In [None]:
# Cross-validate the whole pipeline on the training split.
cv_files = [*training_pos_files, *training_neg_files]
gensim_sklearn.cross_validate(cv_files, y_train)

In [None]:
# gs_params = {
#     'doc2vec__epochs': (100,),
#     'doc2vec__infer_epochs': (50,),
#     'doc2vec__vector_size': (50,), 
#     'doc2vec__dm': (0,),
#     'doc2vec__dm_concat': (0,),
#     'doc2vec__dbow_words': (1,),
#     'doc2vec__window': (5,10,15,20),
#     'doc2vec__min_count': (5,)
# }
        
# gensim_sklearn.grid_search([*training_pos_files, *training_neg_files], y_train, gs_params)

In [None]:
# pd.DataFrame(gensim_sklearn.gs.cv_results_).to_json()

In [None]:
# gensim_sklearn.gs.best_params_

In [None]:
# gensim_sklearn.gs.best_score_

In [None]:
# Disabled: persist the fitted gensim_sklearn object with pickle.
# NOTE(review): '.pkl' looks like a placeholder file name (it would create a
# hidden file with no stem) — pick a real name before enabling.
# with open('.pkl', 'wb') as f:
    # pickle.dump(gensim_sklearn, f)

In [None]:
#########
# Doc2Vec
#########

def _load_or_compute(pickle_name, use_pickle, compute, loading_message):
    """Load a pickled artefact, or compute it and write it to disk.

    Parameters
    ----------
    pickle_name : str
        Path of the pickle file used as the cache.
    use_pickle : bool
        When true and ``pickle_name`` exists, the cached value is loaded.
    compute : callable
        Zero-argument callable producing the value when the cache is not used.
    loading_message : str
        INFO message logged just before a cached value is loaded.

    Returns
    -------
    tuple
        ``(value, loaded_from_cache)``.
    """
    if use_pickle and os.path.isfile(pickle_name):
        logger.info(loading_message)
        # pickle.load can execute arbitrary code; only point this at files
        # that this notebook wrote itself.
        with open(pickle_name, 'rb') as f:
            return pickle.load(f), True

    value = compute()
    # A freshly computed value always overwrites the cache file.
    with open(pickle_name, 'wb') as f:
        pickle.dump(value, f)
    return value, False


def _train_d2v():
    """Create a Doc2Vec wrapper, load the configured files and train it."""
    model = Doc2Vec(vector_size=50, epochs=40)

    logger.info('Loading data')
    model.load_data(training_files=d2v_training_files, testing_files=d2v_testing_files)

    logger.info('Training doc2vec')
    model.train()
    return model


def _embed_reviews():
    """Generate embeddings for all four review file lists with the current d2v model."""
    logger.info('Obtaining embeddings')
    return d2v.generate_embeddings(
        training_pos_files=training_pos_files,
        training_neg_files=training_neg_files,
        testing_pos_files=testing_pos_files,
        testing_neg_files=testing_neg_files
    )


use_d2v_pickle = False
d2v_pickle_name = 'doc2vec_model.pkl'

d2v, d2v_from_cache = _load_or_compute(
    d2v_pickle_name, use_d2v_pickle, _train_d2v, 'Loading pickled d2v model'
)

# logger.info('Testing doc2vec on the training data')
# ranks_count, errors = d2v.test()
# logger.info(ranks_count)

use_embeddings_pickle = False
embeddings_pickle_name = 'doc2vec_embeddings.pkl'

embeddings, embeddings_from_cache = _load_or_compute(
    embeddings_pickle_name, use_embeddings_pickle, _embed_reviews, 'Loading pickled embeddings'
)

if embeddings_from_cache and not d2v_from_cache:
    # The embeddings on disk were produced by an earlier model, not the one
    # that was just trained above.
    logger.warning('Cached embeddings were loaded although the doc2vec model was retrained; '
                   'they may not correspond to the current model')

In [None]:
# Assemble the feature matrices from the generated embeddings, positives first
# and negatives second, the same order used when building y_train / y_test.
train_vectors = embeddings[TRAINING_DATA]
test_vectors = embeddings[TESTING_DATA]
pos_label = SENTIMENTS.pos.review_label
neg_label = SENTIMENTS.neg.review_label

X_train = np.array([*train_vectors[pos_label], *train_vectors[neg_label]])
X_test = np.array([*test_vectors[pos_label], *test_vectors[neg_label]])

In [None]:
# Compare the shape of the manually generated training embeddings with the
# shape of the embeddings produced by the pipeline's doc2vec step.
manual_shape = X_train.shape
pipeline_shape = pipeline_X_train.shape
manual_shape, pipeline_shape

In [None]:
#####
# SVC
#####
def _fit_and_score_svm(description, train_features, test_features):
    """Train a fresh SVMSklearn on one feature set and run the standard checks.

    The model is trained on ``train_features`` / ``y_train``, cross-validated
    with 10 folds on the same data, then tested on the training features and
    on ``test_features`` / ``y_test``.

    Parameters
    ----------
    description : str
        Text appended to the log messages to identify the feature set.
    train_features, test_features
        Feature matrices whose rows line up with ``y_train`` and ``y_test``.

    Returns
    -------
    tuple
        ``(model, result)`` where ``result`` is whatever the final
        ``model.test(test_features, y_test)`` call returns.
    """
    logger.info('Training SVM with %s', description)
    model = SVMSklearn()
    model.train(train_features, y_train)

    logger.info('Testing SVM with %s', description)
    model.cross_validate(train_features, y_train, folds=10)
    model.test(train_features, y_train)
    return model, model.test(test_features, y_test)


# Same evaluation for both embedding sources: the manually generated
# embeddings first, then the ones produced by the sklearn pipeline step.
_fit_and_score_svm('embeddings', X_train, X_test)
svm, pipeline_test_result = _fit_and_score_svm(
    'pipeline embeddings', pipeline_X_train, pipeline_X_test
)
# Keep the held-out result of the last evaluation as the cell's displayed value.
pipeline_test_result

In [None]:
# Maximum number of rows taken from each end of every split.
samps = 5000


def _head_and_tail(arr, count):
    """Return the first and last ``count`` rows of ``arr`` without repeats.

    The arrays are ordered positives first, negatives last, so taking both
    ends samples both classes. When ``arr`` has at most ``2 * count`` rows the
    head and tail slices would overlap (for short arrays each slice is the
    whole array), which would duplicate rows; in that case the array is
    returned unchanged.
    """
    if len(arr) <= 2 * count:
        return arr
    return np.concatenate((arr[:count], arr[-count:]))


# Rows of X and entries of y stay aligned because the same selection is
# applied to the features and the labels of each split.
X = np.vstack((_head_and_tail(X_train, samps), _head_and_tail(X_test, samps)))
y = np.hstack((_head_and_tail(y_train, samps), _head_and_tail(y_test, samps)))

In [None]:
#####
# PCA
#####
# Project the sampled embeddings onto their leading principal components and
# display the shape of the projected matrix.
n_pca_components = 10
pca = PCA(n_components=n_pca_components)
X_pca = pca.fit_transform(X)
X_pca.shape

In [None]:
#######
# T-SNE
#######
logger.info('Training T-SNE model')

# tsne = TSNESklearn()
# tsne_results = tsne.fit_transform(X)

# init='random' makes the embedding depend on the random number generator, so
# a fixed random_state is needed for the plots to be reproducible on re-run.
tsne = TSNE(
    early_exaggeration=12.0,
    n_components=2,
    learning_rate='auto',
    init='random',
    random_state=42,
    verbose=3
)
tsne_results = tsne.fit_transform(X)

In [None]:
# Display the array returned by tsne.fit_transform(X) as a quick sanity check.
tsne_results

In [None]:
# Collect the first two t-SNE coordinates, the labels and a constant marker
# size into one frame for plotting. The 'tsne-3d-*' column names are kept
# because the plotting cells refer to them.
first_axis = tsne_results[:, 0]
second_axis = tsne_results[:, 1]
marker_size = np.full(len(y), 2.0)

tsne_df = pd.DataFrame({
    'tsne-3d-one': first_axis,
    'tsne-3d-two': second_axis,
    # 'tsne-3d-three': tsne_results[:,2],
    'y': y,
    'size': marker_size
})

In [None]:
from collections import Counter

# Tally how many plotted rows carry each label.
label_counts = Counter(tsne_df['y'])
label_counts

In [None]:
# NOTE(review): W2VTransformer is not referenced in any cell visible here, and
# the gensim.sklearn_api package was removed in gensim 4.0, so this import
# presumably fails on current gensim — confirm and drop it if unused.
from gensim.sklearn_api import W2VTransformer

In [None]:
# Interactive 2-D scatter of the t-SNE coordinates, coloured by label.
tsne_scatter = px.scatter(
    tsne_df,
    x='tsne-3d-one',
    y='tsne-3d-two',
    color='y',
)
tsne_scatter

In [None]:
# The 3-D scatter needs a third coordinate column. tsne_df only has one when
# t-SNE is run with three components and the 'tsne-3d-three' column is added,
# so skip the plot (instead of raising) when the column is absent.
if 'tsne-3d-three' in tsne_df.columns:
    tsne_scatter_3d = px.scatter_3d(
        tsne_df, x='tsne-3d-one', y='tsne-3d-two', z='tsne-3d-three', color='y', size='size'
    )
else:
    logger.warning("Skipping 3-D scatter: tsne_df has no 'tsne-3d-three' column")
    tsne_scatter_3d = None
tsne_scatter_3d

In [None]:
# Static version of the 2-D scatter. tsne_df stores the coordinates under the
# 'tsne-3d-one' / 'tsne-3d-two' names; 'tsne-2d-*' columns do not exist, so
# referring to them raised an error.
fig, ax = plt.subplots(figsize=(16, 10))
sns.scatterplot(
    x="tsne-3d-one", y="tsne-3d-two",
    hue="y",
    # palette=sns.color_palette("hls", 10),
    data=tsne_df,
    legend="full",
    alpha=0.3,
    ax=ax
)

In [None]:
# All labels in one array: the training labels followed by the test labels.
np.concatenate((y_train, y_test))