In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
from glob import glob
import logging

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from gensim.models import Word2Vec, Doc2Vec
# import tensorflow as tf

from Doc2Vec import Doc2Vec, GensimSVMSklearn
from Constants import SENTIMENTS, TRAINING_DATA, TESTING_DATA

In [None]:
# Configure the root logger to emit timestamped records at INFO level and above,
# then create the logger used by the cells in this notebook.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# Data Load

In [None]:
#####################
# Original review data
#####################
# base_dir = os.path.join('data', 'reviews')
# pos_dir = os.path.join(base_dir, 'POS')
# neg_dir = os.path.join(base_dir, 'NEG')

# training_pos_files = glob(os.path.join(pos_dir, 'cv[0-8]*.txt'))
# training_neg_files = glob(os.path.join(neg_dir, 'cv[0-8]*.txt'))

# testing_pos_files = glob(os.path.join(pos_dir, 'cv9*.txt'))
# testing_neg_files = glob(os.path.join(neg_dir, 'cv9*.txt'))

# d2v_training_files = [
#     *training_pos_files,
#     *training_neg_files,
#     *testing_pos_files,
#     *testing_neg_files
# ]
# d2v_testing_files = []

# gensim_sklearn = GensimSVMSklearn(
#     d2v_training_files=d2v_training_files,
#     d2v_epochs=100,
#     d2v_infer_epochs=50,
#     d2v_min_count=5,
#     d2v_vector_size=50,
#     d2v_window=5,
#     d2v_dm=0,
#     d2v_dm_concat=0,
#     d2v_dbow_words=1
# )

In [None]:
###########
# IMDB data
###########
# Layout assumed by the paths below:
#   imdb/train/{pos,neg,unsup}/*.txt  and  imdb/test/{pos,neg}/*.txt
base_dir = 'imdb'
pos_dir = 'pos'
neg_dir = 'neg'
unsup_dir = 'unsup'

train_dir = os.path.join(base_dir, 'train')
train_pos_dir = os.path.join(train_dir, pos_dir)
train_neg_dir = os.path.join(train_dir, neg_dir)
# train_dir already starts with base_dir, so base_dir must not be joined again
# (that would yield imdb/imdb/train/unsup and the glob would match nothing).
train_unsup_dir = os.path.join(train_dir, unsup_dir)

# The held-out split lives under 'test'. Pointing this at 'train' makes the
# "test" files identical to the training files and the test metrics meaningless.
test_dir = os.path.join(base_dir, 'test')
test_pos_dir = os.path.join(test_dir, pos_dir)
test_neg_dir = os.path.join(test_dir, neg_dir)

training_pos_files = glob(os.path.join(train_pos_dir, '*.txt'))
training_neg_files = glob(os.path.join(train_neg_dir, '*.txt'))

testing_pos_files = glob(os.path.join(test_pos_dir, '*.txt'))
testing_neg_files = glob(os.path.join(test_neg_dir, '*.txt'))

unsup_files = glob(os.path.join(train_unsup_dir, '*.txt'))

# Every available document (labelled train, labelled test and unlabelled) is
# listed for the doc2vec stage; no sentiment labels are attached at this point.
d2v_training_files = [
    *training_pos_files,
    *training_neg_files,
    *testing_pos_files,
    *testing_neg_files,
    *unsup_files
]
d2v_testing_files = []

In [None]:
def _sentiment_labels(pos_files, neg_files):
    """Return one review label per file: all positive labels, then all negative ones."""
    pos_labels = [SENTIMENTS.pos.review_label] * len(pos_files)
    neg_labels = [SENTIMENTS.neg.review_label] * len(neg_files)
    return np.array(pos_labels + neg_labels)


# Label order mirrors the [*pos_files, *neg_files] order in which the files are
# handed to the classifier further down.
y_train = _sentiment_labels(training_pos_files, training_neg_files)
y_test = _sentiment_labels(testing_pos_files, testing_neg_files)

In [None]:
# Visual check of the training labels: the positive block followed by the negative block.
y_train

# Sklearn Pipeline with Gensim

In [None]:
# gensim_sklearn = GensimSVMSklearn(
#     d2v_training_files=d2v_training_files,
#     d2v_epochs=100,
#     d2v_infer_epochs=50,
#     d2v_min_count=5,
#     d2v_vector_size=50,
#     d2v_window=5,
#     d2v_dm=0,
#     d2v_dm_concat=0,
#     d2v_dbow_words=1
# )

In [None]:
# NOTE(review): `gensim_sklearn` is used by the cells below, but every cell that
# creates it (constructing + training it, or this pickle load) is commented out,
# so a fresh kernel fails with NameError. Re-enable one of them before running on.
# The file must be opened for reading ('rb'): opening it with 'wb' truncates the
# saved model before pickle.load gets a chance to read it.
# Only unpickle files you created yourself - pickle.load can execute arbitrary code.
# with open('d2v_imdb.pkl', 'rb') as f:
#     gensim_sklearn = pickle.load(f)

In [None]:
# gensim_sklearn.train([*training_pos_files, *training_neg_files], y_train)

In [None]:
# Run the pipeline's test routine on the training documents (positives first,
# then negatives, matching the label order in y_train).
# NOTE(review): presumably reports classification performance - confirm in Doc2Vec.GensimSVMSklearn.test.
gensim_sklearn.test([*training_pos_files, *training_neg_files], y_train)

In [None]:
# Same test routine on the testing documents (positives first, then negatives,
# matching the label order in y_test).
gensim_sklearn.test([*testing_pos_files, *testing_neg_files], y_test)

## Cross Validation

In [None]:
# gensim_sklearn.cross_validate([*training_pos_files, *training_neg_files], y_train, folds=10)

## Grid Search

In [None]:
# gs_params = {
#     'doc2vec__epochs': (100,),
#     'doc2vec__infer_epochs': (50,),
#     'doc2vec__vector_size': (50,), 
#     'doc2vec__dm': (0,),
#     'doc2vec__dm_concat': (0,),
#     'doc2vec__dbow_words': (1,),
#     'doc2vec__window': (5,10,15,20),
#     'doc2vec__min_count': (5,)
# }
        
# gensim_sklearn.grid_search([*training_pos_files, *training_neg_files], y_train, gs_params)

In [None]:
# pd.DataFrame(gensim_sklearn.gs.cv_results_).to_json()

In [None]:
# gensim_sklearn.gs.best_params_

In [None]:
# gensim_sklearn.gs.best_score_

## Save Elements to Disk

In [None]:
# Persist the whole pipeline object. The file name matches the one read by the
# (commented-out) pickle-load cell earlier in this notebook; the previous name
# '.pkl' was a bare extension.
# NOTE(review): confirm 'd2v_imdb.pkl' is the intended target before re-enabling.
# with open('d2v_imdb.pkl', 'wb') as f:
#     pickle.dump(gensim_sklearn, f)

In [None]:
# gensim_sklearn.pipeline.named_steps['doc2vec'].model.save('doc2vec_model.gensim')

## Generate Embeddings

In [None]:
# training_pos_embeddings = gensim_sklearn.pipeline.named_steps['doc2vec'].transform(training_pos_files)
# training_neg_embeddings = gensim_sklearn.pipeline.named_steps['doc2vec'].transform(training_neg_files)
# testing_pos_embeddings = gensim_sklearn.pipeline.named_steps['doc2vec'].transform(testing_pos_files)
# testing_neg_embeddings = gensim_sklearn.pipeline.named_steps['doc2vec'].transform(testing_neg_files)

# Existing Word Embeddings

In [None]:
# Load the pre-trained word vectors stored under wiki_sg/.
# `Doc2Vec` in this notebook is the project-local class (`from Doc2Vec import
# Doc2Vec` shadows the gensim class imported just before it), so the word2vec
# file is loaded through gensim's Word2Vec loader instead.
# NOTE(review): assumes the file was written by gensim's own `save()`. If it is in
# the original C word2vec binary format, use
# gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) - confirm.
wiki_sg = Word2Vec.load(os.path.join('wiki_sg', 'word2vec.bin'))

In [None]:
# Display the loaded model's repr as a quick sanity check.
wiki_sg

# Original Methods

In [None]:
# #########
# # Doc2Vec
# #########

# use_d2v_pickle = False
# d2v_pickle_name = 'doc2vec_model.pkl'

# if use_d2v_pickle and os.path.isfile(d2v_pickle_name):
#     logger.info('Loading pickled d2v model')
#     with open(d2v_pickle_name, 'rb') as f:
#         d2v = pickle.load(f)
# else:
#     d2v = Doc2Vec(vector_size=50, epochs=40)

#     logger.info('Loading data')
#     d2v.load_data(training_files=d2v_training_files, testing_files=d2v_testing_files)

#     logger.info('Training doc2vec')
#     d2v.train()

#     with open(d2v_pickle_name, 'wb') as f:
#         pickle.dump(d2v, f)

# # logger.info('Testing doc2vec on the training data')
# # ranks_count, errors = d2v.test()
# # logger.info(ranks_count)

# use_embeddings_pickle = False
# embeddings_pickle_name = 'doc2vec_embeddings.pkl'

# if use_embeddings_pickle and os.path.isfile(embeddings_pickle_name):
#     logger.info('Loading pickled embeddings')
#     with open(embeddings_pickle_name, 'rb') as f:
#         embeddings = pickle.load(f)
# else:
#     logger.info('Obtaining embeddings')
#     embeddings = d2v.generate_embeddings(
#         training_pos_files=training_pos_files,
#         training_neg_files=training_neg_files,
#         testing_pos_files=testing_pos_files,
#         testing_neg_files=testing_neg_files
#     )

#     with open(embeddings_pickle_name, 'wb') as f:
#         pickle.dump(embeddings, f)

In [None]:
# X_train = np.array([
#     *embeddings[TRAINING_DATA][SENTIMENTS.pos.review_label],
#     *embeddings[TRAINING_DATA][SENTIMENTS.neg.review_label]
# ])
# X_test = np.array([
#     *embeddings[TESTING_DATA][SENTIMENTS.pos.review_label],
#     *embeddings[TESTING_DATA][SENTIMENTS.neg.review_label]
# ])

In [None]:
# #####
# # SVC
# #####
# logger.info('Training SVM with embeddings')
# svm = SVC()
# svm.train(X_train, y_train)

# logger.info('Testing SVM with embeddings')
# svm.cross_validate(X_train, y_train, folds=10)
# svm.test(X_train, y_train)
# svm.test(X_test, y_test)

# logger.info('Training SVM with pipeline embeddings')
# svm = SVC()
# svm.train(pipeline_X_train, y_train)

# logger.info('Testing SVM with pipeline embeddings')
# svm.cross_validate(pipeline_X_train, y_train, folds=10)
# svm.test(pipeline_X_train, y_train)
# svm.test(pipeline_X_test, y_test)

# Visualisations

## Gensim Instructions

In [None]:
# def reduce_dimensions(model):
#     num_dimensions = 2  # final num dimensions (2D, 3D, etc)

#     # extract the words & their vectors, as numpy arrays
#     vectors = np.asarray(model.wv.vectors)
#     labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

#     # reduce using t-SNE
#     tsne = TSNE(n_components=num_dimensions, random_state=0)
#     vectors = tsne.fit_transform(vectors)

#     x_vals = [v[0] for v in vectors]
#     y_vals = [v[1] for v in vectors]
#     return x_vals, y_vals, labels


# x_vals, y_vals, labels = reduce_dimensions(gensim_sklearn.pipeline.named_steps['doc2vec'].model)

In [None]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw every label as text at its (x, y) position using plotly.

    Args:
        x_vals: x coordinate of each label.
        y_vals: y coordinate of each label.
        labels: text drawn at each point.
        plot_in_notebook: when True, initialise notebook mode and render with
            ``iplot``; otherwise hand the figure to ``plot`` with the file name
            ``word-embedding-plot.html``.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    text_trace = go.Scatter(mode='text', text=labels, x=x_vals, y=y_vals)
    figure_data = [text_trace]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_annotations=25, seed=0):
    """Scatter-plot the points and annotate a reproducible random subset of them.

    Args:
        x_vals: x coordinate of each point (indexable).
        y_vals: y coordinate of each point (indexable).
        labels: text label of each point (indexable, same length as the coordinates).
        num_annotations: maximum number of points to label; fewer are labelled
            when there are fewer points than this.
        seed: seed of the private random generator that picks the labelled points.
    """
    import random

    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # random.sample raises ValueError when asked for more items than exist, so
    # cap the request at the number of available points.
    sample_size = max(0, min(num_annotations, len(labels)))

    # A private generator yields the same picks as seeding the module-level one,
    # without resetting the shared `random` state for the rest of the notebook.
    rng = random.Random(seed)
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

In [None]:
# plot_with_plotly(x_vals, y_vals, labels)

In [None]:
# x_vals / y_vals / labels are only produced by the commented-out
# reduce_dimensions() cell above, so calling the plot directly fails with
# NameError on a fresh kernel. Rebuild them here the same way that cell did:
# reduce every word vector to 2-D with a seeded t-SNE.
# NOTE: t-SNE over the full vocabulary can take a long time.
_plot_wv = gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv
labels = np.asarray(_plot_wv.index_to_key)
_plot_coords = TSNE(n_components=2, random_state=0).fit_transform(np.asarray(_plot_wv.vectors))
x_vals, y_vals = _plot_coords[:, 0], _plot_coords[:, 1]

plot_with_matplotlib(x_vals, y_vals, labels)

## Get Word Embeddings

In [None]:
# One row per vocabulary word, in vocabulary-index order (the iteration order of
# `wv.key_to_index`). Copying `wv.vectors` yields the same matrix as looking each
# word up individually through the model and stacking the results, without the
# per-word Python overhead; the copy keeps the model's own array untouched.
gensim_word_embeddings = gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv.vectors.copy()

In [None]:
# gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv.save_word2vec_format('gensim_word_embeddings.gensim')

In [None]:
# !python -m gensim.scripts.word2vec2tensor -i gensim_word_embeddings.gensim -o gensim_word_embeddings.tsv

In [None]:
# Sanity check of the learned word vectors: nearest neighbours of 'good'.
gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv.most_similar('good')

## Lexicon Data

In [None]:
# Parse the sentiment lexicon into {word: [magnitude, polarity]}.
# Each line is a run of whitespace-separated `key=value` fields; by position,
# field 0 carries the magnitude, field 2 the word and field 5 the polarity.
lexicon_dict = {}
with open('data/sent_lexicon', 'r') as f:
    for line in f:
        fields = line.split()  # split once rather than once per extracted field
        if not fields:
            continue  # tolerate blank lines instead of failing with IndexError
        magnitude = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        polarity = fields[5].split("=")[1]
        lexicon_dict[word] = [magnitude, polarity]

# Keep only the lexicon words the trained model has a word vector for.
lexicon_word_vectors = gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv
lexicon_pos_words = [w for w, t in lexicon_dict.items() if t[1] == 'positive' and w in lexicon_word_vectors]
lexicon_neg_words = [w for w, t in lexicon_dict.items() if t[1] == 'negative' and w in lexicon_word_vectors]
len(lexicon_pos_words), len(lexicon_neg_words)

In [None]:
def _stack_word_vectors(words):
    """Return an array holding one doc2vec word-vector row per word, in the given order."""
    keyed_vectors = gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv
    return np.array([keyed_vectors[word] for word in words])


lexicon_pos_word_embeddings = _stack_word_vectors(lexicon_pos_words)
lexicon_neg_word_embeddings = _stack_word_vectors(lexicon_neg_words)
lexicon_pos_word_embeddings.shape, lexicon_neg_word_embeddings.shape

## PCA

In [None]:
# Fit the 2-component and 3-component PCA projections on the vocabulary embeddings.
pca_embeddings = gensim_word_embeddings
# pca_embeddings = np.vstack((lexicon_pos_word_embeddings, lexicon_neg_word_embeddings))

pca_2 = PCA(n_components=2)
pca_3 = PCA(n_components=3)

# PCA.fit returns the estimator itself, so `pca_embeds_2d` / `pca_embeds_3d` are
# the fitted PCA objects (the same objects as `pca_2` / `pca_3`), not projected data.
pca_embeds_2d, pca_embeds_3d = (
    estimator.fit(pca_embeddings) for estimator in (pca_2, pca_3)
)

pca_embeds_2d.explained_variance_ratio_, pca_embeds_3d.explained_variance_ratio_

### Lexicon Visualisation

In [None]:
# Project both lexicon embedding matrices onto the two fitted principal components.
lexicon_pos_words_pca, lexicon_neg_words_pca = (
    pca_embeds_2d.transform(embedding_matrix)
    for embedding_matrix in (lexicon_pos_word_embeddings, lexicon_neg_word_embeddings)
)
lexicon_pos_words_pca.shape, lexicon_neg_words_pca.shape

In [None]:
# Overlay the positive and negative lexicon words in the 2-D PCA plane.
lexicon_projections = {'pos': lexicon_pos_words_pca, 'neg': lexicon_neg_words_pca}
for label, embeds in lexicon_projections.items():
    plt.scatter(embeds[:, 0], embeds[:, 1], marker='x', label=label)
plt.legend()

In [None]:
# Project the lexicon word embeddings onto three principal components and plot
# them in 3-D, coloured by sentiment.
# lexicon_pos_words / lexicon_neg_words were already restricted to in-vocabulary
# words when they were built, so the embedding matrices computed from them are
# reused here instead of re-filtering and re-looking-up every word.
lexicon_pos_words_pca_3d = pca_embeds_3d.transform(lexicon_pos_word_embeddings)
lexicon_neg_words_pca_3d = pca_embeds_3d.transform(lexicon_neg_word_embeddings)

lexicon_pos_words_pca_df = pd.DataFrame(lexicon_pos_words_pca_3d)
lexicon_pos_words_pca_df['sentiment'] = 'pos'

lexicon_neg_words_pca_df = pd.DataFrame(lexicon_neg_words_pca_3d)
lexicon_neg_words_pca_df['sentiment'] = 'neg'

# ignore_index gives every row a unique label; both inputs are numbered from 0,
# so a plain concat would leave duplicate index values.
lexicon_pca_df = pd.concat((lexicon_pos_words_pca_df, lexicon_neg_words_pca_df), ignore_index=True)
lexicon_pca_df['size'] = 0.8  # constant marker size for every point

# Columns 0, 1, 2 are the three principal-component coordinates.
px.scatter_3d(lexicon_pca_df, x=0, y=1, z=2, color='sentiment', size='size')

### Review Visualisation

In [None]:
# Gather the words, word vectors and document vector of one training review.
visualisation_review_ind = 123
doc2vec_step = gensim_sklearn.pipeline.named_steps['doc2vec']
review_word_vectors = doc2vec_step.model.wv
review_document = doc2vec_step.train_corpus[visualisation_review_ind]

# Only in-vocabulary words have a vector that can be plotted.
visualisation_review_words = [token for token in review_document.words if token in review_word_vectors]
visualisation_review_word_embeddings = np.array(
    [review_word_vectors[token] for token in visualisation_review_words]
)

# NOTE(review): assumes the document's first tag is its file name and that the
# file lives in train_pos_dir - confirm against how train_corpus is built.
review_path = os.path.join(train_pos_dir, review_document.tags[0])
visualisation_review_embedding = doc2vec_step.transform([review_path])

# Plot labels: the review's words plus a trailing 'review' entry.
visualisation_words = visualisation_review_words + ['review']

# Lexicon polarity of each review word; None for words missing from the lexicon.
visualisation_sentiment = [
    lexicon_dict[token][1] if token in lexicon_dict else None
    for token in visualisation_review_words
]
visualisation_pos_inds = [idx for idx, polarity in enumerate(visualisation_sentiment) if polarity == 'positive']
visualisation_neg_inds = [idx for idx, polarity in enumerate(visualisation_sentiment) if polarity == 'negative']
visualisation_sentiment_inds = visualisation_pos_inds + visualisation_neg_inds

In [None]:
# Project the review's word vectors and its document vector with the 2-component PCA.
visualisation_words_pca, visualisation_review_pca = (
    pca_2.transform(vectors)
    for vectors in (visualisation_review_word_embeddings, visualisation_review_embedding)
)

In [None]:
# Plot the review's lexicon words grouped by polarity, then the review itself.
sentiment_index_groups = {'pos': visualisation_pos_inds, 'neg': visualisation_neg_inds}
for label, lookup_ind in sentiment_index_groups.items():
    group_x = visualisation_words_pca[lookup_ind, 0]
    group_y = visualisation_words_pca[lookup_ind, 1]
    plt.scatter(group_x, group_y, marker='x', label=label)

review_x = visualisation_review_pca[-1, 0]
review_y = visualisation_review_pca[-1, 1]
plt.scatter(review_x, review_y, marker='x', label='review')
plt.legend()

# Write each sentiment word next to its marker, nudged so text and marker do not overlap.
label_offset = 0.02
for word_ind in visualisation_sentiment_inds:
    word_x = visualisation_words_pca[word_ind, 0]
    word_y = visualisation_words_pca[word_ind, 1]
    plt.text(word_x + label_offset, word_y + label_offset, visualisation_words[word_ind])

### Pang et al words

In [None]:
# Hand-picked indicator words for each sentiment (the section heading attributes
# them to Pang et al.).
pang_positive_words = [
    'love', 'wonderful', 'best', 'great', 'superb', 'still', 'beautiful',
]
pang_negative_words = [
    'bad', 'worst', 'stupid', 'waste', 'boring', 'terrible', 'awful',
]

In [None]:
# Look up the word vector of every indicator word; a word missing from the
# vocabulary makes the lookup fail.
pang_word_vectors = gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv
pang_positive_word_embeddings = np.array(
    [pang_word_vectors[term] for term in pang_positive_words]
)
pang_negative_word_embeddings = np.array(
    [pang_word_vectors[term] for term in pang_negative_words]
)
pang_positive_word_embeddings.shape, pang_negative_word_embeddings.shape

In [None]:
# Project the indicator-word embeddings with the 2-component PCA.
pang_positive_word_pca, pang_negative_word_pca = (
    pca_2.transform(word_matrix)
    for word_matrix in (pang_positive_word_embeddings, pang_negative_word_embeddings)
)

In [None]:
# Pick one positive and one negative training document and project their
# document embeddings with the 2-component PCA.
pos_doc_ind = 10
neg_doc_ind = 10

# The embeddings are inferred here rather than read from
# training_pos_embeddings / training_neg_embeddings: the only cell that defines
# those matrices ("Generate Embeddings") is commented out, so indexing them
# fails with NameError on a fresh kernel.
doc_pca_d2v_step = gensim_sklearn.pipeline.named_steps['doc2vec']
pos_doc_embedding = doc_pca_d2v_step.transform([training_pos_files[pos_doc_ind]])
neg_doc_embedding = doc_pca_d2v_step.transform([training_neg_files[neg_doc_ind]])

# transform() on a one-file list is fed straight to pca_2.transform (as in the
# review visualisation above); [0] extracts the single projected row.
pos_doc_pca = pca_2.transform(pos_doc_embedding)[0]
neg_doc_pca = pca_2.transform(neg_doc_embedding)[0]

In [None]:
# Indicator words by sentiment, followed by one example document per sentiment.
pang_word_groups = (('pos', pang_positive_word_pca), ('neg', pang_negative_word_pca))
for group_label, group_points in pang_word_groups:
    plt.scatter(group_points[:, 0], group_points[:, 1], marker='x', label=group_label)

pang_doc_points = (('pos_doc', pos_doc_pca), ('neg_doc', neg_doc_pca))
for doc_label, doc_point in pang_doc_points:
    plt.scatter(doc_point[0], doc_point[1], marker='x', label=doc_label)
plt.legend()

# Write each word next to its marker, nudged so text and marker do not overlap.
pang_all_words = pang_positive_words + pang_negative_words
pang_all_points = np.vstack((pang_positive_word_pca, pang_negative_word_pca))
for word, (x, y) in zip(pang_all_words, pang_all_points):
    plt.text(x + 0.05, y + 0.05, word)

## TSNE

In [None]:
logger.info('Training T-SNE model')

# random_state pins the otherwise stochastic layout so that re-running the
# notebook reproduces the same plot.
tsne = TSNE(n_components=2, verbose=3, n_jobs=-1, random_state=0)
# tsne = TSNE(n_components=2, early_exaggeration=12.0, learning_rate='auto', init='random', verbose=3)

tsne_results = tsne.fit_transform(gensim_word_embeddings)

# `tsne_words` was never defined anywhere in the notebook. The rows of
# gensim_word_embeddings follow the key order of wv.key_to_index, so the same
# key order labels each t-SNE row.
tsne_words = list(gensim_sklearn.pipeline.named_steps['doc2vec'].model.wv.key_to_index)
assert len(tsne_words) == len(tsne_results), 'one word label is needed per t-SNE row'

# Dataframe construction
tsne_df = pd.DataFrame({
    'tsne-one': tsne_results[:,0],
    'tsne-two': tsne_results[:,1],
    # 'tsne-three': tsne_results[:,2],
    'word': tsne_words
})
tsne_df.head()


In [None]:
# Keep the t-SNE rows whose word is in the lexicon and attach the lexicon's
# [magnitude, polarity] pair as two separate columns.
in_lexicon = tsne_df['word'].isin(lexicon_dict)
tsne_lexicon_df = tsne_df.loc[in_lexicon].copy()
tsne_lexicon_df['lexicon'] = tsne_lexicon_df['word'].map(lexicon_dict)
lexicon_columns = pd.DataFrame(
    tsne_lexicon_df['lexicon'].tolist(),
    index=tsne_lexicon_df.index,
)
tsne_lexicon_df[['magnitude', 'sentiment']] = lexicon_columns

# Append the row(s) for the word 'review' and give them their own colour category.
review_rows = tsne_df.loc[tsne_df['word'] == 'review']
tsne_lexicon_df = pd.concat((tsne_lexicon_df, review_rows))
tsne_lexicon_df.loc[tsne_lexicon_df['word'] == 'review', 'sentiment'] = 'review'

In [None]:
# Interactive 2-D t-SNE scatter of the lexicon words, coloured by sentiment; hovering shows the word.
px.scatter(tsne_lexicon_df , x='tsne-one', y='tsne-two', color='sentiment', hover_name='word')

In [None]:
# Restrict the t-SNE frame to the indicator words and tag each with its sentiment.
pang_all_terms = [*pang_positive_words, *pang_negative_words]
tsne_pang_df = tsne_df.loc[tsne_df['word'].isin(pang_all_terms)].copy()

is_pang_positive = tsne_pang_df['word'].isin(pang_positive_words)
tsne_pang_df.loc[is_pang_positive, 'sentiment'] = 'pos'
is_pang_negative = tsne_pang_df['word'].isin(pang_negative_words)
tsne_pang_df.loc[is_pang_negative, 'sentiment'] = 'neg'
tsne_pang_df

In [None]:
# Interactive 2-D t-SNE scatter of the indicator words, coloured by sentiment; hovering shows the word.
px.scatter(tsne_pang_df , x='tsne-one', y='tsne-two', color='sentiment', hover_name='word')

In [None]:
# px.scatter_3d(tsne_df, x='tsne-3d-one', y='tsne-3d-two', z='tsne-3d-three', color='y', size='size')

In [None]:
# Static seaborn version of the indicator-word t-SNE scatter, coloured by sentiment.
plt.figure(figsize=(16, 10))
pang_scatter_kwargs = dict(
    data=tsne_pang_df,
    x="tsne-one",
    y="tsne-two",
    hue="sentiment",
    # palette=sns.color_palette("hls", 10),
    legend="full",
    alpha=0.3,
)
sns.scatterplot(**pang_scatter_kwargs)

In [None]:
# Training labels followed by testing labels as a single 1-D array (displayed only, not stored).
np.hstack((y_train, y_test))