# **PACKAGES**

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import ast
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.test.utils import get_tmpfile
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# **LOAD DATA**

In [None]:
# Path to your dataset (a gzip-compressed CSV).
reviews_path = "data/clean_full.csv.gzip"

# The compression is stated explicitly and both review date columns are
# parsed as datetimes while loading.
reviews = pd.read_csv(
    reviews_path,
    compression="gzip",
    low_memory=False,
    parse_dates=["review_date", "review_date_diner"],
)

In [None]:
# (rows, columns) of the full reviews table that was just loaded.
reviews.shape

In [None]:
# Preview the first rows of the loaded table.
reviews.head()

In [None]:
# Keep only the rows whose `grp` column equals "cap"; the selected rows keep
# the row labels they had in `reviews`.
is_cap_group = reviews["grp"] == "cap"
cap_reviews = reviews[is_cap_group]

In [None]:
# Each cell of `review_content_clean` holds the text of a Python literal;
# ast.literal_eval turns it back into the corresponding Python object
# (presumably a list of tokens per review - TODO confirm against the dataset).
corpus = cap_reviews["review_content_clean"].map(ast.literal_eval).tolist()

In [None]:
def finalize_cleaning(reviews):
    """Apply the last clean-up pass to the tokens of a single review.

    Every token goes through the same ordered substitutions:

    1. runs of backticks are removed;
    2. runs of dots are collapsed to a single dot;
    3. a dot followed by one word character is removed (both characters);
    4. each apostrophe is replaced by a dot.

    Tokens left with at most one character are then dropped.

    Parameters
    ----------
    reviews : iterable of str
        The tokens of one review.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Order matters: the apostrophe -> dot replacement runs last, so the dots
    # it introduces are not touched by the earlier dot rules.
    substitutions = (
        (r"\`+", ""),
        (r"\.+", "."),
        (r"\.\w", ""),
        (r"\'", "."),
    )

    cleaned = []
    for token in reviews:
        for pattern, replacement in substitutions:
            token = re.sub(pattern, replacement, token)
        if len(token) > 1:
            cleaned.append(token)
    return cleaned

In [None]:
# Run the final clean-up over every review; `corpus` stays a list with one
# token list per review.
corpus = [finalize_cleaning(review) for review in corpus]

# **BUILDING YOUR OWN WORD2VEC**

In [None]:
path = get_tmpfile("word2vec.model")

# Only the first 3000 reviews are used, both for the vocabulary and for training.
training_reviews = corpus[:3000]

# NOTE(review): `size`, `iter` and `model.iter` are the gensim 3.x names;
# gensim 4 renamed them (vector_size / epochs) - confirm the pinned version.
model = gensim.models.Word2Vec(
    size=300,
    window=3,
    min_count=5,
    workers=4,
    seed=1,
    iter=50,
)
model.build_vocab(training_reviews)
model.train(
    training_reviews,
    total_examples=model.corpus_count,
    epochs=model.iter,
)

# The model is written to the working directory under this file name.
model.save("word2vec.model")

In [None]:
# Reload the model from the file written by the training cell.
model = gensim.models.Word2Vec.load("word2vec.model")

In [None]:
# Show the learned vector for one token as a plain list.
# NOTE(review): this lookup fails if "brrrr" is not in the trained vocabulary.
list(model.wv["brrrr"])

In [None]:
# The 10 vocabulary entries the model ranks as most similar to "dessert".
model.wv.most_similar("dessert", topn=10)

# **CREATE THE WORD EMBEDDING MATRIX**

In [None]:
# One column per vocabulary word; each column holds that word's vector.
word_vectors = {
    token: list(model.wv[token])
    for token in model.wv.vocab.keys()
}

embedding_matrix = pd.DataFrame(word_vectors)

In [None]:
# Preview: columns are vocabulary words, rows are vector components.
embedding_matrix.head()

In [None]:
# (vector components, vocabulary words)
embedding_matrix.shape

# **DISCOVERING KNOWLEDGE WITH EMBEDDING MATRIX**

## **DIMENSION REDUCTION (PCA)**

In [None]:
# Fit a 3-component PCA on the transposed matrix, i.e. one row per word.
# random_state is pinned so that a Restart & Run All gives the same
# projection even if scikit-learn selects a randomized SVD solver for this
# input size.
pca = PCA(n_components=3, random_state=1)
pca.fit(embedding_matrix.T)

In [None]:
# Cumulative share of variance explained by the first 1, 2 and 3 components.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Project every word onto the fitted components: one row per word,
# one named column per principal component.
word_projection = pca.transform(embedding_matrix.T)
pca_data = pd.DataFrame(word_projection, columns=["PC1", "PC2", "PC3"])

In [None]:
# Preview the projected coordinates.
pca_data.head()

In [None]:
# (words, 3 principal components)
pca_data.shape

In [None]:
def label_point(x, y, val, ax, x_offset=.02):
    """Write a text label next to each point on ``ax``.

    ``x``, ``y`` and ``val`` are combined column-wise with ``pd.concat``, so
    they are matched by index label rather than by position.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates.
    val : pandas.Series
        Label for each point; rendered with ``str()``.
    ax : object with a ``text(x, y, s)`` method
        Target axes (for example a matplotlib ``Axes``).
    x_offset : float, optional
        Horizontal shift applied to every label so that the text does not sit
        on top of its marker. Defaults to 0.02, the previously fixed value.

    Returns
    -------
    None
    """
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, point in points.iterrows():
        ax.text(point['x'] + x_offset, point['y'], str(point['val']))

In [None]:
fig, ax = plt.subplots(figsize=(13, 9))
ax.set_title("Word embeddings projected on the first two principal components")

# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
sns.scatterplot(x=pca_data.PC1, y=pca_data.PC2, ax=ax)

# Annotate every point with its word (the columns of embedding_matrix).
label_point(pca_data.PC1, pca_data.PC2, pd.Series(embedding_matrix.columns), ax)

## **DIMENSION REDUCTION (T-SNE)**

In [None]:
# t-SNE is stochastic; random_state is pinned so that re-running the notebook
# reproduces the same 2-D layout. One input row per word.
X_embedded = TSNE(
    n_components=2,
    perplexity=5,
    learning_rate=300,
    random_state=1,
).fit_transform(embedding_matrix.T)

In [None]:
# Wrap the t-SNE output in a DataFrame and name its two columns.
tsne_frame = pd.DataFrame(X_embedded)
tsne_frame.columns = ["Dimension1", "Dimension2"]
X_embedded = tsne_frame

In [None]:
# Preview the 2-D t-SNE coordinates.
X_embedded.head()

In [None]:
fig, ax = plt.subplots(figsize=(13, 9))
ax.set_title("Word embeddings projected with t-SNE")

# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
sns.scatterplot(x=X_embedded.Dimension1, y=X_embedded.Dimension2, ax=ax)
# Word labels are left disabled for this plot.
#label_point(X_embedded.Dimension1, X_embedded.Dimension2, pd.Series(embedding_matrix.columns), plt.gca())

# **GENERATING THE REVIEWS EMBEDDING MATRIX**

In [None]:
def _sum_known_word_vectors(tokens):
    """Return the component-wise sum of the vectors of the tokens the model knows.

    Tokens missing from the model vocabulary are skipped. When no token is
    known the result is an empty list.
    """
    known_vectors = []
    for token in tokens:
        try:
            known_vectors.append(list(model.wv[token]))
        except KeyError:
            # Out-of-vocabulary token: deliberately ignored.
            continue
    return [sum(component) for component in zip(*known_vectors)]


# One summed vector per review, in the same order as `corpus`.
vectors = [_sum_known_word_vectors(review_content) for review_content in corpus]

In [None]:
review_embedding = pd.DataFrame(vectors)

In [None]:
review_embedding = review_embedding/300

In [None]:
review_embedding.columns = ["Dimension_"+str(i) for i in range(300)]

In [None]:
# cap_reviews was selected from `reviews` with a boolean mask, so it still
# carries the original row labels, whereas review_embedding is numbered
# 0..n-1. pd.concat(axis=1) matches rows by label, so the embedding is given
# the cap_reviews index first; otherwise vectors would land on the wrong
# reviews and unmatched labels would add NaN-filled rows.
assert len(review_embedding) == len(cap_reviews), "expected one embedding row per review"

review_embedding_aligned = review_embedding.copy()
review_embedding_aligned.index = cap_reviews.index

# Any embedding columns left by a previous run of this cell are dropped first,
# so re-running it does not duplicate them.
cap_reviews = pd.concat(
    [
        cap_reviews.drop(columns=review_embedding_aligned.columns, errors="ignore"),
        review_embedding_aligned,
    ],
    axis=1,
)

In [None]:
# Preview the reviews together with their embedding columns.
cap_reviews.head()