In [1]:
import pathlib
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.manifold import TSNE



In [10]:
# Load data
data_path = pathlib.Path().cwd().parent / "data"
print(f"Logging info - Loading data from {data_path}")
df = pd.read_csv(f"{data_path}/data.csv", keep_default_na=False)

Logging info - Loading data from /Users/1150704/Documents/mywork/zip_product_classification/data


In [11]:
df.shape

(125344, 8)

In [4]:
# Create new column that combines both product desc and name
df["name_desc"] = df["name"] + " " + df["desc"]

In [5]:
#The Doc2Vec model takes 'tagged_documents'
#tag the training data
tagged_tr = [
    TaggedDocument(
        words=doc.split(),
        tags=[str(i)]) for i, doc in enumerate(df["name_desc"])
]

In [9]:
#Instantiate the model

model = Doc2Vec(vector_size=100, 
                window=5, #change to 8
                alpha=.025, #initial learning rate
                min_alpha=0.00025, #learning rate drops linearly to this
                min_count=2, #ignores all words with total frequency lower than this.
                dm =1, #algorith 1=distributed memory
                workers=8)#cores to use

#build the vocab on the training data
model.build_vocab(tagged_tr)

In [10]:
#max training epochs
max_epochs = 20

#train n epochs and save the model
t1 = time.time()
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch+1))
    model.train(tagged_tr,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
   
   

print("done!")
t2 = time.time()    
model.save("zipdoc2vec.model")
#print("Model Saved")
print("Time: {}".format(t2-t1))

iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
done!
Time: 2145.6852819919586


In [None]:
# Extract vectors from doc2vec model
X = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)

In [None]:
tsne_results = tsne.fit_transform(X)

In [None]:
plt.figure(figsize=(16,10))
sns.scatterplot(
    x=tsne_results[:,0], y=tsne_results[:,1],
    hue=df["cat0"],
    palette=sns.color_palette("hls", 4),
    data=tsne_results,
    legend="full",
    alpha=0.3
)