# Record Embedding

In [3]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

## Read Data
We will be using a dataset of book records that I scraped from Amazon. It contains about 7,600 rows (tuples) — `len(combined)` below reports 7604 — with fields of various kinds: title, author, format, publisher, and publishing date.

In [4]:
# Load the cleaned Amazon book records. dtype=object turns off pandas' type
# inference, so every column is kept as generic Python objects rather than
# being parsed into numbers.
df = pd.read_csv(
    "amazonCleaned.csv",
    encoding='utf8',
    dtype=object,
)

In [5]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Name,Author,Format,Publisher,Publishing Date
0,Age of Myth: Book One of The Legends of the Fi...,Michael J. Sullivan,Paperback,Del Rey,"January 31, 2017"
1,Rise of the Dragons (Kings and Sorcerers--Book 1),Morgan Rice,Hardcover,Morgan Rice,"August 4, 2017"
2,The Book of Deacon (Volume 1),Joseph Lallo,Kindle,CreateSpace Independent Publishing Platform,"March 18, 2012"
3,A Quest of Heroes: Book #1 in the Sorcerer's Ring,Morgan Rice,Hardcover,Morgan Rice,"December 3, 2012"
4,Fantasia: An Algerian Cavalcade,Dorothy S. Blair,Kindle,Heinemann; 1 edition,"March 15, 1993"


## Exploration using the Gensim library
Details here: https://radimrehurek.com/gensim/

In [6]:
import warnings
# Suppress every warning for the rest of the session. The filter is installed
# before gensim is imported, so warnings raised during that import are hidden
# as well.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used below — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# Word2Vec is imported by name; the gensim package itself is also imported
# because the training cell reaches gensim.models.Phrases through it.
from gensim.models import Word2Vec
import gensim

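Convert each row of the DataFrame into a tuple of its field values. Each tuple acts as one "sentence" for the model, and every whole field value (title, author, publisher, date, format) is treated as a single token.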

In [7]:
# Extract the five fields as plain Python lists, one list per column.
field_columns = ['Name', 'Author', 'Publisher', 'Publishing Date', 'Format']
names, authors, publishers, dates, formats = (
    df[column].tolist() for column in field_columns
)

# Re-assemble the columns row-wise: one
# (name, author, publisher, date, format) tuple per record.
combined = [*zip(names, authors, publishers, dates, formats)]

In [8]:
# Number of record tuples, i.e. one per row of df.
len(combined)

7604

In [9]:
# Inspect the first record tuple to confirm the field order
# (name, author, publisher, date, format).
combined[0]

('Age of Myth: Book One of The Legends of the First Empire',
 'Michael J. Sullivan',
 'Del Rey',
 'January 31, 2017',
 'Paperback')

In [10]:
# train model
bigram_transformer = gensim.models.Phrases(combined, delimiter=b' ')
model = Word2Vec(bigram_transformer[combined], sg=1, min_count=1, workers=8, iter=1000)

In [18]:
# Print the model summary (vocabulary size, vector size, learning rate).
print(model)

Word2Vec(vocab=18473, size=100, alpha=0.025)


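Each token in the vocabulary is represented by a 100-dimensional vector (`size=100` in the summary above; no size was passed when training, so this is the library default).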

In [19]:
# Show the learned embedding for the token 'Tor Fantasy'.
# The vectors are read through the model's KeyedVectors (`model.wv`):
# indexing the Word2Vec object directly is deprecated in gensim 3.x and
# removed in gensim 4.0, whereas `model.wv[...]` works on both.
print(model.wv['Tor Fantasy'])

[ 0.67516285 -0.14670508 -0.4897074   0.77409965  0.38929275 -0.4906028
  0.43397602 -0.06233988  0.77981013  0.5924976  -0.8089933  -0.61580753
 -1.4347706  -0.46481085 -0.25025898  0.5557685  -1.2789061   0.18085238
 -1.1410983   1.0432794   0.01697883 -0.7970075  -1.7878596   1.4433059
  0.2528082   0.71720153 -0.12724243 -1.5391577   0.9672115   0.7647959
 -1.1338452   1.0273381   0.5251662  -1.356639   -0.47810143  1.3329961
 -0.3818578  -0.2344442  -0.13924609  0.53028697 -0.6666167  -0.10410766
 -0.78360194 -0.5907514   0.6463207  -0.8608699   1.0243403   0.97872686
 -0.10898726 -0.45420325  1.060158    0.16182332  0.8398038  -0.7350409
 -0.579689    0.44014037 -0.25801885  0.11126943 -0.53362936  1.02085
  0.5980683   0.5375461   0.32991734  0.38085702 -1.3669183  -0.04034464
 -1.4376628  -1.8168751  -0.25210255 -0.1828292  -0.0244973  -0.9850758
  1.2847495   1.336925    0.8740314  -0.28057635  0.47792712 -1.2412843
  0.9257329   0.76902884  0.00628767 -0.32869408 -0.7335433  

In [20]:
# Nearest neighbours of the token 'Tor Fantasy' by cosine similarity.
# Queried through `model.wv` because `Word2Vec.most_similar` is deprecated in
# gensim 3.x and removed in gensim 4.0; the KeyedVectors method works on both.
model.wv.most_similar("Tor Fantasy")

[('Katherine Addison', 0.7462049722671509),
 ('The Goblin Emperor', 0.7333511114120483),
 ('Seventh Son (Tales of Alvin Maker, Book 1)', 0.7288691997528076),
 ('September 15, 1996', 0.7274661064147949),
 ('Alvin Journeyman (Tales of Alvin Maker, Book 4)', 0.7251750826835632),
 ('June 15, 1993', 0.7179016470909119),
 ('October 15, 1992', 0.7120752930641174),
 ('The Dinosaur Princess (The Dinosaur Lords)', 0.7049572467803955),
 ('House of Chains (The Malazan Book of the Fallen, Book 4)',
  0.703456699848175),
 ('The Dragon Reborn (The Wheel of Time, Book 3)', 0.7030671238899231)]

In [21]:
# Nearest neighbours of the date token 'March 3, 2015' by cosine similarity.
# Queried through `model.wv` because `Word2Vec.most_similar` is deprecated in
# gensim 3.x and removed in gensim 4.0; the KeyedVectors method works on both.
model.wv.most_similar('March 3, 2015')

[('Borrowed Crime (A Bookmobile Cat Mystery)', 0.8873239755630493),
 ('The Goblin Emperor', 0.862354040145874),
 ('Skin Game (Dresden Files)', 0.8621822595596313),
 ('Katherine Addison', 0.8594257831573486),
 ("The Assassin's Blade: The Throne of Glass Novellas", 0.8532944917678833),
 ('Laurie Cass', 0.8500100374221802),
 ('Words of Radiance: Book Two of the Stormlight Archive', 0.8349869847297668),
 ('Bloomsbury USA Childrens; Reprint edition', 0.7920763492584229),
 ('Pouncing on Murder (A Bookmobile Cat Mystery)', 0.7877581119537354),
 ('Heir of Fire (Throne of Glass)', 0.7686179876327515)]

In [22]:
# Nearest neighbours of the title token 'The Way of Kings' by cosine
# similarity. Queried through `model.wv` because `Word2Vec.most_similar` is
# deprecated in gensim 3.x and removed in gensim 4.0; the KeyedVectors method
# works on both.
model.wv.most_similar('The Way of Kings')

[('The Well of Ascension (Mistborn, Book 2)', 0.9306834936141968),
 ('The Hero of Ages: Book Three of Mistborn', 0.9146141409873962),
 ('May 24, 2011', 0.9066956043243408),
 ('Return of the Crimson Guard: A Novel of the Malazan Empire (Novels of the Malazan Empire)',
  0.8936777114868164),
 ('The Alloy of Law: A Mistborn Novel', 0.8878645896911621),
 ('Shadows of Self: A Mistborn Novel', 0.8840540647506714),
 ('Anno Dracula', 0.862244725227356),
 ('Towers of Midnight (Wheel of Time)', 0.858672022819519),
 ('Kim Newman', 0.8546329736709595),
 ('Alden Bell', 0.8533525466918945)]

In [23]:
# Persist the trained model to disk, using a path relative to the working
# directory.
model.save("amazonModelWord2Vec.w2v")