# Record Embedding

In [107]:
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [108]:
# Load the cleaned Amazon book listing. dtype=object turns off pandas' type
# inference, so every column keeps its values as read from the file.
csv_path = 'amazonCleaned.csv'
df = pd.read_csv(csv_path, dtype=object)

In [109]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Name,Author,Format,Publisher,Publishing Date
0,Age of Myth: Book One of The Legends of the Fi...,Michael J. Sullivan,Paperback,Del Rey,"January 31, 2017"
1,Rise of the Dragons (Kings and Sorcerers--Book 1),Morgan Rice,Hardcover,Morgan Rice,"August 4, 2017"
2,The Book of Deacon (Volume 1),Joseph Lallo,Kindle,CreateSpace Independent Publishing Platform,"March 18, 2012"
3,A Quest of Heroes: Book #1 in the Sorcerer's Ring,Morgan Rice,Hardcover,Morgan Rice,"December 3, 2012"
4,Fantasia: An Algerian Cavalcade,Dorothy S. Blair,Kindle,Heinemann; 1 edition,"March 15, 1993"


## Embedding using fastText
Details here: https://fasttext.cc/

In [110]:
import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim.models import FastText

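Convert each row into a tuple of its field values (name, author, publisher, publishing date, format); each tuple is one training record.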

In [111]:
# Pull the five text fields out of the frame as plain Python lists, in the
# order they should appear inside each record.
field_order = ['Name', 'Author', 'Publisher', 'Publishing Date', 'Format']
names, authors, publishers, dates, formats = (
    df[column].tolist() for column in field_order
)

# One tuple per book: (name, author, publisher, publishing date, format).
combined = list(zip(names, authors, publishers, dates, formats))

In [112]:
# Number of records built above (one tuple per DataFrame row).
len(combined)

7604

In [113]:
# Inspect one record; field order is (name, author, publisher, date, format).
combined[0]

('Age of Myth: Book One of The Legends of the First Empire',
 'Michael J. Sullivan',
 'Del Rey',
 'January 31, 2017',
 'Paperback')

In [114]:
# Fit a fastText model on the record tuples. Every tuple acts as one
# "sentence" whose tokens are the whole field strings, so a vector is learned
# for each distinct field value. `sg` is left at its default, which selects
# the CBOW objective; min_count=1 keeps values that occur only once.
# NOTE(review): `iter` is the gensim 3.x spelling of the epoch count; confirm
# the installed gensim version before upgrading.
model_fast = FastText(
    sentences=combined,
    iter=1000,
    min_count=1,
    workers=8,
)

In [115]:
# Print the model's one-line summary (vocab, size, alpha).
print(model_fast)

FastText(vocab=18405, size=100, alpha=0.025)


In [116]:
# Look up the learned vector for one token ('Tor Fantasy'). The lookup goes
# through the KeyedVectors (`.wv`): indexing the model object itself is
# deprecated in gensim 3.x and removed in gensim 4, while `.wv[...]` returns
# the same vector in both.
print(model_fast.wv['Tor Fantasy'])

[ -5.412458     2.991579     3.6252203   -3.4196808   -3.2066329
   0.59749943  -7.8482447   -6.114672    -2.0932353    0.48999414
  -1.4339309   -7.7959886   -4.411901     1.8542625    3.0994992
   5.2746763    8.54603     -4.3231454    2.2175345    5.556088
   0.89939606   0.8524798    2.3908517    7.5518627   -7.0022964
  -4.3341837    8.020583     1.6515157   -4.1988487    7.8810825
  -6.351301    -4.2884765   -1.994357    -1.2601101    0.39927036
  -1.4213387    1.0747572   -5.604467    -1.6319239    5.250094
  -4.0830708    4.6709714   -1.3383621   -6.519044     2.0623097
   3.1602454    0.6102621   -5.0808983    0.19365023   8.531117
  -7.2033005   -6.0466194   -1.9076993    0.32737267   0.8864784
 -10.47367     -2.0826094   -7.7257543    1.3889396   -0.89720356
  -1.4051613    6.665574     3.2960322   -0.7019844    2.4519923
  -0.8111372    0.4013826   -1.9081339   -0.2904086    6.0187597
  -0.48841384  -4.3694572    5.6835113    4.7482963   -3.5096126
  -1.5174493   -7.9249973

In [117]:
# Nearest neighbours of a book title. Queried via `.wv` because the
# model-level `most_similar` shortcut is deprecated in gensim 3.x and removed
# in gensim 4; the KeyedVectors method returns the same ranking.
model_fast.wv.most_similar("Harry Potter And The Goblet Of Fire")

[('Harry Potter And The Order Of The Phoenix', 0.9595365524291992),
 ('Harry Potter And The Chamber Of Secrets', 0.9494017958641052),
 ('Harry Potter And The Philosopher´s Stone (Film): Fantasy, Adventure, Harry Potter And The Philosopher´s Stone, J. K. Rowling, Chris Columbus (Filmmaker), ... Series), Harry Potter (Character), Hogwarts',
  0.8871390223503113),
 ('Robotech: The Masters Saga: The Southern Cross (Vol 7-9)',
  0.8160675168037415),
 ('The Deathly Hallows Lectures: The Hogwarts Professor Explains the Final Harry Potter Adventure',
  0.8022780418395996),
 ('The Cricket in Times Square (Chester Cricket and His Friends)',
  0.7925834655761719),
 ('The Dark Tower III: The Waste Lands', 0.786429762840271),
 ('Thea Stilton', 0.7830697298049927),
 ('Harry Potter and the Deathly Hallows (Book 7)', 0.7822356224060059),
 ('The Ruling Sea (Chathrand Voyage)', 0.7815117835998535)]

In [118]:
# Nearest neighbours of a publisher string. Queried via `.wv` because the
# model-level `most_similar` shortcut is deprecated in gensim 3.x and removed
# in gensim 4; the KeyedVectors method returns the same ranking.
model_fast.wv.most_similar("Bloomsbury")

[('Bloomsbury USA', 0.9679713249206543),
 ('Bloomsbury Sigma', 0.9653311967849731),
 ('Bloomsbury Academic', 0.9589107632637024),
 ('Bloomsbury USA Childrens', 0.8553447723388672),
 ('Bloomsbury Childrens', 0.8333024978637695),
 ('Bloomsbury USA Childrens; Reprint edition', 0.7285604476928711),
 ('Bloomsbury USA Childrens; Reissue edition', 0.7193355560302734),
 ("Bloomsbury Children's Books", 0.6848500370979309),
 ('Bloomsbury USA Childrens; 1st Printing edition', 0.6526287794113159),
 ('Amy Bloom', 0.5729068517684937)]

In [119]:
# Nearest neighbours of a publishing-date string. Queried via `.wv` because
# the model-level `most_similar` shortcut is deprecated in gensim 3.x and
# removed in gensim 4; the KeyedVectors method returns the same ranking.
model_fast.wv.most_similar('March 3, 2015')

[('March 2, 2015', 0.9903461933135986),
 ('March 5, 2015', 0.9897740483283997),
 ('March 13, 2015', 0.9897267818450928),
 ('March 16, 2015', 0.9843902587890625),
 ('March 02, 2015', 0.9828610420227051),
 ('March 24, 2015', 0.982369065284729),
 ('March 12, 2015', 0.9816089868545532),
 ('March 19, 2015', 0.9802409410476685),
 ('March 05, 2015', 0.9796395897865295),
 ('March 10, 2015', 0.9778379201889038)]

In [120]:
# Nearest neighbours of another book title. Queried via `.wv` because the
# model-level `most_similar` shortcut is deprecated in gensim 3.x and removed
# in gensim 4; the KeyedVectors method returns the same ranking.
model_fast.wv.most_similar('The Way of Kings')

[('The Crown of Kings', 0.949828028678894),
 ('The Rings of Kether', 0.9315294027328491),
 ('The Deadly Curse of Toco-Rey', 0.9219759702682495),
 ('The End of Infinity', 0.9177632331848145),
 ('The Kingdom of America', 0.9174216985702515),
 ('The Horn of Time', 0.9123154282569885),
 ('The Heritage of Hastur', 0.9110957980155945),
 ('The Haunting of Maddy Clare', 0.908108651638031),
 ('The Tamuli: Domes of Fire - The Shining Ones - The Hidden City',
  0.9065176844596863),
 ('The Wizard of Oz', 0.9021756649017334)]

In [121]:
# Nearest neighbours of an author name. Queried via `.wv` because the
# model-level `most_similar` shortcut is deprecated in gensim 3.x and removed
# in gensim 4; the KeyedVectors method returns the same ranking.
model_fast.wv.most_similar("Brandon Sanderson")

[('Brandon Mull', 0.8694670796394348),
 ('Carl Anderson', 0.8594995737075806),
 ('Elias Anderson', 0.8510757684707642),
 ('Leroy Anderson', 0.8484483957290649),
 ('D A Anderson', 0.8482165336608887),
 ('Poul Anderson', 0.8454199433326721),
 ('Branded', 0.8395313620567322),
 ('Brandon Rospond', 0.8353612422943115),
 ('Derek Anderson', 0.8353233933448792),
 ('Mal Sanders', 0.8350759744644165)]

In [122]:
# Persist the trained model to the working directory.
# NOTE(review): despite the ".w2v" extension this goes through the model's own
# `save()`, so it presumably has to be read back with `FastText.load` rather
# than a word2vec-format reader; confirm before sharing the file.
model_fast.save("amazonModelFastText.w2v")