In [1]:
import warnings
warnings.filterwarnings('ignore')

from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

import pandas as pd
import numpy as np
from enum import Enum

Using TensorFlow backend.


In [2]:
df = pd.read_csv("amazonCleaned.csv")

In [3]:
df.head()

Unnamed: 0,Name,Author,Format,Publisher,Publishing Date
0,Age of Myth: Book One of The Legends of the Fi...,Michael J. Sullivan,Paperback,Del Rey,"January 31, 2017"
1,Rise of the Dragons (Kings and Sorcerers--Book 1),Morgan Rice,Hardcover,Morgan Rice,"August 4, 2017"
2,The Book of Deacon (Volume 1),Joseph Lallo,Kindle,CreateSpace Independent Publishing Platform,"March 18, 2012"
3,A Quest of Heroes: Book #1 in the Sorcerer's Ring,Morgan Rice,Hardcover,Morgan Rice,"December 3, 2012"
4,Fantasia: An Algerian Cavalcade,Dorothy S. Blair,Kindle,Heinemann; 1 edition,"March 15, 1993"


In [4]:
path = 'amazonModelWord2Vec.w2v'
word2vecModel = KeyedVectors.load(path)

In [74]:
df[df['Author'] == 'J.K. Rowling']

Unnamed: 0,Name,Author,Format,Publisher,Publishing Date
254,Harry Potter And The Order Of The Phoenix,J.K. Rowling,Paperback,Scholastic Paperbacks,"September 1, 2004"
704,Harry Potter and the Sorcerer's Stone,J.K. Rowling,Hardcover,Scholastic; 1st Edition edition,September 1998
718,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,Kindle,Scholastic Paperbacks,"October 1, 2001"
719,Harry Potter And The Goblet Of Fire,J.K. Rowling,Hardcover,Scholastic Paperbacks,"September 1, 2002"
740,Harry Potter and the Half-Blood Prince (Book 6),J.K. Rowling,Hardcover,Scholastic Paperbacks; Reprint edition,"July 25, 2006"
843,"Harry Potter and the Cursed Child, Parts One a...",J.K. Rowling,Hardcover,Arthur A. Levine Books,"July 25, 2017"


Define attributes

In [7]:
class Attributes(Enum):
    NAME = 1
    AUTHOR = 2
    FORMAT = 3
    PUBLISHER = 4
    PUBLISHING_DATE = 5

In [17]:
attributeMapping = {}
attributeMapping["Name"] = Attributes.NAME
attributeMapping["Author"] = Attributes.AUTHOR
attributeMapping["Format"] = Attributes.FORMAT
attributeMapping["Publisher"] = Attributes.PUBLISHER
attributeMapping["Publishing Date"] = Attributes.PUBLISHING_DATE

In [18]:
attributeMapping

{'Author': <Attributes.AUTHOR: 2>,
 'Format': <Attributes.FORMAT: 3>,
 'Name': <Attributes.NAME: 1>,
 'Publisher': <Attributes.PUBLISHER: 4>,
 'Publishing Date': <Attributes.PUBLISHING_DATE: 5>}

In [19]:
mapping = {0: 'Author', 1: 'Format', 2: 'Name', 3: 'Publisher', 4: 'Publishing Date'}

**Load model**

In [10]:
model = load_model('AmazonClassifier.h5')

**Load Tokenizer**

In [11]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [56]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=15):
    """
    Returns the closest match for the missing attribute value
    """
    output = dict()
    for value in rowWithMissingValue:
        results = word2vecModel.wv.most_similar(value, topn=topN)
        for match, confidence in results:
            # Predicted type to be equal to the missing value
            if predictAttribute(model, tokenizer, match) == missingType:
                if match in output and confidence <= output[match]:
                    continue
                else:
                    output[match] = confidence
    return output.items()

In [80]:
def predictAttribute(model, tokenizer, value):
    """
    Classifies value parameter as its corresponding attribute
    E.g: 'J.K Rowling' -> 'Author'
    """
    value = [value]
    sequences = tokenizer.texts_to_sequences(value)
    testData = pad_sequences(sequences, maxlen=300)
    predictions = model.predict(testData)
    return attributeMapping[mapping[np.argmax(predictions[0])]]

In [81]:
row = ["Harry Potter And The Goblet Of Fire",
       "J.K. Rowling", 
       "Hardcover", 
       "Scholastic Paperbacks", 
       "September 1, 2002"]

**Missing Name**

In [85]:
missingRow = ["J.K. Rowling", 
               "Hardcover", 
               "Scholastic Paperbacks", 
               "September 1, 2002"]

results = impute(word2vecModel, model, tokenizer, missingRow, Attributes.NAME, 5)
results = sorted(results, key=lambda x: x[1], reverse=True)
for i in results:
    print(i)

('Harry Potter And The Goblet Of Fire', 0.990523099899292)
("Harry Potter and the Sorcerer's Stone", 0.9517468810081482)
("Baby-sitters' Island Adventure", 0.9098972082138062)
("Ms. Frizzle's Adventures: Ancient Egypt", 0.889580488204956)
('Over A Spitfire', 0.4875938892364502)
('Final Fantasy VII: Ultimate Strategy Guide : Unofficial (The Final Fantasy Series)', 0.4871237874031067)


<font color='green'>**Name -> Imputation Success**</font>

**Missing Author**

In [89]:
missingRow = ['Harry Potter And The Goblet Of Fire',
              "Hardcover", 
              "Scholastic Paperbacks", 
              "September 1, 2002"]

results = impute(word2vecModel, model, tokenizer, missingRow, Attributes.AUTHOR, 5)
results = sorted(results, key=lambda x: x[1], reverse=True)
for i in results:
    print(i)

('Ann M. Martin', 0.9006907343864441)
('Joanna Cole', 0.8923073410987854)
('Ron Wartow', 0.4968920350074768)
('Dan Birlrw', 0.4789363741874695)


<font color='red'>**Author -> Imputation Failed**</font>

**Missing Format**

In [96]:
missingRow = ['Harry Potter And The Goblet Of Fire',
              "J.K. Rowling", 
              "Scholastic Paperbacks", 
              "September 1, 2002"]

results = impute(word2vecModel, model, tokenizer, missingRow, Attributes.FORMAT, 5)
results = sorted(results, key=lambda x: x[1], reverse=True)
for i in results:
    print(i)

<font color='red'>**Format -> Imputation Failed**</font>

**Missing Publisher**

In [93]:
missingRow = ['Harry Potter And The Goblet Of Fire',
              "J.K. Rowling", 
              "Hardcover",
              "September 1, 2002"]

results = impute(word2vecModel, model, tokenizer, missingRow, Attributes.PUBLISHER, 5)
results = sorted(results, key=lambda x: x[1], reverse=True)
for i in results:
    print(i)

('Scholastic; 1st Edition edition', 0.9685139060020447)
('Sybex Inc', 0.4987725615501404)


<font color='green'>**Publisher -> Imputation Success**</font>

**Missing Publishing Date**

In [95]:
missingRow = ['Harry Potter And The Goblet Of Fire',
              "J.K. Rowling", 
              "Hardcover",
              "Scholastic Paperbacks"]

results = impute(word2vecModel, model, tokenizer, missingRow, Attributes.PUBLISHING_DATE, 5)
results = sorted(results, key=lambda x: x[1], reverse=True)
for i in results:
    print(i)

('September 1, 2002', 0.990523099899292)
('September 1998', 0.9687081575393677)
('September 1, 2004', 0.9233202338218689)
('July 01, 1990', 0.9001783728599548)


<font color='green'>**Publishing Date -> Imputation Success**</font>