In [1]:
import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16,12]

from sklearn import manifold
from itertools import product
from nltk.corpus import stopwords
from scipy.stats import rankdata


In [2]:
# Load the table of books that the fuzzy title search runs against
# (fuzzy_find2 reads its 'title' column).
books = pd.read_csv(filepath_or_buffer='fiction.csv')

In [3]:
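# NOTE: the global `model` used by trim_string and str2mat below is created by the
# three lines in this cell. They are commented out, so on a fresh kernel the later
# cells raise NameError until this cell is uncommented and run (this requires gensim
# and a local copy of the GoogleNews word2vec binary; adjust `filename` to its location).
# from gensim.models import KeyedVectors
# filename = '/Users/weitinglin/Documents/GoogleNews-vectors-negative300.bin'
# model = KeyedVectors.load_word2vec_format(filename, binary=True)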


In [4]:
def trim_string(S):
    '''
    Tokenise a string and strip its low-information words.

    The string is split on runs of non-word characters, lower-cased, and then
    narrowed in three stages:
      1. drop a short list of articles / prepositions,
      2. drop the NLTK English stop words,
      3. drop the words that the global word-vector `model` does not contain.
    A stage is skipped whenever it would remove every remaining word, so a
    string made only of stop words or unknown words still yields its tokens.

    S: the input string
    returns: list of lower-cased tokens; it holds '' only when S contains no
             word characters at all
    '''
    # Split at punctuation or space. The raw string keeps '\W' a regex escape
    # rather than an (invalid) string escape.
    pieces = [p.lower() for p in re.split(r'[\W\s]+', S)]

    # Leading / trailing punctuation leaves '' at the ends of the split; drop
    # those unless nothing else is left (keeps the old result for such input).
    words = [p for p in pieces if p] or pieces

    # Remove "the", "a", "an", ... -- but never empty the list. Without this
    # guard an input such as "The" came back as [].
    nonsense = {"the", "a", "an", "and", "to", "on", "from", "in", "by"}
    kept = [w for w in words if w not in nonsense]
    if kept:
        words = kept

    # Remove more: look the stop-word list up once instead of once per word.
    english_stopwords = set(stopwords.words('english'))
    kept = [w for w in words if w not in english_stopwords]
    if kept:
        words = kept

    # Remove the words that would only map to the placeholder vector.
    # `w in model` is the membership test that works on gensim 3.x and 4.x
    # (the `vocab` attribute no longer exists in 4.x).
    kept = [w for w in words if w in model]
    if kept:
        words = kept

    return words

In [5]:
# Sanity check for trim_string: the hyphen acts as a separator and the
# tokens come back lower-cased.
temp = trim_string('Slaughterhouse-Five')
temp

['slaughterhouse', 'five']

In [7]:
def str2mat(instr, limit = 5, placeholder = None):
    '''
    Convert a string to a matrix holding one word vector per column.

    instr: the input string; it is tokenised with trim_string
    limit: number of columns of the result; words beyond it are ignored
    placeholder: vector used for the words that are not in the model. When
                 None, the sum of the vectors of three unusual vocabulary
                 entries is used.

    returns: (L, sheet) where L is the number of word columns that were filled
             and sheet is a (model.vector_size, limit) array whose remaining
             columns are padded with the constant 2
    '''
    if placeholder is None:
        # NOTE: the three vectors are summed, not averaged (the divisor is 1);
        # left as is so that existing distances do not change.
        ph = (model.get_vector("Ka_wai") + \
              model.get_vector("Chief_Carla_Piluso") + model.get_vector("Marc_Andre_Bergeron"))/1
    else:
        # Previously `ph` stayed undefined on this path and the first unknown
        # word raised UnboundLocalError.
        ph = placeholder

    mystr = trim_string(instr)

    # number of words that fit into the sheet
    L = min(len(mystr), limit)

    ## padding up: columns past L keep the value 2, which compare_mats uses to
    ## recognise padding. The row count follows the loaded model instead of a
    ## hard-coded 300.
    sheet = np.full((model.vector_size, limit), 2.0)
    for col, word in enumerate(mystr[:L]):
        # `word in model` works on gensim 3.x and 4.x (`vocab` is gone in 4.x)
        sheet[:, col] = model.get_vector(word) if word in model else ph

    return L, sheet

In [8]:
def compare_mats(M1, M2 , ph = 2):
    '''
    Distance from the words of M1 (the query) to the words of M2 (the target).

    Every word of M1 is matched with its nearest word of M2 (Euclidean
    distance; one M2 word may serve several M1 words) and the mean of those
    nearest distances is returned. This equals the minimum over all
    assignments of M2 words to M1 words of the mean distance, computed without
    enumerating the assignments. The measure is not symmetric in M1 and M2.

    M1, M2: arrays with one word vector per column, as built by str2mat;
            unused columns are filled with the padding value
    ph: scalar padding value that marks unused columns. Anything that is not
        a scalar (including None, which compare_strs passes by default) falls
        back to 2, the value str2mat pads with.

    returns: the distance as a float; inf when either matrix holds no word
    '''
    sentinel = ph if np.isscalar(ph) else 2

    M1 = np.asarray(M1, dtype=float)
    M2 = np.asarray(M2, dtype=float)

    # trim: a column is padding only if every entry equals the sentinel, so a
    # real vector whose first component happens to equal it is not dropped
    M1_trim = M1[:, ~np.all(M1 == sentinel, axis=0)]
    M2_trim = M2[:, ~np.all(M2 == sentinel, axis=0)]

    # nothing to compare: report "infinitely far" rather than NaN or a
    # ValueError from min() over an empty list
    if M1_trim.shape[1] == 0 or M2_trim.shape[1] == 0:
        return float('inf')

    # pairwise[i, j] = Euclidean distance between word i of M1 and word j of M2
    lin_dist = M1_trim[:, :, None] - M2_trim[:, None, :]
    pairwise = np.sqrt((lin_dist ** 2).sum(axis=0))

    # nearest M2 word for each M1 word, averaged over the M1 words
    return float(pairwise.min(axis=1).mean())

In [9]:
def compare_strs(S1, S2, limit = 5, placeholder = None):
    '''
    Distance between two strings based on their word vectors (smaller means
    more similar). S1 is the query and S2 the target; see compare_mats for how
    the words are matched.

    S1, S2: the strings to compare
    limit: maximum number of words taken from each string
    placeholder: vector standing in for words that are not in the model,
                 forwarded to str2mat (None lets str2mat use its default)

    returns: the value computed by compare_mats
    '''
    # `placeholder` is the out-of-vocabulary vector, so it belongs to str2mat.
    # It used to be passed to compare_mats as `ph`, which is the padding value.
    _, M1 = str2mat(S1, limit = limit, placeholder = placeholder)
    _, M2 = str2mat(S2, limit = limit, placeholder = placeholder)
    # str2mat pads with 2, which is compare_mats' default padding value
    return compare_mats(M1, M2)

In [10]:
# Single-word query: compare_mats keeps the distance to the closest word of the
# target, so a title that contains 'dark' scores 0.
compare_strs('dark', 'Dark tower')

0.0

In [11]:
# Nonsense target: 'ddddddd' is presumably not in the model vocabulary, in which
# case str2mat represents it by its placeholder vector.
compare_strs('dark tower', 'ddddddd')

5.7817500703577505

In [12]:
# Reference point: two multi-word titles that share no word.
compare_strs('dark tower', 'dragonfly in amber')

3.750284740905288

In [14]:
# Different words with a similar meaning; the leading 'the' is removed by
# trim_string before the comparison.
compare_strs('the dark tower', 'black castle')

3.32972583831502

In [15]:
def fuzzy_find2(mytitle, shelf, maxshow = 10, threshhold = 5):
    '''
    Find the titles on a shelf that are close to a user-supplied keyword.

    mytitle: the user input keyword for fuzzy search
    shelf: df with column named 'title', find book from
    maxshow: the max. number of result return.
    threshhold: threshhold of similarity for the "match"; only titles whose
                distance to mytitle is below it are returned

    returns: (titles, distances) -- the matching rows of shelf['title'], in
             shelf order, and an array with their distances. When more than
             maxshow titles match, only the maxshow closest ones are kept.
    '''
    titles = shelf["title"]
    dist = np.array([compare_strs(mytitle, s) for s in titles], dtype=float)

    # positions (not index labels) of the titles below the threshold
    fuzzy = np.where(dist < threshhold)[0]
    if len(fuzzy) > maxshow:
        # Keep exactly the maxshow closest matches. Ranking only the matches
        # with a stable sort keeps the cap exact when distances tie and is not
        # affected by NaN distances of non-matching titles.
        closest = np.argsort(dist[fuzzy], kind='stable')[:maxshow]
        fuzzy = np.sort(fuzzy[closest])

    # .iloc: `fuzzy` holds positions, which differ from the labels whenever
    # shelf does not carry the default 0..n-1 index
    return titles.iloc[fuzzy], dist[fuzzy]

In [16]:
# Look up a misremembered title; the cell displays the (titles, distances)
# pair returned by fuzzy_find2.
fuzzy_find2('butterfly in resin', books, maxshow=5)

(14                                      It
 26    The Brief Wondrous Life of Oscar Wao
 32                    Lincoln in the Bardo
 38                        The Tuscan Child
 64                     Dragonfly in Amber 
 Name: title, dtype: object,
 array([3.61952804, 3.44184083, 3.45750671, 3.52358933, 3.57965192]))