In [78]:
import numpy as np
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [16,12]

# from sklearn import manifold
from itertools import product, combinations
from nltk.corpus import stopwords
# from scipy.stats import rankdata


In [2]:
# read
books = pd.read_csv('fiction.csv')

In [79]:
from gensim.models import KeyedVectors, Phrases
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
filename = '/Users/weitinglin/Documents/GoogleNews-vectors-negative300.bin'
# load the word vectors stored in binary word2vec format
model = KeyedVectors.load_word2vec_format(filename, binary=True)


In [5]:
# partial model
# preload
# data base
# machine
# vec = model.vectors


# len(model.vocab)

In [8]:
def trim_string(S):
    '''
    Split a title into lower-case word tokens and drop uninformative ones.

    The string is split on runs of non-word characters, lower-cased, and then
    filtered in stages:
      1. empty tokens left by the split are dropped, unless nothing else exists;
      2. a small hard-coded list of articles/prepositions is always removed;
      3. NLTK English stop words are removed, unless that would leave nothing;
      4. words missing from model.vocab are removed, unless that would leave
         nothing.

    S: the input string.
    Returns a list of lower-case tokens. The list can be empty, e.g. when S
    consists only of words from the hard-coded list ("the", "in the", ...).
    '''
    # split at punctuation or space (raw string: '\W' is not a valid str escape)
    tokens = [s.lower() for s in re.split(r'[\W\s]+', S)]

    # re.split leaves '' at the ends when S starts/ends with a separator;
    # drop those artefacts so they are not counted as words
    non_empty = [t for t in tokens if t]
    if non_empty:
        tokens = non_empty

    # Remove "the", "a", "an", ...
    nonsense = {"the", "a", "an", "and", "to", "on", "from", "in", "by"}
    tokens = [w for w in tokens if w not in nonsense]

    # remove more: load the stop-word list once instead of once per token
    stop_words = set(stopwords.words('english'))
    informative = [w for w in tokens if w not in stop_words]
    if informative:
        tokens = informative

    # remove words that would only map to the placeholder vector
    known = [w for w in tokens if w in model.vocab]
    if known:
        tokens = known

    return tokens

In [9]:
temp = trim_string('Slaughterhouse-Five')
temp

['slaughterhouse', 'five']

In [10]:
def str2mat(instr, limit = 5, placeholder = None):
    '''
    Convert a string to a 300 x limit matrix with one word vector per column.

    instr: the input string; it is tokenised with trim_string.
    limit: maximum number of words kept; also the number of columns returned.
    placeholder: 300-d vector used for words that are not in model.vocab.
                 Defaults to a vector built from three unusual vocabulary
                 entries.

    Returns (L, sheet): L is the number of words actually used and sheet is
    the matrix whose first L columns are word vectors. The remaining columns
    are padded with the constant 2, which compare_mats reads as "no word".
    '''
    if placeholder is None:
        # default place-holder built from three strange words.
        # NOTE(review): dividing by 1 makes this their sum, not their mean;
        # kept as is because changing it shifts every out-of-vocabulary distance.
        ph = (model.get_vector("Ka_wai") + \
              model.get_vector("Chief_Carla_Piluso") + model.get_vector("Marc_Andre_Bergeron"))/1
    else:
        # without this branch `ph` stayed undefined and the first
        # out-of-vocabulary word raised NameError
        ph = placeholder

    mystr = trim_string(instr)

    # number of words
    L = min(len(mystr), limit)

    ## padding up
    sheet = np.ones((300, limit)) * 2
    for l in range(L):
        word = mystr[l]
        if word in model.vocab:
            sheet[:, l] = model.get_vector(word)
        else:
            sheet[:, l] = ph

    return L, sheet

In [70]:
def compare_mats(M1, M2 , ph = 2, stress = 0.2, penalty = 0.5):
    '''
    Distance between two word-vector sheets produced by str2mat.

    M1: sheet of the query string, one word vector per column.
    M2: sheet of the candidate string, same layout.
    ph: not used; accepted only so existing callers keep working. Padding
        columns are recognised by the hard-coded value 2 in their first row.
    stress: relative surcharge (mean * (1 + stress)) applied to assignments
            that reuse or reorder the words of M2.
    penalty: weight of the extra cost added when M2 has more words than M1.

    Returns the smallest mean Euclidean distance over the ways of assigning
    one M2 word to every M1 word, plus the length penalty. Returns np.inf
    when either sheet contains no words.
    '''
    # number of real (non-padding) columns in each sheet
    L1 = int(np.sum(M1[0, :] != 2))
    L2 = int(np.sum(M2[0, :] != 2))

    # Nothing to compare. Without this guard the result was nan or a
    # ValueError from min([]); inf never passes a "< threshold" test.
    if L1 == 0 or L2 == 0:
        return np.inf

    # pairwise[i, j] = Euclidean distance between word i of M1 and word j of
    # M2, computed once instead of inside every candidate assignment
    diff = M1[:, :L1, None] - M2[:, None, :L2]
    pairwise = np.sqrt((diff ** 2).sum(axis=0))

    if L1 == 1:
        # single query word: distance to its closest word in M2
        dist = pairwise[0].min()
    elif L2 == 1:
        # single candidate word: average distance over all query words
        dist = pairwise[:, 0].mean()
    else:
        # strictly increasing index tuples = M2 words used once and in order
        in_order = set(combinations(range(L2), L1))
        rows = np.arange(L1)
        dist = np.inf
        # every way to pick one M2 word for each M1 word
        for ind in product(range(L2), repeat=L1):
            mean_euc = pairwise[rows, list(ind)].mean()
            if ind not in in_order:
                mean_euc = mean_euc * (stress + 1)
            dist = min(dist, mean_euc)
    # penalty for unequal list
    if L2 > L1:
        dist = dist + ((L2 - L1)/(L1 + L2) * penalty)
    return dist

In [71]:
def compare_strs(S1, S2, limit = 5, placeholder = None):
    '''
    Distance between two strings: each one is turned into a sheet of word
    vectors by str2mat and the two sheets are scored with compare_mats.

    S1, S2: the strings to compare.
    limit: maximum number of words used from each string.
    placeholder: handed to compare_mats as its ph argument only; str2mat is
                 called with its default placeholder.
    '''
    sheets = [str2mat(text, limit = limit)[1] for text in (S1, S2)]
    return compare_mats(sheets[0], sheets[1], ph = placeholder)

In [72]:
compare_strs('dark', 'Dark tower')

0.16666666666666666

In [73]:
compare_strs('dark tower', 'ddddddd')

5.7817500703577505

In [74]:
compare_strs('dark tower', 'dragonfly in amber')# 3.75

4.131742691043174

In [75]:
compare_strs('the dark tower', 'black castle')

3.32972583831502

In [76]:
def fuzzy_find2(mytitle, shelf, maxshow = 10, threshhold = 5):
    '''
    Fuzzy-search book titles by word-vector distance.

    mytitle: the user input keyword for fuzzy search
    shelf: df with column named 'title', find book from
    maxshow: the max. number of results returned
    threshhold: a title matches when its distance to mytitle is below this

    Returns (titles, distances): the matching rows of shelf["title"], in
    shelf order, and the array of their distances. When more than maxshow
    titles match, only the maxshow closest ones are kept.
    '''
    dist = np.array([compare_strs(mytitle, s) for s in shelf["title"]])

    fuzzy = np.where(dist < threshhold)[0]
    if len(fuzzy) > maxshow:
        # Keep the maxshow closest matches. A stable argsort replaces
        # rankdata(..., method='min'): tied ranks could return more than
        # maxshow rows, and rankdata is not imported in this section.
        closest = np.argsort(dist[fuzzy], kind='stable')[:maxshow]
        fuzzy = np.sort(fuzzy[closest])

    # positional lookup, so a shelf with a non-default index also works
    return shelf["title"].iloc[fuzzy], dist[fuzzy]

In [77]:
fuzzy_find2('butterfly in resin', books, maxshow=5)


(10                 See Me
 14                     It
 54               Still Me
 64    Dragonfly in Amber 
 93           We Are Water
 Name: title, dtype: object,
 array([3.78670202, 3.61952804, 3.76593499, 3.57965192, 3.8536134 ]))

In [82]:
fuzzy_find2('Dark castle', books, maxshow=6)

(14                             It
 48               The Dark Tower I
 54                       Still Me
 57    All the Light We Cannot See
 59        The Women in the Castle
 73             At Home in Mitford
 Name: title, dtype: object,
 array([3.08585741, 1.96243112, 3.17527289, 3.19249795, 1.70018074,
        3.21407874]))

## to do
bring me to the link

## notes:
update str2mat so the order matters

## Build a smaller model
#### 1. get a list of common words

In [80]:

# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
file = "/Users/weitinglin/Documents/google-20000-english-usa.txt"
with open(file, 'r') as f:
    x = f.readlines()

# one word per line; blank lines are skipped
common = [w for w in map(str.strip, x) if w != ""]

In [4]:
# file = "/Users/weitinglin/Documents/wiki-100k.txt"
# with open(file, 'r') as f:
#     x = f.readlines()

# morecommon = [w.strip() for w in x if w.strip() != ""]

### make my own dictionary

In [None]:
fiction = pd.read_csv('fiction_500.csv')

In [None]:
def make_dictionary(books):
    '''
    Collect the set of distinct lower-case words used in a collection of titles.

    books: iterable of title strings (e.g. the 'title' column of a DataFrame).
           Passing a whole DataFrame iterates over its column names instead.
           Non-string entries such as NaN are skipped.
    Returns a set of words; the empty string that re.split leaves at the
    beginning or end of a title is not included.
    '''
    words = set()
    for b in books:
        if not isinstance(b, str):
            continue  # e.g. NaN for a missing title
        # split at punctuation or space; updating a set avoids rebuilding a
        # growing list for every title
        words.update(re.split(r'[\W\s]+', b.lower()))
    words.discard('')
    return words

In [97]:
dict1 = list(make_dictionary(fiction['title']))

{'title'}

In [100]:
more_common = set(common + dict1)

In [101]:
### make a dictionary
# word -> vector, for every candidate word that the full model knows
common_dict = {w: model.get_vector(w) for w in more_common if w in model.vocab}

In [102]:
import dill
# write through a context manager so the file is flushed and closed
with open("morecommon_dict.pkl", "wb") as f:
    dill.dump(common_dict, f)

In [None]:
# everything above only has to be run once

In [38]:
# c_dict = dill.load( open("common_dict.pkl","rb"))

## mock up the web app

In [103]:
import numpy as np
import pandas as pd
import string
import re
import dill
# import matplotlib.pyplot as plt
# %matplotlib inline
# plt.rcParams["figure.figsize"] = [16,12]

# from sklearn import manifold
from itertools import product, combinations
from nltk.corpus import stopwords
from scipy.stats import rankdata

In [105]:
# 20k + my_dict
# NOTE(review): unpickling executes arbitrary code -- only load files created
# by this notebook.
with open("morecommon_dict.pkl", "rb") as f:
    c_dict = dill.load(f)
# c_dict = dill.load( open("common_dict.pkl","rb"))
vocab = c_dict.keys()
books = pd.read_csv('fiction.csv')

In [106]:
len(vocab)

18883

### the string trimming function

In [107]:
def trim_string(S):
    '''
    Split a title into lower-case word tokens and drop uninformative ones.

    The string is split on runs of non-word characters, lower-cased, and then
    filtered in stages:
      1. empty tokens left by the split are dropped, unless nothing else exists;
      2. a small hard-coded list of articles/prepositions is always removed;
      3. NLTK English stop words are removed, unless that would leave nothing;
      4. words missing from vocab are removed, unless that would leave nothing.

    S: the input string.
    Returns a list of lower-case tokens. The list can be empty, e.g. when S
    consists only of words from the hard-coded list ("the", "in the", ...).
    '''
    # split at punctuation or space (raw string: '\W' is not a valid str escape)
    tokens = [s.lower() for s in re.split(r'[\W\s]+', S)]

    # re.split leaves '' at the ends when S starts/ends with a separator;
    # drop those artefacts so they are not counted as words
    non_empty = [t for t in tokens if t]
    if non_empty:
        tokens = non_empty

    # Remove "the", "a", "an", ...
    nonsense = {"the", "a", "an", "and", "to", "on", "from", "in", "by"}
    tokens = [w for w in tokens if w not in nonsense]

    # remove more: load the stop-word list once instead of once per token
    stop_words = set(stopwords.words('english'))
    informative = [w for w in tokens if w not in stop_words]
    if informative:
        tokens = informative

    # remove words that would only map to the placeholder vector
    known = [w for w in tokens if w in vocab]
    if known:
        tokens = known

    return tokens

In [108]:
temp = trim_string('Slaughterhouse-Five')
temp

['slaughterhouse', 'five']

In [109]:
trim_string('dragonfly in amber')

['dragonfly', 'amber']

In [110]:
len(vocab)

18883

In [111]:
c_dict['banana'].shape

(300,)

In [112]:
def str2mat(instr, limit = 5, placeholder = None):
    '''
    Convert a string to a 300 x limit matrix with one word vector per column.

    instr: the input string; it is tokenised with trim_string.
    limit: maximum number of words kept; also the number of columns returned.
    placeholder: 300-d vector used for words that are not in vocab.
                 Defaults to a constant vector of 3s.

    Returns (L, sheet): L is the number of words actually used and sheet is
    the matrix whose first L columns are word vectors. The remaining columns
    are padded with the constant 2, which compare_mats reads as "no word".
    '''
    if placeholder is None:
        # default place-holder: constant 3s (distinct from the padding value 2)
        ph = np.ones(300) * 3
    else:
        # without this branch `ph` stayed undefined and the first
        # out-of-vocabulary word raised NameError
        ph = placeholder

    mystr = trim_string(instr)

    # number of words
    L = min(len(mystr), limit)

    ## padding up
    sheet = np.ones((300, limit)) * 2
    for l in range(L):
        word = mystr[l]
        if word in vocab:
            sheet[:, l] = c_dict[word]
        else:
            sheet[:, l] = ph

    return L, sheet

In [113]:
str2mat('dragonfly in amber')

(2, array([[ 3.44238281e-02,  1.77764893e-03,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00],
        [ 8.98437500e-02, -3.51562500e-02,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00],
        [-2.83203125e-01,  1.12304688e-01,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00],
        ...,
        [-5.51757812e-02,  1.16699219e-01,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00],
        [-4.98046875e-02,  2.08007812e-01,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00],
        [-2.45666504e-03, -1.25000000e-01,  2.00000000e+00,
          2.00000000e+00,  2.00000000e+00]]))

In [114]:
def compare_mats(M1, M2 , ph = 2, stress = 0.2, penalty = 0.5):
    '''
    Distance between two word-vector sheets produced by str2mat.

    M1: sheet of the query string, one word vector per column.
    M2: sheet of the candidate string, same layout.
    ph: not used; accepted only so existing callers keep working. Padding
        columns are recognised by the hard-coded value 2 in their first row.
    stress: relative surcharge (mean * (1 + stress)) applied to assignments
            that reuse or reorder the words of M2.
    penalty: weight of the extra cost added when M2 has more words than M1.

    Returns the smallest mean Euclidean distance over the ways of assigning
    one M2 word to every M1 word, plus the length penalty. Returns np.inf
    when either sheet contains no words.
    '''
    # number of real (non-padding) columns in each sheet
    L1 = int(np.sum(M1[0, :] != 2))
    L2 = int(np.sum(M2[0, :] != 2))

    # Nothing to compare. Without this guard the result was nan or a
    # ValueError from min([]); inf never passes a "< threshold" test.
    if L1 == 0 or L2 == 0:
        return np.inf

    # pairwise[i, j] = Euclidean distance between word i of M1 and word j of
    # M2, computed once instead of inside every candidate assignment
    diff = M1[:, :L1, None] - M2[:, None, :L2]
    pairwise = np.sqrt((diff ** 2).sum(axis=0))

    if L1 == 1:
        # single query word: distance to its closest word in M2
        dist = pairwise[0].min()
    elif L2 == 1:
        # single candidate word: average distance over all query words
        dist = pairwise[:, 0].mean()
    else:
        # strictly increasing index tuples = M2 words used once and in order
        in_order = set(combinations(range(L2), L1))
        rows = np.arange(L1)
        dist = np.inf
        # every way to pick one M2 word for each M1 word
        for ind in product(range(L2), repeat=L1):
            mean_euc = pairwise[rows, list(ind)].mean()
            if ind not in in_order:
                mean_euc = mean_euc * (stress + 1)
            dist = min(dist, mean_euc)
    # penalty for unequal list
    if L2 > L1:
        dist = dist + ((L2 - L1)/(L1 + L2) * penalty)
    return dist

In [115]:
def compare_strs(S1, S2, limit = 5, placeholder = None):
    '''
    Distance between two strings: each one is turned into a sheet of word
    vectors by str2mat and the two sheets are scored with compare_mats.

    S1, S2: the strings to compare.
    limit: maximum number of words used from each string.
    placeholder: handed to compare_mats as its ph argument only; str2mat is
                 called with its default placeholder.
    '''
    sheets = [str2mat(text, limit = limit)[1] for text in (S1, S2)]
    return compare_mats(sheets[0], sheets[1], ph = placeholder)

In [116]:
def fuzzy_find2(mytitle, shelf, maxshow = 10, threshhold = 5):
    '''
    Fuzzy-search book titles by word-vector distance.

    mytitle: the user input keyword for fuzzy search
    shelf: df with column named 'title', find book from
    maxshow: the max. number of results returned
    threshhold: a title matches when its distance to mytitle is below this

    Returns a list of the matching titles in shelf order. When more than
    maxshow titles match, only the maxshow closest ones are kept.
    '''
    dist = np.array([compare_strs(mytitle, s) for s in shelf["title"]])

    fuzzy = np.where(dist < threshhold)[0]
    if len(fuzzy) > maxshow:
        # Keep the maxshow closest matches. A stable argsort replaces
        # rankdata(..., method='min'), whose tied ranks could return more
        # than maxshow rows.
        closest = np.argsort(dist[fuzzy], kind='stable')[:maxshow]
        fuzzy = np.sort(fuzzy[closest])

    # positional lookup, so a shelf with a non-default index also works
    return list(shelf["title"].iloc[fuzzy])

In [117]:
b = fuzzy_find2('Butterfly in resin', books, maxshow=5)

In [118]:
type(b)

list

In [119]:
b

['See Me', 'It', 'Still Me', 'Dragonfly in Amber ', 'We Are Water']

## todo
* data-specific dictionary
* trim ()/audio collection
* penalty to stopwords


In [120]:
fiction = pd.read_csv('fiction_500.csv')

In [121]:
def make_dictionary(books):
    '''
    Collect the set of distinct lower-case words used in a collection of titles.

    books: iterable of title strings (e.g. the 'title' column of a DataFrame).
           Passing a whole DataFrame iterates over its column names instead.
           Non-string entries such as NaN are skipped.
    Returns a set of words; the empty string that re.split leaves at the
    beginning or end of a title is not included.
    '''
    words = set()
    for b in books:
        if not isinstance(b, str):
            continue  # e.g. NaN for a missing title
        # split at punctuation or space; updating a set avoids rebuilding a
        # growing list for every title
        words.update(re.split(r'[\W\s]+', b.lower()))
    words.discard('')
    return words

In [122]:
# pass the title column: iterating the DataFrame itself yields only its
# column names, so the dictionary would contain just the header words
my_dict = list(make_dictionary(fiction['title']))