In [222]:

from os.path import join
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#pip install sparse_dot_topn
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn
## reading in data
# Load the left and right product tables and the labelled training pairs
# from the local "data" directory, in that order.
lefttable, righttable, train = (
    pd.read_csv(join('data', csv_name))
    for csv_name in ("ltable.csv", "rtable.csv", "train.csv")
)

In [224]:
def ngrams(string, n = 3):
    """Split ``string`` into its overlapping character n-grams.

    Before splitting, every comma, hyphen, period and slash is removed, as is
    any whitespace character immediately followed by ``BD``.

    Parameters
    ----------
    string : str
        Text to tokenise.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in left-to-right order; empty when the cleaned text is
        shorter than ``n``.
    """
    # The hyphen is escaped on purpose: written as `[,-./]` it is only removed
    # because `,-.` happens to be a three-character range (',', '-', '.').
    cleaned = re.sub(r'[,\-./]|\sBD', r'', string)
    # zip() over the n shifted copies stops at the shortest one, which yields
    # exactly the windows of length n.
    windows = zip(*[cleaned[i:] for i in range(n)])
    return [''.join(window) for window in windows]
# Testing ngrams work for verification
# print('All 3-grams in "Deluxroom":')
# ngrams('visioneer w120-wu roadwarrior sheetfed scanner')

In [24]:
# Fit a TF-IDF model on the left-table titles, tokenised by `ngrams`.
room_types = lefttable['title']
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(room_types)




In [25]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Run the ``sparse_dot_topn`` kernel on ``A`` and ``B`` and wrap the result.

    Both inputs are converted to CSR, output buffers with room for ``ntop``
    entries per row of ``A`` are allocated, and the buffers are handed to
    ``ct.sparse_dot_topn`` together with ``ntop`` and ``lower_bound``.

    NOTE(review): presumably the kernel keeps, for each row of ``A @ B``, the
    ``ntop`` largest values above ``lower_bound`` -- confirm against the
    sparse_dot_topn documentation.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the buffers.
    """
    # tocsr() is cheap when the input is already CSR.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32

    # Room for ntop stored values in every result row.
    capacity = n_rows * ntop

    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [26]:
# Compare the title matrix with itself, asking for the top 10 entries per row
# with similarity above 0.8, and report the wall-clock time.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=10,
    lower_bound=0.8,
)
t = time.time() - t1
print("SELFTIMED:", t)

SELFTIMED: 0.0848851203918457


In [27]:
# unpacks the resulting sparse matrix
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Unpack a sparse similarity matrix into a table of matched names.

    Both the row index and the column index of every non-zero entry are
    looked up in ``name_vector``.

    NOTE: a later cell of this notebook redefines ``get_matches_df`` with the
    signature ``(sparse_matrix, A, B, top=100)``; whichever cell ran last wins.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores.
    name_vector : sequence
        Names, indexable by the matrix row and column positions.
    top : int, default 100
        Maximum number of matches to return. A falsy value (``0``/``None``)
        returns every match.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (spelling
        kept because callers select the column by that name).
    """
    # COO exposes row / column / value as three aligned arrays. Masking on the
    # value mirrors `nonzero()` while keeping each score paired with its own
    # coordinates, even when the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    rows, cols, scores = coo.row[stored], coo.col[stored], coo.data[stored]

    # Never ask for more rows than there are matches (used to raise IndexError).
    nr_matches = min(top, scores.size) if top else scores.size

    return pd.DataFrame({'left_side': [name_vector[r] for r in rows[:nr_matches]],
                         'right_side': [name_vector[c] for c in cols[:nr_matches]],
                         'similairity': np.asarray(scores[:nr_matches], dtype=float)})

In [229]:
# Match right-table titles against left-table titles. The TF-IDF vocabulary is
# fitted on the left titles and the right titles are projected into it.
leftwords = lefttable.iloc[:3000]['title'].tolist()
rightwords = righttable.iloc[:3000]['title'].tolist()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(leftwords)
tf_idf_matrix_dirty = vectorizer.transform(rightwords)

t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)
t = time.time()-t1
print("SELFTIMED:", t)

# `matches` has one row per entry of rightwords and one column per entry of
# leftwords, so the row names must be rightwords and the column names
# leftwords. Passing them the other way round pairs each score with the wrong
# titles (or raises IndexError when the lists differ in length).
# Consequently `left_side` holds right-table titles and `right_side` holds
# left-table titles in the resulting frame.
# NOTE: this call needs the get_matches_df(sparse_matrix, A, B, top) variant,
# which this notebook only defines in a later cell.
matches_df = get_matches_df(matches, rightwords, leftwords, top=0)
# Optional filters:
# matches_df = matches_df[matches_df['similairity'] < 0.99999]# For removing all exact matches
# matches_df = matches_df[matches_df['similairity'] > 0.6]
matches_df = matches_df.sort_values(by = ['similairity'], ascending = False)

SELFTIMED: 0.08464384078979492


In [113]:
def get_matches_df(sparse_matrix, A, B, top=100):
    """Unpack a sparse similarity matrix into a table of matched names.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores.
    A : sequence
        Names looked up by the row position of each non-zero entry.
    B : sequence
        Names looked up by the column position of each non-zero entry.
    top : int, default 100
        Maximum number of matches to return. A falsy value (``0``/``None``)
        returns every match.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side`` (from ``A``), ``right_side`` (from ``B``) and
        ``similairity`` (spelling kept because callers select the column by
        that name).
    """
    # COO exposes row / column / value as three aligned arrays. Masking on the
    # value mirrors `nonzero()` while keeping each score paired with its own
    # coordinates, even when the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    rows, cols, scores = coo.row[stored], coo.col[stored], coo.data[stored]

    # Never ask for more rows than there are matches (used to raise IndexError).
    nr_matches = min(top, scores.size) if top else scores.size

    return pd.DataFrame({'left_side': [A[r] for r in rows[:nr_matches]],
                         'right_side': [B[c] for c in cols[:nr_matches]],
                         'similairity': np.asarray(scores[:nr_matches], dtype=float)})

# Small hand-made example: misspelled ("dirty") names are matched against a
# reference ("clean") list.
df_dirty = {
    "name": [
        "gogle",
        "bing",
        "amazn",
        "facebook",
        "fcbook",
        "abbasasdfzz",
        "zsdfzl",
        'maxell couleur series ear buds purple',
    ]
}

df_clean = {
    "name": [
        "google",
        "bing",
        "amazon",
        "facebook",
        'ooma telo voip phone system base',
    ]
}

print (df_dirty["name"])
print (df_clean["name"])

# The vocabulary comes from the clean names only; dirty names are projected
# into it.
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix_clean = vectorizer.fit_transform(df_clean['name'])
tf_idf_matrix_dirty = vectorizer.transform(df_dirty['name'])

# Rows are dirty names, columns are clean names; one entry is requested per row.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix_dirty,
    tf_idf_matrix_clean.transpose(),
    ntop=1,
    lower_bound=0,
)
t = time.time() - t1
print("SELFTIMED:", t)

# top=0 returns every match instead of the first 100.
matches_df = get_matches_df(matches, df_dirty['name'], df_clean['name'], top=0)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(matches_df)

['gogle', 'bing', 'amazn', 'facebook', 'fcbook', 'abbasasdfzz', 'zsdfzl', 'maxell couleur series ear buds purple']
['google', 'bing', 'amazon', 'facebook', 'ooma telo voip phone system base']
SELFTIMED: 0.0003390312194824219
     left_side                        right_side  similairity
0        gogle                            google     0.707107
1         bing                              bing     1.000000
2        amazn                            amazon     0.707107
3     facebook                          facebook     1.000000
4       fcbook                          facebook     0.577350
5  abbasasdfzz  ooma telo voip phone system base     0.182574


In [347]:
from collections import Counter
from math import *


def word2vec(word):
    """Describe ``word`` as a bag of characters.

    Returns a 4-tuple ``(counts, characters, length, word)``:
    a ``Counter`` of how often each character occurs, the set of distinct
    characters, the Euclidean norm of the counts, and the input itself.
    """
    char_counts = Counter(word)
    # Euclidean norm of the per-character counts.
    norm = sqrt(sum(count * count for count in char_counts.values()))
    return char_counts, set(char_counts), norm, word


def cosine_similarity(vector1, vector2, ndigits):
    """Cosine similarity of two character-count vectors from ``word2vec``.

    Each vector is a tuple whose first three items are the character counts,
    the set of characters, and the vector norm. The result is rounded to
    ``ndigits`` decimals; it is ``0`` when either norm is zero.
    """
    counts_a, chars_a, norm_a = vector1[0], vector1[1], vector1[2]
    counts_b, chars_b, norm_b = vector2[0], vector2[1], vector2[2]

    # Only characters present in both words contribute to the dot product.
    shared = chars_a.intersection(chars_b)
    dot_product = sum(counts_a[ch] * counts_b[ch] for ch in shared)

    denominator = norm_a * norm_b
    if denominator == 0:
        # At least one of the words is empty.
        return 0
    return round(dot_product / denominator, ndigits)




def find_similar(id_list, full_names_list, similarity_threshold, ndigits):
    """Score every unordered pair of names and keep the similar ones.

    Parameters
    ----------
    id_list : sequence
        Identifier of each name, aligned with ``full_names_list``.
    full_names_list : sequence
        Names to compare; each is converted with ``str`` before scoring.
    similarity_threshold : float
        Lowest score (inclusive) a pair needs to be kept.
    ndigits : int
        Number of decimals the scores are rounded to.

    Returns
    -------
    pandas.DataFrame
        One row per kept pair with columns ``left_id``, ``right_id``,
        ``full_name``, ``comparison_name`` and ``similarity_score``. The
        columns are present even when no pair qualifies, so callers can
        filter on them without checking for an empty result first.
    """
    columns = ['left_id', 'right_id', 'full_name', 'comparison_name', 'similarity_score']

    # Vectorise every name once instead of once per comparison.
    vector_list = [word2vec(str(name)) for name in full_names_list]

    results_list = []
    # j starts after i so that each pair is compared exactly once.
    for i, vector1 in enumerate(vector_list):
        for j in range(i + 1, len(vector_list)):
            vector2 = vector_list[j]
            similarity_score = cosine_similarity(vector1, vector2, ndigits)
            # Scores of exactly 1 are kept here; callers that want to exclude
            # identical names filter them out afterwards.
            if similarity_threshold <= similarity_score <= 1:
                results_list.append([id_list[i], id_list[j], vector1[3], vector2[3], similarity_score])

    return pd.DataFrame(results_list, columns=columns)

# Title -> id lookups for both tables. When a title occurs more than once the
# id of its last row wins, exactly as when filling the dict row by row.
leftdic = dict(zip(lefttable['title'], lefttable['id']))
rightdic = dict(zip(righttable['title'], righttable['id']))

# Every scored pair is requested from find_similar; the score window is
# applied afterwards.
similarity_threshold = 0
ndigits = 3

match_frames = []
for i, x in lefttable.iterrows():
    if i == 2553:
        break
    # Blocking: only right-table rows of the same brand are candidates.
    same_brand = righttable[righttable['brand'] == x['brand']]
    # list.append() returns None, so the lists must be built by concatenation;
    # the previous `[..].append(..)` form always produced None.
    name_list = [x['title']] + same_brand['title'].tolist()
    id_list = [x['id']] + same_brand['id'].tolist()
    # NOTE(review): find_similar scores every pair in name_list, including
    # right-vs-right pairs that are discarded just below -- this is quadratic
    # in the number of same-brand rows.
    results_df = find_similar(id_list, name_list, similarity_threshold, ndigits)
    if results_df.empty:
        # No candidate pairs for this row (the empty frame may carry no columns).
        continue
    # Keep only the pairs whose first member is the current left title.
    matches_df = results_df[results_df['full_name'] == x['title']]
    # Drop (near-)exact matches and anything at or below 0.94.
    matches_df = matches_df[matches_df['similarity_score'] < 0.99999]
    matches_df = matches_df[matches_df['similarity_score'] > 0.94]
    if not matches_df.empty:
        match_frames.append(matches_df)

# Concatenate once instead of growing a DataFrame inside the loop
# (DataFrame.append was removed in pandas 2.0).
if match_frames:
    outputdf = pd.concat(match_frames)
else:
    outputdf = pd.DataFrame(columns = ['left_id', 'right_id', 'full_name', 'comparison_name', 'similarity_score'])

outputdf = outputdf.sort_values(by = ['similarity_score'], ascending = False)
outputdf = outputdf.dropna()

# After sorting, the first occurrence is the best-scoring one, so each title
# on either side keeps only its best match.
outputdf = outputdf.drop_duplicates(subset = ['full_name'])
outputdf = outputdf.drop_duplicates(subset = ['comparison_name'])

print(outputdf)

    
    





                                                 full_name  \
2730399  draper matte white access series e electric sc...   
195857   draper matte white ultimate access series e el...   
3142462  draper diamondscreen rear projection screen wi...   
2877667  draper matte white access series e electric sc...   
280722   draper matte white signature series e electric...   
...                                                    ...   
2353952    rubbermaid techfile plastic magazine file black   
2917519       bic round stic ball pen black 1-dozen 3-pack   
246960            mortimer beckett time paradox jewel case   
2659478  avery metal rim key tags card stock metal 1-1 ...   
2187913                                      case logic 16   

                                           comparison_name  similarity_score  
2730399  draper matte white access series e electric sc...             0.999  
195857   draper matte white ultimate access series e el...             0.999  
3142462  draper di

In [319]:
# Translate the matched titles back to their table ids. Rows are collected in
# a list and turned into a DataFrame once (DataFrame.append was removed in
# pandas 2.0 and copies the whole frame on every call).
result_rows = []
for i, x in outputdf.iterrows():
    # Skip pairs whose titles cannot be found in the lookup tables.
    if x['full_name'] in leftdic and x['comparison_name'] in rightdic:
        result_rows.append({"left_id": leftdic[x['full_name']],
                            "left": x['full_name'],
                            "right_id": rightdic[x['comparison_name']],
                            "right": x['comparison_name']})
resultdf = pd.DataFrame(result_rows, columns = ['left_id', 'left', 'right_id', 'right'])


print(resultdf)

  left_id                                               left right_id  \
0     324  corsair xms2 2gb 2 x 1gb pc2-6400 800mhz 240-p...    21543   
1    2430  da-lite da-plex thru-the-wall rear projection ...    19283   
2    1229   michigan wolverines iphone 4 case silicone cover    10854   
3    1348  buffalo technology linkstation duo dual-bay 1....    12647   
4     765  startech 2 port pci express superspeed usb 3.0...    11288   
5    1242  gear head 2.4 ghz wireless optical nano mouse ...    18372   
6    1070               chicago cubs pink iphone 4 hard case     5560   
7     640         oklahoma sooners iphone 4 case black shell     6286   
8    1951  incipio ipod touch 4g feather hard shell case ...    15247   

                                               right  
0  corsair xms2 4 gb 2 x 2 gb pc2-6400 800 mhz 24...  
1  da-lite da-plex thru-the-wall rear projection ...  
2       michigan wolverines iphone 3g silicone cover  
3  buffalo technology linkstation duo 1 tb 2 x 50

In [334]:
# Pair every left-table row with the right-table rows that share both its
# brand and its model number. Missing brands / model numbers never match
# because NaN compares unequal to everything, itself included.
result_rows = []
for i, x in lefttable.iterrows():
    same_brand = righttable[righttable['brand'] == x['brand']]
    same_model = same_brand[same_brand['modelno'] == x['modelno']]
    for right_id, right_title in zip(same_model['id'].tolist(), same_model['title'].tolist()):
        result_rows.append({"left_id": x['id'], "right_id": right_id,
                            "left": x['title'], "right": right_title})

# Build the frame once; DataFrame.append was removed in pandas 2.0 and is
# quadratic when called inside a loop.
resultdf = pd.DataFrame(result_rows, columns = ['left_id', 'right_id', 'left', 'right'])

# (left id, right id) pairs in the order they were found.
modellist = [(row['left_id'], row['right_id']) for row in result_rows]
print(modellist)
    
        
    

[(3, 4378), (6, 9015), (7, 1303), (9, 2456), (15, 4282), (19, 13057), (26, 537), (29, 6031), (39, 10872), (41, 6816), (49, 8096), (57, 13302), (65, 11129), (66, 3668), (93, 8350), (102, 11759), (104, 11791), (107, 12090), (108, 21709), (114, 9223), (126, 1555), (128, 2701), (128, 10640), (130, 9890), (131, 5595), (133, 18758), (138, 11468), (148, 18936), (149, 19030), (159, 9709), (168, 20286), (176, 4854), (182, 7399), (186, 1928), (191, 1243), (191, 11197), (192, 11131), (193, 19880), (202, 2447), (205, 10517), (217, 5274), (218, 5046), (219, 8696), (222, 1354), (225, 12627), (239, 10617), (258, 10646), (259, 10734), (261, 3238), (261, 5192), (266, 2949), (272, 3965), (281, 7824), (286, 18626), (294, 19911), (299, 4644), (303, 19299), (307, 10209), (309, 19291), (312, 13299), (317, 1082), (319, 21483), (323, 1244), (324, 8519), (328, 14369), (329, 7314), (330, 1599), (334, 10661), (346, 8304), (351, 10048), (356, 21591), (360, 11566), (362, 3235), (375, 5041), (376, 9389), (378, 2169

In [335]:
# NOTE(review): `sm` is not used by any visible cell; kept in case a later
# cell relies on it.
sm = 0
# (ltable_id, rtable_id) pairs labelled as matches in the training data.
# A vectorised filter replaces the row-by-row iterrows() loop and yields
# plain Python numbers in the tuples.
positive_pairs = train[train['label'] == 1]
trainlist = list(zip(positive_pairs['ltable_id'].tolist(), positive_pairs['rtable_id'].tolist()))
print(trainlist)

[(334, 10661), (2076, 18297), (983, 18364), (2523, 10365), (1899, 5458), (1467, 16059), (42, 172), (1324, 912), (781, 17483), (2344, 6970), (577, 19367), (1703, 17701), (2252, 16783), (2013, 9226), (1475, 8650), (446, 5774), (862, 5782), (1832, 13396), (588, 20108), (2515, 4424), (1280, 7637), (1893, 11120), (159, 5664), (194, 6876), (1578, 3323), (1951, 20443), (2118, 20621), (1618, 6364), (313, 8870), (1686, 17665), (742, 8235), (264, 16640), (1579, 11460), (2531, 19976), (1477, 2432), (2327, 8544), (145, 15891), (3, 4378), (2343, 4784), (1987, 10905), (1708, 16226), (266, 2949), (49, 8096), (752, 7075), (315, 7705), (1803, 6872), (1727, 14918), (2018, 11227), (1353, 6819), (639, 7371), (475, 20392), (1396, 2438), (157, 16205), (81, 17194), (1671, 20794), (201, 6828), (695, 12569), (2511, 21503), (2184, 11689), (1315, 21112), (770, 11223), (2021, 17451), (563, 18915), (86, 7610), (20, 16837), (1397, 20805), (545, 7624), (2164, 3694), (1776, 15489), (944, 14177), (1706, 17026), (531, 

In [346]:
print(len(modellist))
print(len(trainlist))
# Drop the pairs already labelled in the training data. The list is rebuilt
# rather than edited in place: calling modellist.remove() while iterating over
# modellist skips the element that follows every removal. A set makes each
# membership test O(1).
known_pairs = set(trainlist)
modellist = [pair for pair in modellist if pair not in known_pairs]
print(len(modellist))
print(modellist)

# Build the frame in one go (DataFrame.append was removed in pandas 2.0).
outputfile = pd.DataFrame(modellist, columns = ['ltable_id', 'rtable_id'])

outputfile.to_csv("output.csv", index = False)


289
470
289
[(6, 9015), (7, 1303), (9, 2456), (19, 13057), (26, 537), (39, 10872), (65, 11129), (102, 11759), (104, 11791), (108, 21709), (114, 9223), (126, 1555), (128, 2701), (128, 10640), (133, 18758), (149, 19030), (168, 20286), (176, 4854), (191, 1243), (191, 11197), (193, 19880), (202, 2447), (218, 5046), (239, 10617), (259, 10734), (261, 3238), (281, 7824), (286, 18626), (299, 4644), (307, 10209), (309, 19291), (312, 13299), (317, 1082), (319, 21483), (323, 1244), (328, 14369), (329, 7314), (346, 8304), (356, 21591), (360, 11566), (362, 3235), (375, 5041), (376, 9389), (378, 21690), (386, 4058), (395, 99), (396, 7144), (416, 19475), (429, 3273), (448, 13173), (453, 10899), (457, 8018), (475, 781), (480, 376), (481, 986), (491, 19957), (502, 11435), (512, 9060), (522, 14014), (528, 21347), (547, 9295), (548, 3414), (567, 12173), (589, 15495), (599, 556), (600, 12881), (603, 21731), (604, 15829), (611, 8429), (623, 14735), (624, 4039), (636, 9151), (637, 15920), (639, 10128), (652