In [1]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import textdistance
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import nltk
from nltk.stem.porter import *

porterStemmer = PorterStemmer()

In [3]:
#load in the data
# all three files are read with one encoding; the truth file is read again later in
# the notebook with ISO-8859-1, so using it here keeps both loads of `truth` identical
DATA_ENCODING = 'ISO-8859-1'
amazon=pd.read_csv('amazon_small.csv',encoding = DATA_ENCODING)
google=pd.read_csv('google_small.csv',encoding = DATA_ENCODING)
truth = pd.read_csv('amazon_google_truth_small.csv',encoding = DATA_ENCODING)

In [4]:
# obtain the cross product of the two dataframes
# assign() puts the helper join key on temporary copies, so amazon and google keep
# their original columns and this cell can be re-run without side effects on them
df = pd.merge(amazon.assign(key=1), google.assign(key=1), on='key')
del df['key']

# replace nan cells with empty strings
# the result is assigned back to the column: replace(..., inplace=True) on df[col]
# operates on an intermediate Series and is not guaranteed to change df itself
for column in ("description_x", "description_y"):
    df[column] = df[column].replace(np.nan, "")

In [5]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''
    return the nltk english stopwords as a frozenset, reading the corpus only on the first call
    '''
    if 'english' not in _STOPWORD_CACHE:
        from nltk.corpus import stopwords
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def tonkenised_string(string):
    '''
    tokenise and clean a string as the preprocessing step for data matching

    string: the text to process; None or NaN (a missing cell) is treated as empty text
    returns: a list of lower-cased, stemmed tokens with punctuation and english
             stopwords removed (an empty list when there is nothing to tokenise)
    '''
    # a missing value has no tokens; NaN is the only float that differs from itself
    if string is None or (isinstance(string, float) and string != string):
        return []

    # tokenise the string; only purely alphabetic or purely numeric tokens are kept,
    # which drops punctuation as well as mixed tokens
    tokens = nltk.word_tokenize(string)
    words = [word.lower() for word in tokens if (word.isalpha() or word.isnumeric())]

    # remove all stopwords (the set is loaded once and reused between calls)
    stopWords = _english_stopwords()
    filteredList = [w for w in words if w not in stopWords]

    # stemming with the notebook-level porterStemmer instance
    return [porterStemmer.stem(word) for word in filteredList]

In [6]:
#obtain the distance for each pair of product
name_list = []
dscp_list = []
price_list = []
total_list = []  # never filled in this cell; kept so the name still exists afterwards

# df is a cross product, so the same title/description occurs in many rows:
# tokenise each distinct text once and reuse the resulting token list
token_cache = {}

def cached_tokens(text):
    '''
    return tonkenised_string(text), computing it only the first time a text is seen
    '''
    if text not in token_cache:
        token_cache[text] = tonkenised_string(text)
    return token_cache[text]

# iterate over the column arrays side by side instead of looking up df[col][i],
# so the loop does not depend on the row labels being 0..len(df)-1
rows = zip(df['title'].values, df['name'].values,
           df['description_x'].values, df['description_y'].values,
           df['price_x'].values, df['price_y'].values)
for title, name, dscp_x, dscp_y, price_x, price_y in rows:
    # name score: fuzz.ratio of the cleaned, re-joined product names
    name_x = ' '.join(cached_tokens(title))
    name_y = ' '.join(cached_tokens(name))
    name_list.append(fuzz.ratio(name_x, name_y))

    # description score: overlap of the two cleaned token lists
    dscp_list.append(textdistance.overlap(cached_tokens(dscp_x), cached_tokens(dscp_y)))

    # price score: absolute price difference relative to the larger of the two prices
    # NOTE(review): this divides by max(price_x, price_y); confirm the data has no
    # pair whose larger price is 0
    price_list.append(abs((price_x - price_y)/max(price_x, price_y)))

In [24]:
# store the three per-pair scores on the frame, in the same column order as before
score_columns = {
    'name_distance': name_list,
    'dscp_distance': dscp_list,
    'price_distance': price_list,
}
for column, scores in score_columns.items():
    df[column] = scores

# combined score: the name score is weighted by 0.01 and the description score by 0.1,
# then the price score weighted by 0.1 is subtracted
name_term = df['name_distance']*0.01
dscp_term = 0.1*df['dscp_distance']
price_term = 0.1*df['price_distance']
df['total_distance'] = name_term + dscp_term - price_term
df

Unnamed: 0,idAmazon,title,description_x,manufacturer_x,price_x,idGoogleBase,name,description_y,manufacturer_y,price_y,name_distance,dscp_distance,price_distance,total_distance,match
0,b0002itt84,rise of nations: gold (mac),rise of nations combines the thrills and speed...,macsoft,49.99,http://www.google.com/base/feeds/snippets/1329...,encore software 10599 - encore registry mechan...,encore software 10599 : pc tools registry mech...,,25.97,26,0.000000,0.480496,0.211950,0
1,b0002itt84,rise of nations: gold (mac),rise of nations combines the thrills and speed...,macsoft,49.99,http://www.google.com/base/feeds/snippets/1693...,adobe indesign cs3 for mac upgrade from pagemaker,system requirements powerpc g4 or g5 or intel ...,,205.99,38,0.000000,0.757318,0.304268,0
2,b0002itt84,rise of nations: gold (mac),rise of nations combines the thrills and speed...,macsoft,49.99,http://www.google.com/base/feeds/snippets/4998...,encore software 10568 - encore hoyle puzzle & ...,encore software 10568 : with old classics and ...,,17.97,22,0.166667,0.640528,0.172614,0
3,b0002itt84,rise of nations: gold (mac),rise of nations combines the thrills and speed...,macsoft,49.99,http://www.google.com/base/feeds/snippets/1695...,school zone interactive multiplication & divis...,multiplication & division introduces mutliplic...,,9.45,35,0.055556,0.810962,0.274459,0
4,b0002itt84,rise of nations: gold (mac),rise of nations combines the thrills and speed...,macsoft,49.99,http://www.google.com/base/feeds/snippets/1574...,apple software m9953z/a remote desktop 2.2 10 ...,the 10-client version allows one administrator...,apple software,294.99,29,0.111111,0.830537,0.218057,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22195,b000qfrt4o,professional home design suite platinum,do you believe your home is your castle? make ...,punch! software,99.95,http://www.google.com/base/feeds/snippets/3924...,the printshop pro publisher v22 deluxe dvd-rom,overview create professional projects with all...,,69.90,41,0.318182,0.300650,0.411753,0
22196,b000qfrt4o,professional home design suite platinum,do you believe your home is your castle? make ...,punch! software,99.95,http://www.google.com/base/feeds/snippets/1352...,webroot software 65210 spy sweeper 3 pc,spy sweeper 4.5 with the most advanced blockin...,webroot software,31.99,32,0.100000,0.679940,0.262006,0
22197,b000qfrt4o,professional home design suite platinum,do you believe your home is your castle? make ...,punch! software,99.95,http://www.google.com/base/feeds/snippets/1837...,microsoft b21-00806 ae mappoint 2006 cd,ae mappoint 2006 cd,,50.39,34,0.000000,0.495848,0.290415,0
22198,b000qfrt4o,professional home design suite platinum,do you believe your home is your castle? make ...,punch! software,99.95,http://www.google.com/base/feeds/snippets/1677...,emedia music corp my guitar,,,24.81,37,0.000000,0.751776,0.294822,0


In [22]:
truth=pd.read_csv('amazon_google_truth_small.csv',encoding = 'ISO-8859-1')

# label every candidate pair: 1 if its (idAmazon, idGoogleBase) pair is listed in truth, else 0
# a set lookup replaces the scan of the whole truth table for each row, and the column is
# assigned in one step instead of the chained write df['match'][i] = 1, which pandas
# reports with SettingWithCopyWarning and does not guarantee to reach df
truth_pairs = set(zip(truth['idAmazon'], truth['idGoogleBase']))
df['match'] = [int(pair in truth_pairs)
               for pair in zip(df['idAmazon'], df['idGoogleBase'])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
# linear regression feature importance
# (make_regression and pyplot are imported here but not used in this cell)
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot

# features are the three pairwise scores, the target is the match column
feature_columns = ['name_distance', 'dscp_distance', 'price_distance']
X = df[feature_columns]
y = df[['match']]  # double brackets keep y as a one-column frame

# fit() returns the fitted estimator, so the model is created and trained in one step
model = LinearRegression().fit(X, y)

# the fitted coefficients are read as the importance of each feature
importance = model.coef_
print(importance)

[[ 0.0034312   0.09492837 -0.0263316 ]]


In [30]:
threshold = 0.58

# a pair is kept when its combined score is above the threshold
# and its price score is below 0.2
is_candidate = (df['total_distance'] > threshold) & (df['price_distance'] < 0.2)
aid = df.loc[is_candidate, 'idAmazon'].tolist()
gid = df.loc[is_candidate, 'idGoogleBase'].tolist()

# write the selected id pairs to the result file and show how many were kept
match_product = pd.DataFrame({'idAmazon': aid, 'idGoogleBase': gid})
match_product.to_csv('task1a.csv', index=False)
len(match_product)

77

In [31]:
# compare the predicted pairs with the truth pairs as sets of (idAmazon, idGoogleBase):
# one hash lookup per pair instead of scanning every truth row for every prediction,
# and a pair is counted once even if it is listed more than once
truth_pairs = set(zip(truth['idAmazon'], truth['idGoogleBase']))
predicted_pairs = set(zip(match_product['idAmazon'], match_product['idGoogleBase']))

t = len(truth_pairs)
tp = len(predicted_pairs & truth_pairs)  # predicted and present in truth
fp = len(predicted_pairs - truth_pairs)  # predicted but absent from truth
fn = len(truth_pairs - predicted_pairs)  # present in truth but not predicted
print(tp)
print(fp)
print(fn)

64
13
66


In [32]:
# precision = tp / (tp + fp); reported as 0.0 when no pair was predicted,
# instead of failing with ZeroDivisionError
predicted_total = fp + tp
precision = tp/predicted_total if predicted_total else 0.0
precision

0.8311688311688312

In [33]:
# recall = tp / (tp + fn); reported as 0.0 when there are no true pairs,
# instead of failing with ZeroDivisionError
actual_total = tp + fn
recall = tp/actual_total if actual_total else 0.0
recall

0.49230769230769234