In [1]:
import pandas as pd
import numpy as np
import re

In [138]:
# Model selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.under_sampling import EditedNearestNeighbours, CondensedNearestNeighbour, RandomUnderSampler
from sklearn.metrics import precision_recall_fscore_support

import networkx as nx
# Disable pandas' chained-assignment check for the whole session
# (None = neither warn nor raise). NOTE(review): this also hides genuine
# SettingWithCopy problems in the helper functions — confirm it is still needed.
pd.options.mode.chained_assignment = None


In [3]:
from datasketch import MinHash, MinHashLSH

In [4]:
from support_functions import *

In [5]:
# Labelled training data: one venue table per source plus the known matches.
fs, locu = (
    pd.read_json(path)
    for path in ("train/foursquare_train.json", "train/locu_train.json")
)
truth = pd.read_csv("train/matches_train.csv")

# Unlabelled competition data, same two sources.
fs_test, locu_test = (
    pd.read_json(path)
    for path in ("online_competition/foursquare_test.json", "online_competition/locu_test.json")
)

In [6]:
# Report (rows, columns) for every loaded table.
loaded_tables = [
    ('foursquare train shape', fs),
    ('locu train shape', locu),
    ('truth train shape', truth),
    ('foursquare test shape', fs_test),
    ('locu test shape', locu_test),
]
for label, table in loaded_tables:
    print(label, table.shape)

foursquare train shape (600, 11)
locu train shape (600, 11)
truth train shape (360, 2)
foursquare test shape (400, 11)
locu test shape (400, 11)


In [7]:
# Column dtypes and non-null counts for the Foursquare training table.
# In the captured output `phone` is the only column with missing values (314 of 600 present).
fs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Data columns (total 11 columns):
country           600 non-null object
id                600 non-null object
latitude          600 non-null float64
locality          600 non-null object
longitude         600 non-null float64
name              600 non-null object
phone             314 non-null object
postal_code       600 non-null object
region            600 non-null object
street_address    600 non-null object
website           600 non-null object
dtypes: float64(2), object(9)
memory usage: 56.2+ KB


In [8]:
# Column dtypes and non-null counts for the Locu training table.
# In the captured output one row lacks latitude/longitude (599 of 600 present).
locu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Data columns (total 11 columns):
country           600 non-null object
id                600 non-null object
latitude          599 non-null float64
locality          600 non-null object
longitude         599 non-null float64
name              600 non-null object
phone             600 non-null object
postal_code       600 non-null object
region            600 non-null object
street_address    600 non-null object
website           600 non-null object
dtypes: float64(2), object(9)
memory usage: 56.2+ KB


In [9]:
# Preview the first rows of the Foursquare training table.
fs.head()

Unnamed: 0,country,id,latitude,locality,longitude,name,phone,postal_code,region,street_address,website
0,United States,4f328ea619836c91c7e3714a,40.794855,New York,-73.966069,Chen Jin Diao Restaurant,(212) 678-4680,,NY,800 Columbus Ave.,
1,United States,4c37b5f6ae2da593a56affc5,40.76046,New York,-73.99492,West Side Steakhouse,(212) 564-4803,10036.0,NY,597 10th Ave.,
2,United States,4b41060df964a52098bf25e3,40.760249,New York,-73.983746,Pearls: Chinese & Szechuan Cuisine,(212) 582-7380,10019.0,NY,732 7th Ave.,
3,United States,3fd66200f964a520ece41ee3,40.70866,New York,-74.011263,Suspenders,(212) 732-5005,10006.0,NY,111 Broadway,http://www.suspendersnyc.com
4,United States,52064aab11d284f64d088329,40.718105,New York,-73.996096,Grand Century Cafe,,,NY,,


In [10]:
# Preview the first rows of the Locu training table.
locu.head()

Unnamed: 0,country,id,latitude,locality,longitude,name,phone,postal_code,region,street_address,website
0,United States,cc9e8f40230c6ead2873,40.739822,New York,-73.985144,Chipotle Mexican Grill,2126736904,10010,NY,125 East 23rd St.,http://www.chipotle.com/
1,United States,81df045e563fb6cab7f7,40.810765,New York,-73.952591,Honey Salon Inc,2126630100,10026,NY,174 Saint Nicholas Ave.,
2,United States,b265cf6c80121211dbfc,40.74358,New York,-73.986127,Palatte,6464763812,10016,NY,66 Madison Ave.,http://www.palattenyc.com/
3,United States,4fc50abefae5311cc2b3,40.82882,New York,-73.949022,Best Taste Restaurant,2122815691,10031,NY,3609 Broadway,
4,United States,72560dc41f1b7aed0d47,40.749936,New York,-73.983849,Integra Hair System Inc,2125636786,10018,NY,11 W. 36th St. # 3,http://www.integrahair.com/


In [11]:
# Preview the first rows of the Locu competition (test) table.
locu_test.head()

Unnamed: 0,country,id,latitude,locality,longitude,name,phone,postal_code,region,street_address,website
0,United States,b48da849c54f904013e2,40.758005,New York,-73.992727,Panda Restaurant,2126950836,10036,NY,570 9th Ave.,http://pandarg.com/
1,United States,95ad783fd1c65bb8fdbf,40.721025,New York,-73.982903,El Maguey y la Tuna,2124733919,10002,NY,321 East Houston St.,http://www.elmagueyylatunamex.com
2,United States,5060d123ccad77923b20,40.725772,New York,-73.991947,Hair Date Salon/ Professionals Hair Cut,2122288381,10003,NY,329 Bowery,http://www.hairdatenyc.com/
3,United States,9dd6f6b177096efd5da4,40.802047,New York,-73.936988,Pizza Plus Inc,2122890005,10035,NY,2253 3rd Ave. # 1,http://www.pizzaplusmore.com/
4,United States,4773c30d2df4368c0d09,40.72519,New York,-73.992547,Double Crown,2122540350,10012,NY,316 Bowery,http://doublecrown-nyc.com


In [12]:
# Preview the first rows of the Foursquare competition (test) table.
fs_test.head()

Unnamed: 0,country,id,latitude,locality,longitude,name,phone,postal_code,region,street_address,website
0,United States,4eefac5cb8f76a24a7b12202,40.73282,New York,-73.998113,Sticky's Finger Joint,(212) 777-7131,10011.0,NY,31 W. 8th St.,
1,United States,4fb6aeabe4b02861a894f317,40.769622,New York,-73.964026,Café 7,,10021.0,NY,725 Park Ave. 7th Floor,
2,United States,4f1227d9e4b03856f12a915d,40.722477,New York,-73.994903,232 Mott,,10012.0,NY,232 Mott St.,
3,United States,51ddfccf498eefb41d65b1f1,40.757635,New York,-73.985828,"Hard Rock Cafe,New York",,,NY,,
4,United States,4d71199c783f8cfa4efae1d6,40.713705,New York,-73.990162,Lo Mein/Noodles Cart,,,NY,Rutgers St.,


# Supervised Learning

### Construct train set

In [13]:
# Format phone numbers in train and test set.
# The raw sources use different phone layouts (see the previews above), so every
# table is passed through the same `format_phone` helper from support_functions.
# NOTE: this overwrites the `phone` column in place — re-run the loading cell
# before re-running this one.
for venues in (locu, locu_test, fs, fs_test):
    venues['phone'] = format_phone(venues['phone'])

In [14]:
# 1. Select the candidate (locu, foursquare) pairs worth comparing.
#    NOTE(review): 0.05 is presumably a distance cut-off — confirm in support_functions.
relevant_comb = find_relevant_comb(locu, fs, threshold=0.05)

# 2. Label those pairs with the ground truth, then 3. derive the model features.
train_set = create_features(
    create_train_set(locu, fs, truth, relevant_comb)
)

In [15]:
# Preview the labelled candidate pairs: two id columns, the `match` label and the derived features.
train_set.head()

Unnamed: 0,locu_id,foursquare_id,match,name_dist,add_dist,add_exist,long_dist,lat_dist,phone_exist,phone_match
0,70f97b0ce676d041ab5c,49ff7bcef964a5202a701fe3,1,8,0,1,4.8e-05,0.000145,1,1
1,2b02c737979dd30050b3,49ff7bcef964a5202a701fe3,0,14,20,1,0.00314,0.00192,1,0
2,29cc2329ed7e31224f5b,49ff7bcef964a5202a701fe3,0,15,24,1,0.002516,0.003647,1,0
3,bb19fa24ef5f523a89f4,49ff7bcef964a5202a701fe3,0,15,21,1,0.000719,0.003148,1,0
4,86ea4fd41e60bdda40be,49ff7bcef964a5202a701fe3,0,26,9999,0,0.004139,0.005391,1,0


In [136]:
# Number of candidate training pairs and columns.
train_set.shape

(112437, 10)

### Construct test set

In [16]:
# Same candidate-pair selection as for training (same 0.05 threshold),
# applied to the competition tables; no labels are attached here.
relevant_comb_test = find_relevant_comb(locu_test, fs_test, threshold=0.05)
test_set = create_test_set(
    locu_test,
    fs_test,
    relevant_comb_test,
)

Unnamed: 0,locu_id,foursquare_id,latitude_locu,longitude_locu,name_locu,street_address_locu,phone_locu,latitude_fs,longitude_fs,name_fs,street_address_fs,phone_fs
0,b48da849c54f904013e2,4fb6aeabe4b02861a894f317,40.758005,-73.992727,Panda Restaurant,570 9th Ave.,2126950836,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
1,9dd6f6b177096efd5da4,4fb6aeabe4b02861a894f317,40.802047,-73.936988,Pizza Plus Inc,2253 3rd Ave. # 1,2122890005,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
2,206c363a5907bfa98ec0,4fb6aeabe4b02861a894f317,40.737576,-73.996471,Pink Tea Cup,538 6th Ave.,2122060605,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
3,cb95d1e0730222cc3209,4fb6aeabe4b02861a894f317,40.760685,-73.98257,Roses Mexicano,1st Ave.,0,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
4,25ca87e725b930488ed6,4fb6aeabe4b02861a894f317,40.755325,-73.990906,Starbucks,600 Eighth Ave.,2129977341,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
5,daff4926a1d14a6c2921,4fb6aeabe4b02861a894f317,40.765683,-73.976401,Trump's World Tower Valet,106 Central Park South,2127591068,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
6,5013d96a9633f92f2dbf,4fb6aeabe4b02861a894f317,40.751522,-73.980221,Hopkins Foodservice Specialists,280 Madison Ave.,2126799293,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
7,8f922b60da3c4795c589,4fb6aeabe4b02861a894f317,40.66055,-73.960699,Jaquira Beauty Salon,544 Flatbush Ave.,7184624281,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
8,1ff1b1c1d5252ca3682a,4fb6aeabe4b02861a894f317,40.748853,-73.992386,Harrington's Bar & Grill,370 7th Ave.,2127363636,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0
9,d837156a7175ece3df6e,4fb6aeabe4b02861a894f317,40.826815,-73.946587,Naty Unisex,1766 Amsterdam Ave. # 1,0,40.769622,-73.964026,Café 7,725 Park Ave. 7th Floor,0


In [17]:
# Derive the model features for the candidate test pairs.
#
# `test_set` is rebound to the featurized frame, whose raw name/address/phone/
# coordinate columns are replaced by distance features (compare the two
# captured previews). Re-running this cell without the guard would feed the
# already-featurized frame back into `create_features`, so the call is skipped
# once a feature column such as `name_dist` is present.
if 'name_dist' not in test_set.columns:
    test_set = create_features(test_set)
test_set.head(10)

Unnamed: 0,locu_id,foursquare_id,name_dist,add_dist,add_exist,long_dist,lat_dist,phone_exist,phone_match
0,b48da849c54f904013e2,4fb6aeabe4b02861a894f317,14,21,1,0.028701,0.011617,1,0
1,9dd6f6b177096efd5da4,4fb6aeabe4b02861a894f317,12,18,1,0.027038,0.032425,1,0
2,206c363a5907bfa98ec0,4fb6aeabe4b02861a894f317,11,21,1,0.032445,0.032046,1,0
3,cb95d1e0730222cc3209,4fb6aeabe4b02861a894f317,13,21,1,0.018544,0.008937,0,0
4,25ca87e725b930488ed6,4fb6aeabe4b02861a894f317,8,26,1,0.02688,0.014297,1,0
5,daff4926a1d14a6c2921,4fb6aeabe4b02861a894f317,24,29,1,0.012375,0.003939,1,0
6,5013d96a9633f92f2dbf,4fb6aeabe4b02861a894f317,29,23,1,0.016195,0.0181,1,0
7,8f922b60da3c4795c589,4fb6aeabe4b02861a894f317,18,24,1,0.003327,0.109072,1,0
8,1ff1b1c1d5252ca3682a,4fb6aeabe4b02861a894f317,22,21,1,0.02836,0.020769,1,0
9,d837156a7175ece3df6e,4fb6aeabe4b02861a894f317,9,24,1,0.017439,0.057193,0,0


In [137]:
# Number of candidate test pairs and columns.
test_set.shape

(52626, 9)

### Model Selection

In [19]:
# Features are every column except the label and the two venue ids;
# the label vector is the `match` column.
label_and_id_cols = ['match','locu_id','foursquare_id']
X_train = train_set.drop(label_and_id_cols, axis='columns').values
y_train = train_set['match'].values

In [20]:
# Test features: drop only the two venue ids (the test pairs carry no label),
# leaving the columns in the same order as in X_train.
test_id_cols = ['locu_id','foursquare_id']
X_test = test_set.drop(test_id_cols, axis='columns').values

In [21]:
# Sanity check: train and test matrices must have the same number of feature columns.
for label, matrix in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test)):
    print(label, matrix.shape)

X_train (112437, 7)
y_train (112437,)
X_test (52626, 7)


# Candidate 1: bagging of decision trees where each bag is class-balanced by
# the imbalanced-learn ensemble (a "balanced random forest").
#
# - `max_features='sqrt'` is what the former `'auto'` meant for classifiers;
#   `'auto'` is deprecated and rejected by newer scikit-learn releases.
# - The base estimator is passed positionally because its keyword was renamed
#   from `base_estimator` to `estimator` in newer releases; the first positional
#   argument is accepted under both names.
tree = DecisionTreeClassifier(max_features='sqrt')
resampled_rf = BalancedBaggingClassifier(tree,
                                         n_estimators=100, random_state=0)
# 3-fold stratified CV, reporting mean F1 / precision / recall over the folds.
scores = cross_validate(resampled_rf,
                        X_train, y_train, cv=StratifiedKFold(n_splits=3), scoring=('f1', 'precision','recall'))
scores['test_f1'].mean(), scores['test_precision'].mean(), scores['test_recall'].mean()

In [None]:
# Candidate 2: clean the training folds with Edited Nearest Neighbours, then
# fit a shallow random forest. The imblearn pipeline applies the sampler only
# when fitting, so validation folds are scored on unresampled data.
# `random_state` is fixed so the cross-validated precision is reproducible.
enn_pipe_rf = make_imb_pipeline(EditedNearestNeighbours(n_neighbors= 5),
                                  RandomForestClassifier(n_estimators=200,max_depth=4,random_state=0))
scores = cross_val_score(enn_pipe_rf, X_train, y_train, cv=3, scoring='precision')
np.mean(scores)

In [139]:
# Candidate 3: same ENN cleaning step, followed by a default logistic regression.
# NOTE: the pipeline keeps the `enn_pipe_rf` name used by the neighbouring
# cells even though no random forest is involved here.
enn_cleaner = EditedNearestNeighbours(n_neighbors=5)
enn_pipe_rf = make_imb_pipeline(enn_cleaner, LogisticRegression())

# Mean precision over 3 CV folds.
scores = cross_val_score(enn_pipe_rf, X_train, y_train, scoring='precision', cv=3)
scores.mean()

0.93954504134979933

In [None]:
# Candidate 4: Condensed Nearest Neighbour undersampling followed by a random
# forest, scored by mean F1 over 3 CV folds.
# Both steps are stochastic, so both are seeded to make the score reproducible.
cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(random_state=0),
                              RandomForestClassifier(n_estimators=100,random_state=0))
scores = cross_val_score(cnn_pipe, X_train, y_train, cv=3, scoring='f1')
np.mean(scores)

In [None]:
# Candidate 5: random undersampling followed by a random forest.
# The sampler and the forest are both random; seeding them makes the reported
# F1 / precision / recall reproducible across runs.
undersample_pipe_rf = make_imb_pipeline(RandomUnderSampler(random_state=0),
                                        RandomForestClassifier(n_estimators=200,max_depth=5,random_state=0))
scores = cross_validate(undersample_pipe_rf,
                        X_train, y_train, cv=3, scoring=('f1','precision','recall'))
scores['test_f1'].mean(), scores['test_precision'].mean(), scores['test_recall'].mean()

In [None]:
# Candidate 6: no resampling; the class imbalance is handled through
# `class_weight='balanced'` instead. Seeded so the CV scores are reproducible.
scores = cross_validate(RandomForestClassifier(n_estimators=200, max_depth=10,class_weight='balanced',random_state=0),
                        X_train, y_train, cv=3, scoring=('f1','precision','recall'))
scores['test_f1'].mean(), scores['test_precision'].mean(), scores['test_recall'].mean()

In [62]:
# Ground-truth matches as a list of row tuples, the form handed to compute_metrics below.
truth_list = list(map(tuple, truth.values))

In [147]:
# Pick model, fit, and get the predicted probabilities for  train and test sets
# The forest is seeded so that the metrics and the number of test matches
# printed below can be reproduced on a re-run.
enn_pipe_rf = make_imb_pipeline(EditedNearestNeighbours(n_neighbors= 5),
                                  RandomForestClassifier(n_estimators=300,max_depth=5,max_leaf_nodes=24,n_jobs=3,random_state=0))
enn_pipe_rf.fit(X_train,y_train)
predicted_proba_train = enn_pipe_rf.predict_proba(X_train)
predicted_proba_test = enn_pipe_rf.predict_proba(X_test)

# Training pairs: turn probabilities into a matching and score it against the truth.
# NOTE: these metrics are computed on the same pairs the model was fitted on,
# so they are optimistic compared with the cross-validated scores above.
# NOTE(review): 0.5 is presumably the probability cut-off for keeping a
# candidate pair — confirm in support_functions.get_graph_structure.
graph_structure_train,locu_ids_train = get_graph_structure(predicted_proba_train,0.5,train_set)
matches_train = bipartile_match(graph_structure_train,locu_ids_train)
compute_metrics(truth_list,matches_train)

# Test pairs: same procedure; no truth is available, so only counts are printed.
graph_structure_test,locu_ids_test = get_graph_structure(predicted_proba_test,0.5,test_set)
matches_test = bipartile_match(graph_structure_test,locu_ids_test)
print('unique ids:',pd.DataFrame(graph_structure_test)[0].nunique())
print('final:',len(matches_test))

precision = 1.0
recall = 0.9694444444444444
f1-score = 0.9844851904090269
unique ids: 234
final: 234


In [143]:
# Random Forest model
rf = RandomForestClassifier(n_estimators=500,max_depth=5,max_leaf_nodes=24,n_jobs=3,random_state=5,class_weight='balanced')
rf.fit(X_train,y_train)
predicted_proba_train = rf.predict_proba(X_train)
predicted_proba_test = rf.predict_proba(X_test)
graph_structure_train,locu_ids_train = get_graph_structure(predicted_proba_train,0.5,train_set)
matches_train = bipartile_match(graph_structure_train,locu_ids_train)
compute_metrics(truth_list,matches_train)
graph_structure_test,locu_ids_test = get_graph_structure(predicted_proba_test,0.5,test_set)
matches_test = bipartile_match(graph_structure_test,locu_ids_test)
print('unique ids:',pd.DataFrame(graph_structure_test)[0].nunique())
print('final:',len(matches_test))

precision = 0.9943977591036415
recall = 0.9861111111111112
f1-score = 0.9902370990237099
unique ids: 233
final: 233


In [140]:
# Importances of the fitted random forest, in X_train column order:
# name_dist, add_dist, add_exist, long_dist, lat_dist, phone_exist, phone_match.
# NOTE(review): this cell's execution count (140) precedes the cell that fits
# `rf` (143), so the captured array may come from an earlier fit — re-run to confirm.
rf.feature_importances_

array([ 0.20161265,  0.13579246,  0.0051319 ,  0.27312919,  0.30643524,
        0.00613951,  0.07175906])

In [146]:
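# Uncomment to write the predicted test matches to the submission file
# (left disabled so that re-running the notebook does not overwrite an existing file):
#pd.DataFrame(matches_test,columns=['locu_id','foursquare_id']).to_csv('matches_test.csv',index=False)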