In [1]:
#import all the libraries and data files

#header
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import csv
from collections import defaultdict

#classifiers
from sklearn.neighbors import KNeighborsClassifier     #KNN
from sklearn.svm import SVC, LinearSVC     #svm
from sklearn.tree import DecisionTreeClassifier     #decisionTree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
#score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, precision_recall_fscore_support

#Visualizing DecisionTree
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

  from numpy.core.umath_tests import inner1d


In [2]:
# Dataset configuration.
# Every dataset lives in its own sub-folder of `directory`; the paired feature
# name is the file (or file prefix) read from inside that folder.
directory = "./datasets"

# Datasets that already come with separate "<feature>_train" / "<feature>_test" files.
_SEPARATE_SPLIT_DATASETS = [
    ("Abt-Buy", "features_Abt_Buy"),
    ("Amazon-GoogleProducts", "features_Amazon_GoogleProducts"),
    ("DBLP-ACM", "features_DBLP2_ACM"),
    ("DBLP-Scholar", "features_DBLP1_Scholar"),
]

# Datasets stored as one feature file each.
_SINGLE_FILE_DATASETS = [
    ("Anime", "features_my_anime_list_anime_planet"),
    ("Baby_products", "features_babies_r_us_buy_buy_baby"),
    ("Beer", "features_beer_advocate_rate_beer"),
    ("Bikes", "features_bikedekho_bikewale"),
    ("Books1", "features_amazon_barnes_and_noble"),
    ("Books2", "features_goodreads_barnes_and_noble"),
    ("Books3", "features_barnes_and_noble_half"),
    ("Books4", "features_amazon_barnes_and_noble"),
    ("Books5", "features_amazon_barnes_and_noble"),
    ("Citations", "features_google_scholar_dblp"),
    ("Ebooks1", "features_itunes_ebooks"),
    ("Ebooks2", "features_itunes_ebooks"),
    ("Electronics", "features_amazon_best_buy"),
    ("Movies1", "features_rotten_tomatoes_imdb"),
    ("Movies3", "features_imdb_rotten_tomatoes"),
    ("Movies5", "features_roger_ebert_imdb"),
    ("Music", "features_itunes_amazon_music"),
    ("Restaurants1", "features_zomato_yelp"),
    ("Restaurants2", "features_zomato_yelp"),
    ("Restaurants3", "features_yelp_yellow_pages"),
    ("Restaurants4", "features_yellow_pages_yelp"),
]

# Parallel lists consumed by the evaluation cells (iterated together with zip).
SepFile = [folder for folder, _ in _SEPARATE_SPLIT_DATASETS]
SepFeatures = [feature_file for _, feature_file in _SEPARATE_SPLIT_DATASETS]

SingleFile = [folder for folder, _ in _SINGLE_FILE_DATASETS]
Features = [feature_file for _, feature_file in _SINGLE_FILE_DATASETS]

In [8]:
# The 7 classifiers to benchmark, keyed by the name written into the CSV header.
# Fix: the entry after "SVM" was missing its trailing comma, which made the
# whole cell a SyntaxError.
# Every estimator that accepts a seed gets random_state=1 so repeated runs give
# the same scores; LinearSVC previously had none.
classifiers = {
    "KNN": KNeighborsClassifier(2),
    "SVM": LinearSVC(random_state=1),
    "DecisionTree": DecisionTreeClassifier(random_state=1),
    "RandomForest": RandomForestClassifier(random_state=1),
    "NaiveBayes": GaussianNB(),
    "LogisticRegression": LogisticRegression(random_state=1, solver='liblinear'),
    # LinearRegression is deliberately left out of the benchmark.
    #"LinearRegression()" : LinearRegression(),
    "XGB": XGBClassifier(objective="binary:logistic", random_state=1)
}

In [9]:
# Datasets that are already split into train and test files.
# For each dataset, fit every classifier on the train file and append its
# precision / recall / F1 on the test file to prf_scores.csv.
# NOTE: the file is opened in append mode, so re-running this cell adds another
# header and another set of rows to the existing file.
with open('prf_scores.csv', mode='a') as f:
    # Header row 1: each classifier name spans three columns.
    f.write("classifier,")
    for name in classifiers:
        f.write("%s,,," % name)
    f.write("\n")
    # Header row 2: the three metric names, repeated once per classifier.
    f.write(",")
    f.write("precision,recall,f1," * len(classifiers))
    f.write("\n")

    # Columns that are identifiers or the target, i.e. not features.
    non_feature_columns = ['source_id', 'target_id', 'pair_id', 'label']

    for rep, feature in zip(SepFile, SepFeatures):
        f.write("%s," % rep)
        train = pd.read_csv("%s/%s/%s_train" % (directory, rep, feature))
        test = pd.read_csv("%s/%s/%s_test" % (directory, rep, feature))

        # Separate the label (matching result) from the feature columns.
        y = train['label']
        X = train.drop(non_feature_columns, axis=1)
        test_true = test['label']
        X_test = test.drop(non_feature_columns, axis=1)

        # Optional: render the fitted decision tree for this dataset.
        # clf = DecisionTreeClassifier(random_state=1)
        # clf.fit(X, y)
        # dot_data = StringIO()
        # export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=X.columns, class_names=['0', '1'])
        # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_png('%s_DecisionTree.png' % rep )
        # Image(graph.create_png())

        for clf in classifiers.values():
            clf.fit(X, y)
            # Fix: the signature is (y_true, y_pred). The arguments used to be
            # passed the other way round, which swaps precision and recall.
            score = precision_recall_fscore_support(test_true, clf.predict(X_test), average='binary')
            # score = (precision, recall, f1, support); only the first three are written.
            for i in range(0, 3):
                f.write("%s," % score[i].round(3))
        f.write("\n")
   

  if diff:
  if diff:
  if diff:
  if diff:


In [10]:
# 5-fold cross-validation on the single-file datasets.
# Appends one row per dataset to prf_scores.csv, holding the mean precision,
# recall and F1 (rounded to 3 decimals) of every classifier.

# Local import so this cell does not depend on an edit to the import cell.
from sklearn.model_selection import cross_validate

with open('prf_scores.csv', mode='a') as f:
    # Columns that are identifiers or the target, i.e. not features.
    non_feature_columns = ['source_id', 'target_id', 'pair_id', 'label']
    metrics = ('precision', 'recall', 'f1')

    for rep, feature in zip(SingleFile, Features):
        f.write("%s," % rep)
        data = pd.read_csv("%s/%s/%s" % (directory, rep, feature))

        # Separate the label (matching result) from the feature columns.
        y = data['label']
        data = data.drop(non_feature_columns, axis=1)

        # Optional: render the fitted decision tree for this dataset.
        # clf = DecisionTreeClassifier(random_state=1)
        # clf.fit(data, y)
        # dot_data = StringIO()
        # export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=data.columns, class_names=['0', '1'])
        # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_png('%s_DecisionTree.png' % rep )
        # Image(graph.create_png())

        for clf in classifiers.values():
            # One cross_validate call scores all three metrics from a single fit
            # per fold. The previous code called cross_val_score once per metric
            # and so refitted every model three times.
            scores = cross_validate(clf, data, y, cv=5, scoring=metrics,
                                    return_train_score=False)
            for metric in metrics:
                f.write("%s," % scores['test_%s' % metric].mean().round(3))
        f.write("\n")


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [12]:
from collections import Counter
# Random 80/20 train/test split on the single-file datasets.
# For each dataset, fit every classifier on the training part and append its
# precision / recall / F1 on the held-out part to prf_scores.csv.
with open('prf_scores.csv', mode='a') as f:
    # Columns that are identifiers or the target, i.e. not features.
    non_feature_columns = ['source_id', 'target_id', 'pair_id', 'label']

    for rep, feature in zip(SingleFile, Features):
        f.write("%s," % rep)
        data = pd.read_csv("%s/%s/%s" % (directory, rep, feature))
        train, test = train_test_split(data, test_size=0.2, random_state=42)  # split 4:1

        # Separate the label (matching result) from the feature columns.
        # Fix: labels are converted to 1.0 / 0.0 with astype(float). The old
        # boolean-mask assignment (y[y] = 1.0; y[y==False] = 0.0) mutated a slice
        # of the split frame and only worked because the chained-assignment
        # warning is disabled.
        y = train['label'].astype(float)
        X = train.drop(non_feature_columns, axis=1)
        test_true = test['label'].astype(float)
        X_test = test.drop(non_feature_columns, axis=1)

        # Optional: render the fitted decision tree for this dataset.
        # clf = DecisionTreeClassifier(random_state=1)
        # clf.fit(X, y)
        # dot_data = StringIO()
        # export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True, feature_names=X.columns, class_names=['0', '1'])
        # graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_png('%s_DecisionTree.png' % rep )
        # Image(graph.create_png())

        for clf in classifiers.values():
            clf.fit(X, y)
            # Fix: the signature is (y_true, y_pred). The arguments used to be
            # passed the other way round, which swaps precision and recall.
            score = precision_recall_fscore_support(test_true, clf.predict(X_test), average='binary')
            # score = (precision, recall, f1, support); only the first three are written.
            for i in range(0, 3):
                f.write("%s," % score[i].round(3))
        f.write("\n")
   

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  'recall', 'true', average, warn_for)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
