# Analysis Report
## Files
setC.csv = data obtained from the blocking stage  
sampleA.csv = 800 rows that are sample with (seed = 10) from setC

In [3]:
import csv

setC = None
with open('labelled.csv', 'r') as file:
    setA = list(csv.reader(file))
    setA = setA[1:]

class attr:
    label = 0
    _id = 1
    ltable_Id = 2
    rtable_Id = 3
    ltable_Title = 4
    ltable_Category = 5
    ltable_Duration = 6
    ltable_Rating = 7
    ltable_Rating_Count = 8
    ltable_Director = 9 
    rtable_Title = 10
    rtable_Category = 11
    rtable_Duration = 12
    rtable_Rating = 13
    rtable_Rating_Count = 14
    rtable_Director = 15
    strings = ['label', '_id', 'ltable_Id', 'rtable_Id', 'ltable_Title', 'ltable_Category', 
               'ltable_Duration', 'ltable_Rating', 'ltable_Rating_Count', 'ltable_Director', 
               'rtable_Title', 'rtable_Category', 'rtable_Duration', 'rtable_Rating', 'rtable_Rating_Count',
               'rtable_Director']

In [4]:
# Calculate number of null value for each attributes
def check_null(setx):
    num_null = [0 for i in range(16)]
    
    for row in setx:
        for pos, val in enumerate(row):
            if not val:
                num_null[pos] += 1
    
    for pos, val in enumerate(num_null):
        print(attr.strings[pos] + ": " + str(val))

In [5]:
# Define a function that scan the whole table and remove null value based on pos
def fill_null(setx, pos, val):
    for row in setx:
        if not row[pos]:
            row[pos] = val

In [6]:
# Values with null item, size of setA = 800
check_null(setA)
fill_null(setA, attr.ltable_Rating, 0)
fill_null(setA, attr.rtable_Rating, 0)

label: 0
_id: 0
ltable_Id: 0
rtable_Id: 0
ltable_Title: 0
ltable_Category: 6
ltable_Duration: 0
ltable_Rating: 57
ltable_Rating_Count: 57
ltable_Director: 151
rtable_Title: 0
rtable_Category: 28
rtable_Duration: 86
rtable_Rating: 132
rtable_Rating_Count: 0
rtable_Director: 60


## Begin Matching
Start by converting each labelled row into a feature vector

In [7]:
from sklearn import tree, ensemble, linear_model, svm, naive_bayes
from sklearn.model_selection import KFold
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

delim_tkn = DelimiterTokenizer()
lev = Levenshtein()

In [8]:
def title_match(tit_x, tit_y):
    return lev.get_raw_score(tit_x, tit_y)

def category_match(cat_x, cat_y):
    return lev.get_raw_score(cat_x, cat_y)
    
def rating_match(rat_x, rat_y):
    return abs(float(rat_x) - float(rat_y))
    
def director_match(dir_x, dir_y):
    return lev.get_raw_score(dir_x, dir_y)

In [9]:
def get_feature(setx):
    feature = []
    label = []
    
    for row in setx:
        label += [row[attr.label]]
        
        x_0 = title_match(row[attr.ltable_Title], row[attr.rtable_Title])
        x_1 = category_match(row[attr.ltable_Category], row[attr.rtable_Category])
        x_2 = rating_match(row[attr.ltable_Rating], row[attr.rtable_Rating])
        x_3 = director_match(row[attr.ltable_Director], row[attr.rtable_Director])
        
        feature += [[x_0, x_1, x_2, x_3]]
        
    return feature, label
    

In [10]:
def get_ltable(setx):
    return [[row[attr.ltable_Id]] + row[attr.ltable_Title:attr.ltable_Director + 1]for row in setx]

def get_rtable(setx):
    return [[row[attr.rtable_Id]] + row[attr.rtable_Title:]for row in setx]

def get_label(setx):
    return [row[attr.label] for row in setx]

In [58]:
# Given a list of real result and predicted result, calculate precision, recall and F1
def get_F1(real, predicted):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    
    for pos,res in enumerate(predicted):
        if res == real[pos]:
            if res:
                true_positive += 1
            else:
                true_negative += 1
        else:
            if res == '1':
                false_positive += 1
            else:
                false_negative += 1
                
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    F1 = 2 * precision * recall / (precision + recall)
    
    return precision, recall, F1

In [28]:
def debug(ltable, rtable, label, predicted):
    for pos,res in enumerate(predicted):
        if res != label[pos]:
            print("ltable: " + str(ltable[pos]))
            print("rtable: " + str(rtable[pos]))
            print("Label: " + str(label[pos]) + " Predicted: " + str(res))

In [59]:
# Decision Tree Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0
for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    t_clf = tree.DecisionTreeClassifier()
    t_clf = t_clf.fit(feature, label)

    result = t_clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))


ROUND: 0
precision: 0.9512195121951219
recall: 0.9831932773109243
F1: 0.9669421487603305
ROUND: 1
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 0.9426229508196722
recall: 0.9829059829059829
F1: 0.9623430962343097
AVERAGE:
precision: 0.9734606157536985
recall: 0.9915248150542268
F1: 0.9823213112486601


In [60]:
# Logistic Regression Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = linear_model.LogisticRegression()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 0.975609756097561
recall: 0.9836065573770492
F1: 0.9795918367346939
ROUND: 1
precision: 0.9752066115702479
recall: 0.9672131147540983
F1: 0.9711934156378601
ROUND: 2
precision: 0.9916666666666667
recall: 0.9596774193548387
F1: 0.9754098360655739
ROUND: 3
precision: 0.9754098360655737
recall: 0.9834710743801653
F1: 0.9794238683127573
AVERAGE:
precision: 0.9794732176000123
recall: 0.9734920414665379
F1: 0.9764047391877213


In [61]:
# Random Forest Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = ensemble.RandomForestClassifier()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 0.983739837398374
recall: 0.983739837398374
F1: 0.983739837398374
ROUND: 1
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 0.9672131147540983
recall: 0.9833333333333333
F1: 0.9752066115702478
AVERAGE:
precision: 0.9877382380381181
recall: 0.9917682926829269
F1: 0.9897366122421555


In [62]:
# Support Vector Machine Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = svm.SVC()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 1.0
recall: 0.896
F1: 0.9451476793248946
ROUND: 1
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 1.0
recall: 0.9354838709677419
F1: 0.9666666666666666
AVERAGE:
precision: 1.0
recall: 0.9578709677419355
F1: 0.9779535864978903


In [63]:
# Naive Bayesian Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = naive_bayes.GaussianNB()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))
    

ROUND: 0
precision: 0.975609756097561
recall: 0.9836065573770492
F1: 0.9795918367346939
ROUND: 1
precision: 0.9752066115702479
recall: 0.9672131147540983
F1: 0.9711934156378601
ROUND: 2
precision: 0.9831932773109243
recall: 0.9512195121951219
F1: 0.9669421487603305
ROUND: 3
precision: 0.9262295081967213
recall: 0.9826086956521739
F1: 0.9535864978902953
AVERAGE:
precision: 0.9650597882938636
recall: 0.9711619699946108
F1: 0.9678284747557949
