# Analysis Report
## Files
setC.csv = data obtained from the blocking stage  
sampleA.csv = 800 rows that are sample with (seed = 10) from setC

In [153]:
import csv

setC = None
with open('labelled.csv', 'r') as file:
    setA = list(csv.reader(file))
    setA = setA[1:] # Remove header
    setB = setA[350:] # Evaluation set
    setA = setA[:350] # Development set

class attr:
    label = 0
    _id = 1
    ltable_Id = 2
    rtable_Id = 3
    ltable_Title = 4
    ltable_Category = 5
    ltable_Duration = 6
    ltable_Rating = 7
    ltable_Rating_Count = 8
    ltable_Director = 9 
    rtable_Title = 10
    rtable_Category = 11
    rtable_Duration = 12
    rtable_Rating = 13
    rtable_Rating_Count = 14
    rtable_Director = 15
    strings = ['label', '_id', 'ltable_Id', 'rtable_Id', 'ltable_Title', 'ltable_Category', 
               'ltable_Duration', 'ltable_Rating', 'ltable_Rating_Count', 'ltable_Director', 
               'rtable_Title', 'rtable_Category', 'rtable_Duration', 'rtable_Rating', 'rtable_Rating_Count',
               'rtable_Director']

In [154]:
# Calculate number of null value for each attributes
def check_null(setx):
    num_null = [0 for i in range(16)]
    
    for row in setx:
        for pos, val in enumerate(row):
            if not val:
                num_null[pos] += 1
    
    for pos, val in enumerate(num_null):
        print(attr.strings[pos] + ": " + str(val))

In [155]:
# Define a function that scan the whole table and remove null value based on pos
def fill_null(setx, pos, val):
    for row in setx:
        if not row[pos]:
            row[pos] = val

In [156]:
# Values with null item, size of setA = 800
print("SetA")
check_null(setA)
fill_null(setA, attr.ltable_Rating, 0)
fill_null(setA, attr.rtable_Rating, 0)
print("SetB")
check_null(setB)
fill_null(setB, attr.ltable_Rating, 0)
fill_null(setB, attr.rtable_Rating, 0)

SetA
label: 0
_id: 0
ltable_Id: 0
rtable_Id: 0
ltable_Title: 0
ltable_Category: 5
ltable_Duration: 0
ltable_Rating: 39
ltable_Rating_Count: 39
ltable_Director: 107
rtable_Title: 0
rtable_Category: 18
rtable_Duration: 61
rtable_Rating: 93
rtable_Rating_Count: 0
rtable_Director: 44
SetB
label: 0
_id: 0
ltable_Id: 0
rtable_Id: 0
ltable_Title: 0
ltable_Category: 1
ltable_Duration: 0
ltable_Rating: 18
ltable_Rating_Count: 18
ltable_Director: 44
rtable_Title: 0
rtable_Category: 10
rtable_Duration: 25
rtable_Rating: 39
rtable_Rating_Count: 0
rtable_Director: 16


## Begin Matching
Start by converting each labelled row into a feature vector

In [157]:
from sklearn import tree, ensemble, linear_model, svm, naive_bayes
from sklearn.model_selection import KFold
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

delim_tkn = DelimiterTokenizer()
lev = Levenshtein()

In [158]:
def title_match(tit_x, tit_y):
#     return len(tit_x) - len(tit_y)
    return lev.get_raw_score(tit_x, tit_y)

def category_match(cat_x, cat_y):
#     return len(cat_x) - len(cat_y)
    return lev.get_raw_score(cat_x, cat_y)
    
def rating_match(rat_x, rat_y):
    return abs(float(rat_x) - float(rat_y))
    
def director_match(dir_x, dir_y):
#     return len(dir_x) - len(dir_y)
    return lev.get_raw_score(dir_x, dir_y)

def rating_count_match(rat_x, rat_y):
    return abs(float(rat_x) - float(rat_y))

In [159]:
def get_feature(setx):
    feature = []
    label = []
    
    for row in setx:
        label += [row[attr.label]]
        
        x_0 = title_match(row[attr.ltable_Title], row[attr.rtable_Title])
        x_1 = category_match(row[attr.ltable_Category], row[attr.rtable_Category])
        x_2 = rating_match(row[attr.ltable_Rating], row[attr.rtable_Rating])
        x_3 = director_match(row[attr.ltable_Director], row[attr.rtable_Director])
#         x_4 = rating_count_match(row[attr.ltable_Rating_Count], row[attr.rtable_Rating_Count])
        
        feature += [[x_0, x_1, x_2, x_3]]
        
    return feature, label
    

In [160]:
def get_ltable(setx):
    return [[row[attr.ltable_Id]] + row[attr.ltable_Title:attr.ltable_Director + 1]for row in setx]

def get_rtable(setx):
    return [[row[attr.rtable_Id]] + row[attr.rtable_Title:]for row in setx]

def get_label(setx):
    return [row[attr.label] for row in setx]

In [161]:
# Given a list of real result and predicted result, calculate precision, recall and F1
def get_F1(real, predicted):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    
    for pos,res in enumerate(predicted):
        if res == real[pos]:
            if res == '1':
                true_positive += 1
            else:
                true_negative += 1
        else:
            if res == '1':
                false_positive += 1
            else:
                false_negative += 1
                
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    F1 = 2 * precision * recall / (precision + recall)
    
    return precision, recall, F1

In [162]:
def debug(ltable, rtable, label, predicted):
    for pos,res in enumerate(predicted):
        if res != label[pos]:
            print("ltable: " + str(ltable[pos]))
            print("rtable: " + str(rtable[pos]))
            print("Label: " + str(label[pos]) + " Predicted: " + str(res))

In [163]:
# Decision Tree Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0
for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)
    
    t_clf = tree.DecisionTreeClassifier()
    t_clf = t_clf.fit(feature, label)

    result = t_clf.predict(feature_t)
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

eval_f, eval_t = get_feature(setB)
rest = clf.predict(eval_f)
p,r,f = get_F1(eval_t, rest)

print(p)
print(r)
print(f)


ROUND: 0
precision: 0.9230769230769231
recall: 1.0
F1: 0.9600000000000001
ROUND: 1
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 0.8387096774193549
recall: 0.9629629629629629
F1: 0.896551724137931
AVERAGE:
precision: 0.9404466501240696
recall: 0.9907407407407407
F1: 0.9641379310344828
1.0
0.7954545454545454
0.8860759493670886


In [164]:
# Logistic Regression Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = linear_model.LogisticRegression()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 0.9230769230769231
recall: 1.0
F1: 0.9600000000000001
ROUND: 1
precision: 0.9655172413793104
recall: 0.8484848484848485
F1: 0.9032258064516129
ROUND: 2
precision: 0.8888888888888888
recall: 0.96
F1: 0.923076923076923
ROUND: 3
precision: 1.0
recall: 0.8518518518518519
F1: 0.92
AVERAGE:
precision: 0.9443707633362806
recall: 0.9150841750841752
F1: 0.926575682382134


In [165]:
# Random Forest Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = ensemble.RandomForestClassifier()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 0.8888888888888888
recall: 1.0
F1: 0.9411764705882353
ROUND: 1
precision: 1.0
recall: 0.9393939393939394
F1: 0.96875
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 0.9259259259259259
recall: 0.9259259259259259
F1: 0.9259259259259259
AVERAGE:
precision: 0.9537037037037037
recall: 0.9663299663299664
F1: 0.9589630991285404


In [166]:
# Support Vector Machine Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = svm.SVC()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

ROUND: 0
precision: 1.0
recall: 0.6666666666666666
F1: 0.8
ROUND: 1
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 2
precision: 1.0
recall: 1.0
F1: 1.0
ROUND: 3
precision: 1.0
recall: 0.5555555555555556
F1: 0.7142857142857143
AVERAGE:
precision: 1.0
recall: 0.8055555555555556
F1: 0.8785714285714286


In [168]:
# Naive Bayesian Classifier using k-Fold = 4
split = 4
k_fold = KFold(n_splits=split)
total_precision = 0
total_recall = 0
total_F1 = 0
x = 0

for train, test in k_fold.split(setA):
    train = setA[train[0]:train[-1] + 1]
    test = setA[test[0]:test[-1] + 1]
    
    feature, label = get_feature(train)
    feature_t, label_t = get_feature(test)

    clf = naive_bayes.GaussianNB()
    clf = clf.fit(feature, label)

    result = clf.predict(feature_t)
    
    precision, recall, F1 = get_F1(label_t, result)
    
    total_precision += precision
    total_recall += recall
    total_F1 += F1
    
    print("ROUND: " + str(x))
    x += 1
    print("precision: " + str(precision))
    print("recall: " + str(recall))
    print("F1: " + str(F1))
#     debug(get_ltable(test), get_rtable(test), label_t, result)
    
print("AVERAGE:")
precision = total_precision/split
recall = total_recall/split
F1 = total_F1/split
print("precision: " + str(precision))
print("recall: " + str(recall))
print("F1: " + str(F1))

eval_f, eval_t = get_feature(setB)
rest = clf.predict(eval_f)
p,r,f = get_F1(eval_t, rest)

print(p)
print(r)
print(f)

ROUND: 0
precision: 0.8275862068965517
recall: 1.0
F1: 0.9056603773584906
ROUND: 1
precision: 0.9032258064516129
recall: 0.8484848484848485
F1: 0.875
ROUND: 2
precision: 0.8571428571428571
recall: 0.96
F1: 0.9056603773584904
ROUND: 3
precision: 1.0
recall: 0.8518518518518519
F1: 0.92
AVERAGE:
precision: 0.8969887176227554
recall: 0.9150841750841752
F1: 0.9015801886792452
0.8163265306122449
0.9090909090909091
0.8602150537634408
