# Analysis Report
## Files
setC.csv = data obtained from the blocking stage  
sampleA.csv = 800 rows that are sample with (seed = 10) from setC

In [2]:
import csv

setC = None
with open('labelled.csv', 'r') as file:
    setA = list(csv.reader(file))
    setA = setA[1:] # Remove header
    setB = setA[350:] # Evaluation set
    setA = setA[:350] # Development set

class attr:
    label = 0
    _id = 1
    ltable_Id = 2
    rtable_Id = 3
    ltable_Title = 4
    ltable_Category = 5
    ltable_Duration = 6
    ltable_Rating = 7
    ltable_Rating_Count = 8
    ltable_Director = 9 
    rtable_Title = 10
    rtable_Category = 11
    rtable_Duration = 12
    rtable_Rating = 13
    rtable_Rating_Count = 14
    rtable_Director = 15
    strings = ['label', '_id', 'ltable_Id', 'rtable_Id', 'ltable_Title', 'ltable_Category', 
               'ltable_Duration', 'ltable_Rating', 'ltable_Rating_Count', 'ltable_Director', 
               'rtable_Title', 'rtable_Category', 'rtable_Duration', 'rtable_Rating', 'rtable_Rating_Count',
               'rtable_Director']

In [3]:
# Calculate number of null value for each attributes
def check_null(setx):
    num_null = [0 for i in range(16)]
    
    for row in setx:
        for pos, val in enumerate(row):
            if not val:
                num_null[pos] += 1
    
    for pos, val in enumerate(num_null):
        print(attr.strings[pos] + ": " + str(val))

In [4]:
# Define a function that scan the whole table and remove null value based on pos
def fill_null(setx, pos, val):
    for row in setx:
        if not row[pos]:
            row[pos] = val

In [5]:
# Values with null item, size of setA = 800
# print("SetA")
# check_null(setA)
fill_null(setA, attr.ltable_Rating, 0)
fill_null(setA, attr.rtable_Rating, 0)
# print("SetB")
# check_null(setB)
fill_null(setB, attr.ltable_Rating, 0)
fill_null(setB, attr.rtable_Rating, 0)

## Begin Matching
Start by converting each labelled row into a feature vector

In [6]:
from sklearn import tree, ensemble, linear_model, svm, naive_bayes
from sklearn.model_selection import KFold
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

delim_tkn = DelimiterTokenizer()
lev = Levenshtein()

In [9]:
def title_match(tit_x, tit_y):
    return lev.get_raw_score(tit_x, tit_y)

def category_match(cat_x, cat_y):
    return lev.get_raw_score(cat_x, cat_y)
    
def rating_match(rat_x, rat_y):
    return abs(float(rat_x) - float(rat_y))
    
def director_match(dir_x, dir_y):
    return lev.get_raw_score(dir_x, dir_y)

def rating_count_match(rat_x, rat_y):
    return abs(float(rat_x) - float(rat_y))

In [10]:
def get_feature(setx):
    feature = []
    label = []
    
    for row in setx:
        label += [row[attr.label]]
        
        x_0 = title_match(row[attr.ltable_Title], row[attr.rtable_Title])
        x_1 = category_match(row[attr.ltable_Category], row[attr.rtable_Category])
        x_2 = rating_match(row[attr.ltable_Rating], row[attr.rtable_Rating])
        x_3 = director_match(row[attr.ltable_Director], row[attr.rtable_Director])
#         x_4 = rating_count_match(row[attr.ltable_Rating_Count], row[attr.rtable_Rating_Count])
        
        feature += [[x_0, x_1, x_2, x_3]]
        
    return feature, label
    

In [11]:
def get_ltable(setx):
    return [[row[attr.ltable_Id]] + row[attr.ltable_Title:attr.ltable_Director + 1]for row in setx]

def get_rtable(setx):
    return [[row[attr.rtable_Id]] + row[attr.rtable_Title:]for row in setx]

def get_label(setx):
    return [row[attr.label] for row in setx]

In [81]:
# Given a list of real result and predicted result, calculate precision, recall and F1
def get_F1(real, predicted):
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    
    for pos,res in enumerate(predicted):
        if res == real[pos]:
            if res == '1':
                true_positive += 1
            else:
                true_negative += 1
        else:
            if res == '1':
                false_positive += 1
            else:
                false_negative += 1
    
    # If true_positive, false_positive or false_negative causes zero error: set precision, recall and F1 to zero
    try:
        precision = true_positive / (true_positive + false_positive)
        recall = true_positive / (true_positive + false_negative)
        F1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return 0,0,0
    
    return precision, recall, F1

In [13]:
def debug(ltable, rtable, label, predicted):
    for pos,res in enumerate(predicted):
        if res != label[pos]:
            print("ltable: " + str(ltable[pos]))
            print("rtable: " + str(rtable[pos]))
            print("Label: " + str(label[pos]) + " Predicted: " + str(res))

In [45]:
# Test setx using the classifier = clf
def clf_test(setx, clf, test_name='TEST', verbose=False):
    feature, label = get_feature(setx)
    result = clf.predict(feature)
    precision, recall, F1 = get_F1(label, result)
    
    if verbose:
        print(test_name.upper())
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F1: " + str(F1))
    
    return precision, recall, F1

In [67]:
# Train and test on setx using the classifier = clf and k-fold validation = k
# Return the average (precision, recall, F1)
def clf_train(setx, clf, k, name='CLASSIFIER', verbose=False):
    # Decision Tree Classifier using k-Fold = 4
    split = 4
    k_fold = KFold(n_splits=split)
    total_precision = 0
    total_recall = 0
    total_F1 = 0
    
    for train, test in k_fold.split(setx):
        train = setx[train[0]:train[-1] + 1]
        test = setx[test[0]:test[-1] + 1]

        feature, label = get_feature(train)
        clf = clf.fit(feature, label)
        
        precision, recall, F1 = clf_test(test, clf)

        total_precision += precision
        total_recall += recall
        total_F1 += F1
        
    precision = total_precision/split
    recall = total_recall/split
    F1 = total_F1/split
    if verbose:
        print(name.upper())
        print("Precision: " + str(precision))
        print("Recall: " + str(recall))
        print("F1: " + str(F1))
    
    return precision, recall, F1

In [76]:
# Decision Tree Classifier using k-Fold = 4
clf = tree.DecisionTreeClassifier()
precision, recall, F1 = clf_train(setA, clf, 4, "decision tree classifier", verbose=True)
print()
precision_b, recall_b, F1_b = clf_test(setB, clf, "evaluation on decision tree classifier", verbose=True)

DECISION TREE CLASSIFIER
Precision: 0.931899641577061
Recall: 0.9907407407407407
F1: 0.9594320486815416

EVALUATION ON DECISION TREE CLASSIFIER
Precision: 0.7454545454545455
Recall: 0.9318181818181818
F1: 0.8282828282828283


In [75]:
# Logistic Regression Classifier using k-Fold = 4
clf = linear_model.LogisticRegression()
precision, recall, F1 = clf_train(setA, clf, 4, "logistic regression classifier", verbose=True)
print()
precision_b, recall_b, F1_b = clf_test(setB, clf, "evaluation on logistic regression classifier", verbose=True)

LOGISTIC REGRESSION CLASSIFIER
Precision: 0.9443707633362806
Recall: 0.9150841750841752
F1: 0.926575682382134

EVALUATION ON LOGISTIC REGRESSION CLASSIFIER
Precision: 0.9090909090909091
Recall: 0.9090909090909091
F1: 0.9090909090909091


In [77]:
# Random Forest Classifier using k-Fold = 4
clf = ensemble.RandomForestClassifier()
precision, recall, F1 = clf_train(setA, clf, 4, "random forest classifier", verbose=True)
print()
precision_b, recall_b, F1_b = clf_test(setB, clf, "evaluation on random forest classifier", verbose=True)

RANDOM FOREST CLASSIFIER
Precision: 0.98
Recall: 0.9803240740740741
F1: 0.979976896418945

EVALUATION ON RANDOM FOREST CLASSIFIER
Precision: 0.9130434782608695
Recall: 0.9545454545454546
F1: 0.9333333333333332


In [78]:
# Support Vector Machine Classifier using k-Fold = 4
clf = svm.SVC()
precision, recall, F1 = clf_train(setA, clf, 4, "support vector machine classifier", verbose=True)
print()
precision_b, recall_b, F1_b = clf_test(setB, clf, "evaluation on support vector machine classifier", verbose=True)

SUPPORT VECTOR MACHINE CLASSIFIER
Precision: 1.0
Recall: 0.8055555555555556
F1: 0.8785714285714286

EVALUATION ON SUPPORT VECTOR MACHINE CLASSIFIER
Precision: 1.0
Recall: 0.5681818181818182
F1: 0.7246376811594203


In [80]:
# Naive Bayes Classifier using k-Fold = 4
clf = naive_bayes.GaussianNB()
precision, recall, F1 = clf_train(setA, clf, 4, "naive bayes classifier", verbose=True)
print()
precision_b, recall_b, F1_b = clf_test(setB, clf, "evaluation on naive bayes classifier", verbose=True)

NAIVE BAYES CLASSIFIER
Precision: 0.8969887176227554
Recall: 0.9150841750841752
F1: 0.9015801886792452

EVALUATION ON NAIVE BAYES CLASSIFIER
Precision: 0.8163265306122449
Recall: 0.9090909090909091
F1: 0.8602150537634408
