# Analysis Report
## Files
setC.csv = data obtained from the blocking stage  
sampleA.csv = 800 rows sampled (seed = 10) from setC

In [148]:
import csv

# setC.csv is not loaded in this cell; the name is only bound to None.
setC = None
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields that contain line breaks are parsed correctly.
with open('sampleA.csv', 'r', newline='') as file:
    # setA holds every row of the file as a list of strings.
    setA = list(csv.reader(file))

class attr:
    """Column positions used to index a row of the table.

    Every integer attribute is the position of one column inside a row, and
    ``strings`` lists the column names in that same order, so
    ``attr.strings[attr.ltable_Title]`` is ``'ltable_Title'``.
    """

    # Columns describing the pair itself.
    label = 0
    _id = 1
    ltable_Id = 2
    rtable_Id = 3

    # Columns taken from the left table.
    ltable_Title = 4
    ltable_Category = 5
    ltable_Duration = 6
    ltable_Rating = 7
    ltable_Rating_Count = 8
    ltable_Director = 9

    # Columns taken from the right table.
    rtable_Title = 10
    rtable_Category = 11
    rtable_Duration = 12
    rtable_Rating = 13
    rtable_Rating_Count = 14
    rtable_Director = 15

    # Column names, ordered by the positions defined above.
    strings = [
        'label',
        '_id',
        'ltable_Id',
        'rtable_Id',
        'ltable_Title',
        'ltable_Category',
        'ltable_Duration',
        'ltable_Rating',
        'ltable_Rating_Count',
        'ltable_Director',
        'rtable_Title',
        'rtable_Category',
        'rtable_Duration',
        'rtable_Rating',
        'rtable_Rating_Count',
        'rtable_Director',
    ]

In [149]:
# Count the number of null values in each attribute
def check_null(setx):
    """Print the number of null (falsy) values found in each column.

    One line is printed per column in the form ``<column name>: <count>``,
    using the names in ``attr.strings``.

    Args:
        setx: Iterable of rows, each an ordered sequence of column values
            laid out in the same order as ``attr.strings``.

    Note:
        A value is treated as null when it is falsy, so the empty string,
        ``None`` and a numeric ``0`` are all counted.
    """
    columns = attr.strings
    # Size the counters from the column list instead of a hard-coded 16.
    num_null = [0] * len(columns)

    for row in setx:
        # zip stops at the shorter input, so a row carrying extra trailing
        # cells cannot index past the end of the counters.
        for pos, val in zip(range(len(num_null)), row):
            if not val:
                num_null[pos] += 1

    for name, count in zip(columns, num_null):
        print(name + ": " + str(count))

In [150]:
# Define a function that scans the whole table and fills null values at column pos with val
def fill_null(setx, pos, val):
    """Overwrite, in place, every falsy entry at index ``pos`` with ``val``.

    Args:
        setx: Iterable of mutable rows (e.g. lists).
        pos: Index of the column to fill.
        val: Replacement stored wherever the current entry is falsy.

    Returns:
        None; the rows of ``setx`` are modified directly.
    """
    # Lazy filter: each row is tested and then patched before the next one
    # is looked at.
    needs_value = (record for record in setx if not record[pos])
    for record in needs_value:
        record[pos] = val

In [152]:
# Print the per-column null counts for setA (size of setA = 800), then
# replace the missing ratings on both sides with 0.
# NOTE(review): the fill value is the int 0 while the other cells are the
# strings produced by csv.reader; rating_match converts both with float().
# NOTE(review): ltable_Rating_Count also reports nulls in the output below
# but is not filled here — confirm that is intended.
check_null(setA)
fill_null(setA, attr.ltable_Rating, 0)
fill_null(setA, attr.rtable_Rating, 0)

label: 0
_id: 0
ltable_Id: 0
rtable_Id: 0
ltable_Title: 0
ltable_Category: 12
ltable_Duration: 0
ltable_Rating: 93
ltable_Rating_Count: 93
ltable_Director: 248
rtable_Title: 0
rtable_Category: 38
rtable_Duration: 131
rtable_Rating: 201
rtable_Rating_Count: 0
rtable_Director: 97


## Begin Matching
Start by converting each labelled row into a feature vector

In [153]:
from sklearn import tree
from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
from py_stringmatching.similarity_measure.levenshtein import Levenshtein

# Notebook-level instances shared by the cells below.
# NOTE(review): delim_tkn is not referenced in the code visible here —
# confirm whether it is still needed.
delim_tkn = DelimiterTokenizer()
# lev is the measure called by title_match, category_match and director_match.
lev = Levenshtein()

In [154]:
def title_match(tit_x, tit_y):
    """Return the raw score from ``lev`` for the two title values."""
    title_score = lev.get_raw_score(tit_x, tit_y)
    return title_score

def category_match(cat_x, cat_y):
    """Return the raw score from ``lev`` for the two category values."""
    category_score = lev.get_raw_score(cat_x, cat_y)
    return category_score
    
def rating_match(rat_x, rat_y):
    """Return the absolute difference between two ratings.

    Both arguments are converted with ``float`` first, so numeric strings
    and plain numbers are accepted alike.
    """
    left_rating = float(rat_x)
    right_rating = float(rat_y)
    return abs(left_rating - right_rating)
    
def director_match(dir_x, dir_y):
    """Return the raw score from ``lev`` for the two director values."""
    director_score = lev.get_raw_score(dir_x, dir_y)
    return director_score

In [170]:
def get_feature(setx):
    """Turn rows into feature vectors plus their labels.

    Args:
        setx: Iterable of rows indexed by the positions defined in ``attr``.

    Returns:
        A ``(feature, label)`` tuple of two lists with one entry per row.
        Each feature entry is ``[title, category, rating, director]`` as
        computed by the matching ``*_match`` function on the left/right
        values of that row; each label entry is ``row[attr.label]``.
    """
    vectors = []
    labels = []

    for record in setx:
        labels.append(record[attr.label])

        # Scores are computed in the same order they appear in the vector.
        vectors.append([
            title_match(record[attr.ltable_Title], record[attr.rtable_Title]),
            category_match(record[attr.ltable_Category], record[attr.rtable_Category]),
            rating_match(record[attr.ltable_Rating], record[attr.rtable_Rating]),
            director_match(record[attr.ltable_Director], record[attr.rtable_Director]),
        ])

    return vectors, labels
    

In [181]:
def get_ltable(setx):
    """Return, per row, the left-table id followed by the left-table columns.

    The columns are those from ``attr.ltable_Title`` through
    ``attr.ltable_Director`` inclusive.
    """
    first = attr.ltable_Title
    stop = attr.ltable_Director + 1

    left_rows = []
    for row in setx:
        left_rows.append([row[attr.ltable_Id]] + row[first:stop])
    return left_rows

def get_rtable(setx):
    """Return, per row, the right-table id followed by the right-table columns.

    The columns are those from ``attr.rtable_Title`` through
    ``attr.rtable_Director`` inclusive.

    Args:
        setx: Iterable of list rows indexed by the positions in ``attr``.

    Returns:
        A list with one ``[id, title, ..., director]`` list per row.
    """
    first = attr.rtable_Title
    # Bound the slice explicitly, as get_ltable does, so any cells that
    # follow rtable_Director in a row are not swept into the result.
    stop = attr.rtable_Director + 1
    return [[row[attr.rtable_Id]] + row[first:stop] for row in setx]

def get_label(setx):
    """Return the value at ``attr.label`` of every row, in row order."""
    labels = []
    for row in setx:
        labels.append(row[attr.label])
    return labels

In [171]:
# Decision Tree Classifier
# Build features and labels from every row of setA except the first
# (presumably the CSV header row — confirm), then fit the tree on them.
feature, label = get_feature(setA[1:])

clf = tree.DecisionTreeClassifier()
clf = clf.fit(feature, label)

In [185]:
# Test the classifier
# NOTE(review): setA[1:11] is a subset of setA[1:], the rows the classifier
# was fitted on, so this is a sanity check rather than a held-out evaluation.
setT = setA[1:11]
# y (the labels returned by get_feature) is not used below; the labels are
# read again through get_label.
x, y = get_feature(setT)

result = clf.predict(x)
ltable = get_ltable(setT)
rtable = get_rtable(setT)
# NOTE: this rebinds the notebook-level name `label`, which previously held
# the labels used for fitting.
label = get_label(setT)

# Print each pair followed by its predicted result and its stored label.
for pos,res in enumerate(result):
    print("ltable: " + str(ltable[pos]))
    print("rtable: " + str(rtable[pos]))
    print("Result: " + str(res) + " Label: " + str(label[pos]))

ltable: ['13312', 'Day of the Dove', 'Action,Adventure,Mystery', '51', '80.0', '1143.0', 'Marvin J. Chomsky']
rtable: ['482', 'Enemy of the State', 'Action,Adventure,Mystery', '150.0', '70.0', '372512', 'Tony Scott,']
Result: -1 Label: -1
ltable: ['512', 'Grumpy Old Men', 'Comedy,Drama,Romance', '103', '69.0', '34973.0', 'Donald Petrie']
rtable: ['7988', 'Grumpy Old Men', 'Comedy,Drama', '103.0', '64.0', '154021', 'Donald Petrie,']
Result: -1 Label: -1
ltable: ['9308', 'American Gladiators', 'Action,Television,Sport', '60', '66.0', '1516.0', '']
rtable: ['8970', 'American Splendor', 'Art House,International,Comedy,Drama', '100.0', '74.0', '44354', 'Robert Pulcini,Shari Springer Berman,']
Result: -1 Label: -1
ltable: ['10728', 'Iceman', 'Drama,Sci-Fi', '100', '61.0', '4148.0', 'Fred Schepisi']
rtable: ['5738', 'Iceman', 'Drama,Science Fiction,Fantasy', '101.0', '60.0', '3130', 'Fred Schepisi,']
Result: -1 Label: -1
ltable: ['13447', 'Tom & Viv', 'Biography,Drama,Romance', '115', '65.0',