In [None]:
import collections
import re
import string
import scipy
from scipy import sparse
import sklearn
import numpy as np

In [None]:
import random

# Fixed seed so the train/dev partition (and every metric derived from it)
# is identical on each Restart & Run All.
SPLIT_SEED = 42
# Fraction of the labelled data used for training; the rest becomes the dev set.
TRAIN_FRACTION = 0.9

with open('train_and_dev.tsv','r') as f:
    all_doc = f.readlines()

# Shuffle with a private, seeded RNG so the global `random` state is untouched.
random.Random(SPLIT_SEED).shuffle(all_doc)

# Compute the cut point once and slice both partitions from it.
split_at = int(len(all_doc)*TRAIN_FRACTION)
training_data = all_doc[:split_at]
dev_data = all_doc[split_at:]
print(len(all_doc),len(training_data),len(dev_data))

In [None]:
# Read the held-out test file into a list of raw lines (newline kept on each).
with open('test.tsv', 'r') as f_test:
    test_data = list(f_test)

In [None]:
def preprocess_data(data):
    """Tokenise raw TSV lines into documents, labels and a vocabulary.

    Each non-blank line must consist of a category, a tab, then the text.
    In the text every ASCII punctuation character is replaced by a space,
    the result is lower-cased and split on whitespace.

    Args:
        data: iterable of raw lines (surrounding whitespace is stripped).

    Returns:
        A tuple ``(documents, categories, vocab)``: ``documents`` is a list
        of token lists, ``categories`` the parallel list of category strings
        and ``vocab`` the set of every token seen.

    Raises:
        ValueError: if a non-blank line has no tab separator.
    """
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # parsed as an escaped "]" and the backslash itself is never removed.
    chars_to_remove = re.compile(f'[{re.escape(string.punctuation)}]')

    documents = []
    categories = []
    vocab = set()

    for line_no, line in enumerate(data, start=1):
        line = line.strip()
        if not line:
            # blank lines carry no document
            continue
        # Split on the FIRST tab only, so tabs inside the text are tolerated.
        category, separator, content = line.partition('\t')
        if not separator:
            raise ValueError(
                f"line {line_no}: expected a tab between category and text, got {line!r}"
            )

        words = chars_to_remove.sub(' ', content).lower().split()
        vocab.update(words)
        documents.append(words)
        categories.append(category)
    return documents, categories, vocab

In [None]:

# Tokenise each split with the shared preprocessing routine.
preprocessed_training_data, training_categories, train_vocab = preprocess_data(training_data)
preprocessed_dev_data, dev_categories, dev_vocab = preprocess_data(dev_data)
preprocessed_test_data, test_categories, test_vocab = preprocess_data(test_data)

# Sanity report: document counts, vocabulary sizes and distinct label counts.
train_summary = (len(preprocessed_training_data), len(train_vocab))
dev_summary = (len(preprocessed_dev_data), len(dev_vocab))
label_summary = (len(set(training_categories)), len(set(dev_categories)))

print("Training data has %d documents and vocab size of %d" % train_summary)
print("Dev data has %d documents and vocab size of %d" % dev_summary)
print("There are %d categories in the training data and %d in the dev" % label_summary)

In [None]:
# Map every training-vocabulary word to a column index. Sorting first makes the
# ids stable across kernel restarts; iterating the raw set of strings is not.
word2id = {word: word_id for word_id, word in enumerate(sorted(train_vocab))}

# Same idea for the category labels that occur in the training split.
cat2id = {cat: cat_id for cat_id, cat in enumerate(sorted(set(training_categories)))}

# Spot checks. .get() prints None instead of raising if 'son' is not in the vocab.
print("The word id for son is", word2id.get('son'))
print("The category id for OT is", cat2id['OT'])

In [None]:
#Convert data to bag-of-words format
def convert_to_bow_matrix(preprocessed_data, word2id):
    """Build a sparse document-by-token count matrix.

    Args:
        preprocessed_data: sequence of documents, each a list of tokens.
        word2id: dict mapping known tokens to column indices (assumed to be
            in ``range(len(word2id))``, since ``len(word2id)`` is used as the
            out-of-vocabulary column).

    Returns:
        A ``scipy.sparse.dok_matrix`` of shape
        ``(len(preprocessed_data), len(word2id) + 1)`` indexed by
        ``[doc_id, token_id]``; each entry is the token's count in that
        document. The last column accumulates every token missing from
        ``word2id``.
    """
    matrix_size = (len(preprocessed_data), len(word2id) + 1)
    oov_index = len(word2id)
    X = scipy.sparse.dok_matrix(matrix_size)

    for doc_id, doc in enumerate(preprocessed_data):
        # Count per column in plain Python first, then write each non-zero
        # cell once; far cheaper than a sparse read-modify-write per token.
        column_counts = collections.Counter(
            word2id.get(word, oov_index) for word in doc
        )
        for token_id, count in column_counts.items():
            X[doc_id, token_id] = count
    return X

In [None]:
%%time
# Build the training bag-of-words matrix; the %%time cell magic above reports
# how long this conversion takes.
X_train = convert_to_bow_matrix(preprocessed_training_data, word2id)

In [None]:
# Peek at the first three rows of the sparse training matrix.
first_docs = X_train[:3]
print("First 3 documents are:", first_docs)

In [None]:
# Encode the training labels as the integer class ids the classifier predicts.
y_train = []
for category in training_categories:
    y_train.append(cat2id[category])
# Spot-check the first three encoded labels.
print(y_train[:3])

In [None]:
# Baseline classifier: a support vector machine on the bag-of-words counts.
from sklearn import svm

# Hyper-parameters of the baseline run.
model = svm.SVC(C=1000, gamma='scale')
# Fit on the training matrix and its encoded labels.
model.fit(X_train,y_train)

In [None]:
# Evaluate on the training data: how well did we fit the data we trained on?
y_train_predictions = model.predict(X_train)

#compute any metrics we care about. Let's do accuracy
def compute_accuracy(predictions, true_values):
    """Return the fraction of predictions that equal their true label.

    Args:
        predictions: sequence of predicted class ids.
        true_values: sequence of gold class ids, same length as predictions.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when there are no predictions.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    num_total = len(predictions)
    if num_total != len(true_values):
        # zip() would silently drop the surplus items and skew the score.
        raise ValueError(
            "predictions and true_values differ in length: %d vs %d"
            % (num_total, len(true_values))
        )
    if num_total == 0:
        # nothing to score; avoid dividing by zero
        return 0.0
    num_correct = sum(
        1 for predicted, true in zip(predictions, true_values) if predicted == true
    )
    return num_correct / num_total

# Score the training-set predictions against the training labels.
accuracy = compute_accuracy(predictions=y_train_predictions, true_values=y_train)
print("Accuracy:", accuracy)

In [None]:
# Prepare the dev data in the same way as the training data: same word ids,
# same category ids.
X_dev = convert_to_bow_matrix(preprocessed_data=preprocessed_dev_data, word2id=word2id)
y_dev = [cat2id[label] for label in dev_categories]

In [None]:
# Prepare the test data in the same way as the training data: same word ids,
# same category ids.
X_test = convert_to_bow_matrix(preprocessed_data=preprocessed_test_data, word2id=word2id)
y_test = [cat2id[label] for label in test_categories]

In [None]:
# Score the model on the dev split.
y_dev_predictions = model.predict(X_dev)
accuracy = compute_accuracy(y_dev_predictions, y_dev)
print("Accuracy:", accuracy)

# Error analysis: print the class-id legend, then EVERY misclassified dev
# document with its predicted and true id (the task asks for three of them;
# pick from this list).
print("Quran=%d,OT=%d,NT=%d" %(cat2id['Quran'], cat2id['OT'], cat2id['NT']))
for doc_index, (predicted_id, true_id) in enumerate(zip(y_dev_predictions, y_dev)):
    if predicted_id == true_id:
        continue
    print("pred=%d,true=%d" %(predicted_id, true_id))
    print(preprocessed_dev_data[doc_index])

In [None]:
# Score the model on the test split.
y_test_predictions = model.predict(X_test)
accuracy = compute_accuracy(predictions=y_test_predictions, true_values=y_test)
print("Accuracy:", accuracy)

In [None]:
# Constant baseline: predict 'OT' for every test document and score that.
ot_id = cat2id['OT']
baseline_predictions = [ot_id for _ in y_test]
baseline_accuracy = compute_accuracy(predictions=baseline_predictions, true_values=y_test)
print("Accuracy:", baseline_accuracy)

In [None]:
# Disabled experiment: the RandomForest alternative below is wrapped in a string
# literal, so none of it runs.
# NOTE(review): as the only expression in the cell, the notebook will likely echo
# this string as the cell's output; consider deleting the cell or re-enabling it.
'''try diefferent model
from sklearn import ensemble
model = sklearn.ensemble.RandomForestClassifier()
model.fit(X_train, y_train)

y_train_predictions = model.predict(X_train)
print("Train accuracy was:", compute_accuracy(y_train_predictions,y_train))
y_dev_predictions = model.predict(X_dev)
print("dev accuracy was:", compute_accuracy(y_dev_predictions, y_dev))'''

In [None]:
#one-vs-rest precision / recall / F1 for a single class (any key of cat2id)
def prf1_calculation(class_name, cat2id, predictions, true_values):
    """Compute precision, recall and F1 for one class against all others.

    Args:
        class_name: key of ``cat2id`` naming the class to score.
        cat2id: dict mapping class names to the ids used in the label lists.
        predictions: sequence of predicted class ids.
        true_values: sequence of gold class ids, paired with ``predictions``.

    Returns:
        A tuple ``(precision, recall, f1)`` of floats. A ratio whose
        denominator is zero (class never predicted, class never present, or
        no true positives at all) is reported as 0.0 instead of raising.

    Raises:
        KeyError: if ``class_name`` is not in ``cat2id``.
    """
    class_id = cat2id[class_name]
    TP = FP = FN = 0
    for predicted, true in zip(predictions, true_values):
        predicted_hit = predicted == class_id
        true_hit = true == class_id
        if predicted_hit and true_hit:
            TP += 1
        elif predicted_hit:
            FP += 1
        elif true_hit:
            FN += 1
        # true negatives are not needed for precision, recall or F1

    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    if precision + recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1

def print_line(cat2id, predictions, true_values):
    """Assemble one row of evaluation figures (nothing is printed here).

    Returns:
        A list of 12 floats: precision, recall, F1 for 'Quran', then for
        'OT', then for 'NT', followed by the macro-averaged precision,
        recall and F1 over those three classes.
    """
    per_class = [
        prf1_calculation(class_name, cat2id, predictions, true_values)
        for class_name in ('Quran', 'OT', 'NT')
    ]
    # Flatten the three (P, R, F1) triples in class order.
    row = [score for scores in per_class for score in scores]
    # zip(*per_class) regroups by metric: all precisions, all recalls, all F1s.
    for metric_values in zip(*per_class):
        row.append(sum(metric_values) / 3)
    return row

# Emit one CSV row per split: system name, split name, then the 12 figures from
# print_line (P/R/F1 for Quran, OT, NT, then macro P/R/F1) to three decimals.
# One loop replaces three copy-pasted ones and no longer rebinds the builtin `id`.
for split_name, split_predictions, split_truth in (
    ('train', y_train_predictions, y_train),
    ('dev', y_dev_predictions, y_dev),
    ('test', y_test_predictions, y_test),
):
    line = print_line(cat2id, split_predictions, split_truth)
    print("baseline,%s," % split_name + ",".join("%.3f" % single for single in line))

In [None]:
#calculate for OT -- already covered: print_line returns the OT precision/recall/F1 as values 4-6 of each row


In [None]:

#calculate for NT -- already covered: print_line returns the NT precision/recall/F1 as values 7-9 of each row