In [1]:
import numpy as np
from collections import defaultdict

In [2]:
def get_data(data_path):
    """Read a '<fff>'-delimited feature file into sparse dicts and labels.

    Every non-blank line is split on '<fff>'. The first field is parsed as
    the label and the last field as whitespace-separated ``index:value``
    pairs; any fields in between are ignored.

    Parameters
    ----------
    data_path : str
        Path of the text file to read.

    Returns
    -------
    X : list of defaultdict
        One mapping per non-blank line, from int feature index to float
        value. Looking up a missing index yields 0.
    Y : list of float
        The label of each non-blank line, in file order.

    Raises
    ------
    ValueError
        If a label, an index or a value is not numeric.
    IndexError
        If a pair in the feature field contains no ':'.
    """
    with open(data_path, 'r') as f:
        lines = f.read().splitlines()

    X = []
    Y = []
    for line in lines:    # one doc
        # A blank line (typically a trailing empty line at the end of the
        # file) holds no document; skip it rather than fail on float('').
        if not line.strip():
            continue

        fields = line.split('<fff>')    # split once, reuse for label and features
        single_dict = defaultdict(int)  # missing feature index -> 0
        for pair in fields[-1].split():
            parts = pair.split(':')
            single_dict[int(parts[0])] = float(parts[1])

        X.append(single_dict)
        Y.append(float(fields[0]))
    return X, Y

In [3]:
# Parse the training file into per-document feature dicts and labels.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
TRAIN_PATH = "D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs.txt"
X_train, Y_train = get_data(TRAIN_PATH)

In [4]:
# Parse the test file into per-document feature dicts and labels.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
TEST_PATH = "D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs_test.txt"
X_test, Y_test = get_data(TEST_PATH)

In [5]:
import scipy.sparse as sp

# incrementally create doc sparse matrix for X_train => convert it to csr sparse matrix
def convert_to_csr(X_dict, num_of_features):
    """Pack per-document feature dicts into a single CSR sparse matrix.

    Parameters
    ----------
    X_dict : sequence of dict
        Item ``i`` maps a column index to the value stored in row ``i``.
    num_of_features : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(X_dict), num_of_features)`` with float64
        entries.
    """
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype and works on every version.
    X_mat = sp.dok_matrix((len(X_dict), num_of_features), dtype=np.float64)

    # DOK supports cheap element-wise assignment; convert to CSR once filled.
    for row, features in enumerate(X_dict):
        for col, value in features.items():
            X_mat[row, col] = value

    return X_mat.tocsr()

In [6]:
# Both splits are built with the same column count so that a model fitted
# on the training matrix can be applied to the test matrix.
NUM_FEATURES = 10272  # presumably the vocabulary size of the tf-idf files - TODO confirm
X_train_mat = convert_to_csr(X_train, NUM_FEATURES)
X_test_mat = convert_to_csr(X_test, NUM_FEATURES)

In [9]:
import time

def compute_accuracy(prediction, ground_truth):
    """Return the fraction of entries where prediction equals ground_truth.

    The two inputs are compared element-wise and the number of matches is
    divided by ``len(ground_truth)``.
    """
    hit_mask = np.equal(prediction, ground_truth)
    n_correct = hit_mask.astype(float).sum()
    return n_correct / len(ground_truth)

In [12]:
from sklearn.svm import LinearSVC


# Sweep the regularisation strength C of a linear SVM and keep the best one.
# NOTE(review): every candidate is scored on the test split, so the best
# accuracy printed below is an optimistic estimate; a validation split or
# cross-validation would give an unbiased choice of C.
C_range_linear = [0.001, 0.01, 0.1, 1, 10, 100]
acc_max_linear = 0
cb_linear = 0            # C of the best model seen so far
total_time_linear = 0    # fit + predict seconds of the best model (not a sum over runs)

for c in C_range_linear:
    print('C = ' + str(c))
    # Fixed seed: the solver shuffles the data internally, so without it
    # repeated runs can report slightly different accuracies.
    classifier = LinearSVC(C=c, random_state=0)
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    time_start = time.perf_counter()
    classifier.fit(X_train_mat, Y_train)
    prediction = classifier.predict(X_test_mat)
    time_end = time.perf_counter()
    accuracy = compute_accuracy(prediction, Y_test)

    if accuracy > acc_max_linear:
        acc_max_linear = accuracy
        cb_linear = c
        print('Better, acc: ' + str(accuracy))
        total_time_linear = time_end - time_start

print('-------')
print(cb_linear)
print(acc_max_linear)
print('time:' + str(total_time_linear))

C = 0.001
Better, acc: 0.7600902814657462
C = 0.01
Better, acc: 0.7980616038236856
C = 0.1
Better, acc: 0.8291290493892725
C = 1
Better, acc: 0.8312533191715348
C = 10
C = 100
-------
1
0.8312533191715348
time:1.0875484943389893


In [None]:
from sklearn.svm import SVC


# Sweep the kernel type and the regularisation strength C of a kernel SVM.
# NOTE(review): every candidate is scored on the test split, so the best
# accuracy found here is an optimistic estimate; a validation split or
# cross-validation would give an unbiased choice.
C_range = [0.001, 0.01, 0.1, 1, 10, 100]
kernel_range = ['rbf','poly','sigmoid','linear']
acc_max=0
kb = 'x'          # kernel of the best model so far ('x' = none found yet)
cb = 0            # C of the best model so far
total_time = 0    # fit + predict seconds of the best model (not a sum over runs)

for ker in kernel_range:
    for c in C_range:
        print(str(ker) + ' C = ' + str(c))
        svc = SVC(kernel=ker, C=c)
        # perf_counter is a monotonic high-resolution clock meant for
        # measuring intervals; time.time() can jump when the system clock
        # is adjusted.
        time_start = time.perf_counter()
        svc.fit(X_train_mat, Y_train)
        Y_pred = svc.predict(X_test_mat)
        time_end = time.perf_counter()
        acc = compute_accuracy(Y_pred, Y_test)

        if acc > acc_max:
            acc_max = acc
            cb = c
            kb = ker
            print('Better, acc: ' + str(acc_max))
            total_time = time_end - time_start

print('Best kernel SVM:' + str(kb) + ' C = ' + str(cb))
print('Time: ' + str(total_time))




rbf C = 0.001
Better, acc: 0.05297397769516728
rbf C = 0.01
rbf C = 0.1
Better, acc: 0.48632501327668615
rbf C = 1
Better, acc: 0.8173127987254382
rbf C = 10
Better, acc: 0.8211630377057887
rbf C = 100
poly C = 0.001
poly C = 0.01
poly C = 0.1
poly C = 1
poly C = 10
