In [5]:
import os
import pdb
import numpy as np
import argparse
from collections import Counter
from sklearn import svm
import matplotlib.pyplot as plt
import scipy.sparse as sp

# Data Loading and preprocessing

### Loading split information

In [6]:
# Collect the split label (second comma-separated field) of every row,
# after consuming the first line of the file.
with open('../Datasets/SST1_dataset/datasetSplit.txt') as split_file:
    split_file.readline()
    split_ind = [int(row.split(',')[1]) for row in split_file]

print(len(split_ind))

# Merge the validation set (label 3) into the training data (label 1).
split_ind = [1 if label == 3 else label for label in split_ind]

N_category = 5
N_train = split_ind.count(1)
N_test = split_ind.count(2)

11855


### Phrase -> Index

In [7]:
# Map each phrase (text before the first '|') to its integer id (the field after it).
with open('../Datasets/SST1_dataset/dictionary.txt') as dictionary_file:
    phr_to_ind = {}
    for record in dictionary_file:
        fields = record.split('|')
        phr_to_ind[fields[0]] = int(fields[1])

keys = phr_to_ind.keys()

print(len(phr_to_ind), phr_to_ind['Good'])

239232 14058


### Loading sentences

In [8]:
# Split the sentences into train/test lists according to `split_ind` and record
# the phrase id of every sentence (in file order) in `sentiment`.
# NOTE(review): the original note here suggested that a stored copy of this
# output can be loaded instead of recomputing it; no such loader is visible
# in this notebook - confirm.
x_train_sent = []
x_test_sent = []
sentiment = []

with open('../Datasets/SST1_dataset/SentenceWithCorrection.txt') as f:
    for idx, line in enumerate(f):
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would truncate the final sentence when the file does
        # not end with a newline, and the phrase lookup below would then fail.
        sent = line.rstrip('\n')
        if split_ind[idx] == 1:
            x_train_sent.append(sent)
        else:
            x_test_sent.append(sent)

        # Raises KeyError if the sentence is not a key of `phr_to_ind`.
        sentiment.append(phr_to_ind[sent])

print(len(x_train_sent), len(x_test_sent))

9645 2210


### Loading sentiment information 

In [9]:
# Phrase id -> sentiment score, parsed from the '|'-separated rows that follow
# the first line of the file.
ind_to_senti = {}
with open('../Datasets/SST1_dataset/sentiment_labels.txt') as labels_file:
    labels_file.readline()
    for row in labels_file:
        fields = row.split('|')
        ind_to_senti[int(fields[0])] = float(fields[1])


def _score_to_class(score):
    """Map a sentiment score to one of five classes.

    Scores in [0, 0.2] give 0, (0.2, 0.4] give 1, (0.4, 0.6] give 2 and
    (0.6, 0.8] give 3; every other value gives 4.
    """
    if score >= 0.0:
        for cls, upper in enumerate((0.2, 0.4, 0.6, 0.8)):
            if score <= upper:
                return cls
    return 4


# One class label per sentence, in the same order as `sentiment`.
y_label = [_score_to_class(ind_to_senti[phrase_id]) for phrase_id in sentiment]

# Route each label to the train or test list using the sentence's split id.
y_train_org = [lab for k, lab in enumerate(y_label) if split_ind[k] == 1]
y_test_org = [lab for k, lab in enumerate(y_label) if split_ind[k] != 1]

print(len(y_train_org), len(y_test_org))

9645 2210


# Training model

In [10]:
# Tokenize operation
def tokenize(sentence, grams):
    """Return the n-grams of `sentence` as "_*_"-joined strings.

    The sentence is split on whitespace. For each size in `grams`, in the
    given order, every run of that many consecutive words is emitted from
    left to right. Sizes larger than the number of words contribute nothing.
    """
    words = sentence.split()
    return [
        "_*_".join(words[start:start + size])
        for size in grams
        for start in range(len(words) - size + 1)
    ]


def compute_ratio(poscounts, negcounts, alpha=1):
    """Compute the smoothed log-count ratio of every token in either class.

    Parameters
    ----------
    poscounts, negcounts : mapping of token -> count
        Token counts for the two classes. Any mapping with ``.get`` works
        (``collections.Counter`` or a plain ``dict``); a token missing from
        one mapping is treated as having count 0 there.
    alpha : number, default 1
        Additive smoothing applied to every token in both classes. With
        ``alpha=0`` a token absent from one class yields an infinite ratio.

    Returns
    -------
    dic : dict
        token -> column index. Indices follow first appearance (all tokens of
        ``poscounts`` in their iteration order, then the new tokens of
        ``negcounts``), so the mapping is reproducible from run to run.
    r : numpy.ndarray of shape (len(dic),)
        ``log(p / q)``, where ``p`` and ``q`` are the smoothed count vectors
        of the two classes, each normalised to sum to 1.
    """
    # dict.fromkeys de-duplicates while keeping insertion order; iterating a
    # set of strings instead would change the column order between runs.
    alltokens = list(dict.fromkeys(list(poscounts) + list(negcounts)))
    dic = {t: i for i, t in enumerate(alltokens)}
    d = len(dic)
    p = np.full(d, alpha, dtype=np.float64)
    q = np.full(d, alpha, dtype=np.float64)
    for t, i in dic.items():
        p[i] += poscounts.get(t, 0)
        q[i] += negcounts.get(t, 0)
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r

### Creating train and test input data

In [11]:
# n-gram sizes used for tokenization: 1, 2 and 3.
ngrams = list(range(1, 4))
# Stays at -1 because the scan that would update it is commented out below.
max_token_num = -1

# Disabled experiment: find the largest number of distinct tokens in any
# sentence and allocate dense (n_sentences, max_token_num) arrays.
# for sent in x_train_sent:
#     tokens = list(set(tokenize(sent, ngrams)))
#     max_token_num = max(max_token_num, len(tokens))


# for sent in x_test_sent:
#     tokens = list(set(tokenize(sent, ngrams)))
#     max_token_num = max(max_token_num, len(tokens))

# X_train = np.zeros((len(x_train_sent), max_token_num), np.float64)
# X_test = np.zeros((len(x_test_sent), max_token_num), np.float64)

# print(X_train.shape, X_test.shape)

NameError: name 'X_train' is not defined

In [19]:
ngrams = [1,2,3]
Categories = [0,1,2,3,4]


def _ratio_features(sentences, dic, r, grams):
    """Return a CSR matrix with one row per sentence and one column per token of `dic`.

    Entry (k, dic[t]) is r[dic[t]] when token t occurs in sentences[k] and 0
    otherwise. A token repeated within a sentence is counted once, and tokens
    that are not keys of `dic` are ignored.
    """
    rows, cols = [], []
    for row, sent in enumerate(sentences):
        present = sorted({dic[t] for t in tokenize(sent, grams) if t in dic})
        rows.extend([row] * len(present))
        cols.extend(present)
    rows = np.asarray(rows, dtype=np.intp)
    cols = np.asarray(cols, dtype=np.intp)
    # Build the matrix once from (value, (row, col)) triplets; writing single
    # elements into an existing CSR matrix restructures it on every assignment.
    return sp.csr_matrix((r[cols], (rows, cols)),
                         shape=(len(sentences), len(dic)), dtype=np.float32)


# One binary classifier per sentiment category.
svm_oneVsAll = {category: svm.SVC() for category in Categories}

y_train_org_arr = np.asarray(y_train_org)
y_test_org_arr = np.asarray(y_test_org)

# Column c receives the binary test predictions of the classifier for category c.
pred_oneVsAll = np.zeros((len(x_test_sent), len(Categories)), np.int8)

for category in Categories:

    # Binary targets: 0 marks sentences of the current category, 1 all others.
    y_train = np.where(y_train_org_arr == category, 0, 1).astype(np.int8)
    y_test = np.where(y_test_org_arr == category, 0, 1).astype(np.int8)

    print( 'Training data class distribution:', np.sum(y_train == 0), np.sum(y_train == 1))
    print( 'Test data class distribution:', np.sum(y_test == 0), np.sum(y_test == 1))

    # Token counts of the current category (poscounts) and of the rest (negcounts).
    poscounts = Counter()
    negcounts = Counter()
    for sent, target in zip(x_train_sent, y_train):
        if target == 0:
            poscounts.update(tokenize(sent, ngrams))
        else:
            negcounts.update(tokenize(sent, ngrams))

    dic, r = compute_ratio(poscounts, negcounts)

    # Every sentence gets its own row (the row index advances per sentence),
    # for the training set as well as for the test set.
    x_train = _ratio_features(x_train_sent, dic, r, ngrams)
    x_test = _ratio_features(x_test_sent, dic, r, ngrams)

    # The sparse matrices are passed to the classifier directly; no dense copy is made.
    svm_oneVsAll[category].fit(x_train, y_train)
    print('Trained SVM for cateogory: ', category)

    pred_train = svm_oneVsAll[category].predict(x_train)
    print( 'Train Accuracy', np.sum(pred_train == y_train)/ len(y_train))

    pred_test = svm_oneVsAll[category].predict(x_test)
    print( 'Test Accuracy', np.sum(pred_test == y_test)/ len(y_test))

    pred_oneVsAll[:, category] = pred_test

    print('------------------------------------------------')


Training data class distribution: 1231 8414
Test data class distribution: 279 1931




KeyboardInterrupt: 

In [13]:
# Row-wise total of the one-vs-all prediction columns: one value per test sentence.
pred_maj = pred_oneVsAll.sum(axis=1)
print(pred_maj.shape)

(2210,)


In [14]:
# Display the row sums computed above; NumPy elides the middle of long arrays.
pred_maj

array([0, 0, 0, ..., 0, 0, 0])