In [1]:
import os
import pdb
import numpy as np
import argparse
from collections import Counter
from sklearn import svm
import matplotlib.pyplot as plt
import scipy.sparse as sp

# Data Loading and preprocessing

### Loading split information

In [2]:
split_ind = []
with open('../Datasets/SST1_dataset/datasetSplit.txt') as f:
    f.readline()
    for line in f:
        entry = line.split(',')
        split_ind.append(int(entry[1]))

print(len(split_ind))

# Merging validation set to training data
for i in range(len(split_ind)):
    if split_ind[i] == 3:
        split_ind[i] = 1
        
N_train = split_ind.count(1)
N_test = split_ind.count(2)
N_category = 5

11855


### Phrase -> Index

In [3]:
phr_to_ind = dict()

with open('../Datasets/SST1_dataset/dictionary.txt') as f:
    for line in f:
        entry = line.split('|')
        phr_to_ind[entry[0]] = int(entry[1])

keys = phr_to_ind.keys();

print(len(phr_to_ind), phr_to_ind['Good'])

239232 14058


### Loading sentences

In [4]:
# Without doing the below computation directly load the stored output
x_train_sent = []
x_test_sent = []
sentiment = []

counter = 0
with open('../Datasets/SST1_dataset/SentenceWithCorrection.txt') as f:
    for line in f:
        sent = line[:-1]
        if(split_ind[counter] == 1):
            x_train_sent.append(sent)
        else:
            x_test_sent.append(sent)
        
        sentiment.append(phr_to_ind[sent])
        counter += 1

print(len(x_train_sent), len(x_test_sent))

9645 2210


### Loading sentiment information 

In [5]:
ind_to_senti = dict()

with open('../Datasets/SST1_dataset/sentiment_labels.txt') as f:
    f.readline()
    for line in f:
        entry = line.split('|')
        ind_to_senti[int(entry[0])] = float(entry[1])

y_label = []

for ind in sentiment:
    val = ind_to_senti[ind]
    if val >= 0.0 and val <= 0.2:
        y_label.append(0);
    elif val > 0.2 and val <= 0.4:
        y_label.append(1)
    elif val > 0.4 and val <= 0.6:
        y_label.append(2)
    elif val > 0.6 and val <= 0.8:
        y_label.append(3)
    else:
        y_label.append(4)

y_train_org, y_test_org = [], []

for i in range(len(y_label)):
    label = y_label[i]
    if split_ind[i] == 1:
        y_train_org.append(label)
    else:
        y_test_org.append(label)
        
print(len(y_train_org), len(y_test_org))

9645 2210


# Training model

In [6]:
# Tokenize operation
def tokenize(sentence, grams):
    words = sentence.split()
    tokens = []
    for gram in grams:
        for i in range(len(words) - gram + 1):
            tokens += ["_*_".join(words[i:i+gram])]
    return tokens


def compute_ratio(poscounts, negcounts, alpha=1):
    pos_keys = list(poscounts.keys())
    neg_keys = list(negcounts.keys())
    
    alltokens = list(set( pos_keys + neg_keys))
    dic = dict((t, i) for i, t in enumerate(alltokens))
    d = len(dic)
    p, q = np.ones(d) * alpha , np.ones(d) * alpha
    for t in alltokens:
        p[dic[t]] += poscounts[t]
        q[dic[t]] += negcounts[t]
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p/q)
    return dic, r

### Creating train and test input data

In [7]:
# ngrams = [1,2,3]
# max_token_num = -1;

# for sent in x_train_sent:
#     tokens = list(set(tokenize(sent, ngrams)))
#     max_token_num = max(max_token_num, len(tokens))


# for sent in x_test_sent:
#     tokens = list(set(tokenize(sent, ngrams)))
#     max_token_num = max(max_token_num, len(tokens))

# X_train = np.zeros((len(x_train_sent), max_token_num), np.float64)
# X_test = np.zeros((len(x_test_sent), max_token_num), np.float64)

# print(X_train.shape, X_test.shape)

In [20]:
f11 = open('train_pos-sent.txt', 'w')
f12 = open('train_neg-sent.txt', 'w')

count = 0
for sent in x_train_sent:
    if y_train_org[count] == 0:
        f11.write(sent + '\n')
    else:
        f12.write(sent + '\n')
    count += 1


f21 = open('test_pos-sent.txt', 'w')
f22 = open('test_neg-sent.txt', 'w')

count = 0
for sent in x_test_sent:
    if y_test_org[count] == 0:
        f21.write(sent + '\n')
    else:
        f22.write(sent + '\n')
    count += 1


In [19]:
ngrams = [1]
Categories = [0,1,2,3,4]

svm_oneVsAll = dict()

N_TRAIN = len(y_train_org)
N_TEST = len(y_test_org)
N_CATEGORIES = len(Categories)

y_train = np.zeros( (N_TRAIN,), np.int16)
y_test  = np.zeros( (N_TEST,),  np.int16)

pred_oneVsAll = np.zeros((N_TEST, N_CATEGORIES), np.int16)

for category in Categories:
    
    for i in range(N_TRAIN):
        if y_train_org[i] == category:
            y_train[i] = 1
        else:
            y_train[i] = -1

        
    for i in range(N_TEST):
        y_test[i] = (1) if y_test_org[i] == category else (-1)
    
    pos_obsv_num = np.sum(y_train == 1)
    neg_obsv_num = np.sum(y_train == -1)
    
    print( 'Training data class distribution:', np.sum(y_train == -1), np.sum(y_train == 1))
    print( 'Test data class distribution:', np.sum(y_test == -1), np.sum(y_test == 1))
    
    ratio = pos_obsv_num / neg_obsv_num
    
    svm_oneVsAll[category] = svm.SVC()
    
    # Getting count of words belonging to positive and negative class
    poscounts = Counter()
    negcounts = Counter()

    counter = 0
    for sent in x_train_sent:
        if y_train[counter] == 1:
            poscounts.update(tokenize(sent, ngrams))
        else:
            negcounts.update(tokenize(sent, ngrams))
        counter += 1

    dic, r = compute_ratio(poscounts, negcounts)
    
    print('Dictionary Size:', len(dic))
    
    vocab_size = len(dic)
    
    x_train = sp.lil_matrix((N_TRAIN, vocab_size), dtype=np.float32)

    
    counter = 0
    for sent in x_train_sent:
        tokens = tokenize(sent, ngrams)
        indexes = []
        for t in tokens:
            if t in dic.keys():
#                 indexes += [dic[t]]
                x_train[counter,dic[t]] += r[dic[t]]
#         indexes = list(set(indexes))

#         for i in indexes:
#             x_train[counter,i] = r[i]
        
        counter += 1 
        
    # Arrange test data
    x_test = sp.lil_matrix((N_TEST, vocab_size), dtype=np.float32)
    
    counter = 0
    for sent in x_test_sent:
        tokens = tokenize(sent, ngrams)
        indexes = []
        for t in tokens:
            if t in dic.keys():
#                 indexes += [dic[t]]
                x_test[counter, dic[t]] += r[dic[t]]
#         indexes = list(set(indexes))

#         for i in indexes:
#             x_test[counter, i] = r[i]
            
        counter += 1
    

    svm_oneVsAll[category].fit(x_train, y_train)
    print('Trained SVM for cateogory: ', category)
    
    pred_train = svm_oneVsAll[category].predict(x_train)
    print( 'Train Accuracy', np.sum(pred_train == y_train)/ N_TRAIN)

    pred_test = svm_oneVsAll[category].predict(x_test)
    print( 'Test Accuracy', np.sum(pred_test == y_test)/ N_TEST)
    
    pred_oneVsAll[:, category] = pred_test
    break
    
    print('------------------------------------------------')

Training data class distribution: 8414 1231
Test data class distribution: 1931 279
Dictionary Size: 19463
Trained SVM for cateogory:  0
Train Accuracy 0.872369103162
Test Accuracy 0.873755656109


In [9]:
dic

{',_*_a_*_thrill': 0,
 'Auteuil_*_is_*_a': 2,
 'Twinkie_*_--': 4,
 'skill_*_of': 5,
 'stillborn_*_except': 8,
 'congratulate_*_himself_*_for': 10,
 'a_*_soufflé_*_gone': 11,
 'Things_*_I_*_Know': 13,
 'in_*_handy': 216729,
 'amateurish_*_,_*_quasi-improvised': 43369,
 'fictional_*_,_*_some': 14,
 'wanton_*_slipperiness_*_of': 237576,
 'sons_*_,_*_and': 15,
 'scenario': 173398,
 'bombards_*_the': 226429,
 'and_*_Cook': 16,
 'strategy': 17,
 'incognito_*_in_*_a': 18,
 'From_*_New': 6,
 'the_*_internal': 19,
 "(_*_Kline_*_'s": 148325,
 'vistas_*_are_*_sweeping': 20,
 'laughable_*_in_*_the': 23,
 'other_*_,_*_but': 22,
 'manifestation_*_of_*_institutionalized': 226684,
 'and_*_a_*_personal': 24,
 'stanzas_*_of': 236757,
 'Coke_*_.': 25,
 'Contradicts': 3,
 'romance_*_you': 26,
 'unemployment_*_,_*_Time': 28,
 'price_*_of_*_popularity': 29,
 'who_*_finds_*_his': 30,
 'best_*_gay_*_love': 237578,
 'has_*_been_*_able': 31,
 'of_*_grace': 32,
 'the_*_preachy_*_Circuit': 33,
 'audiences_*_will'

In [10]:
r

array([ 0.0935504 ,  1.47984476,  0.0935504 , ...,  0.0935504 ,
        0.0935504 ,  1.47984476])

In [13]:
a = svm_oneVsAll[0]
np.sum(a.dual_coef_ == -1)

1231

In [16]:
np.sum(x_test)

-13352.46

In [None]:
pred_maj = np.sum(pred_oneVsAll, axis=1)
print(pred_maj.shape)
for j in range(np.size(x_train,0)):
    count = 0
    for i in np.arange(np.size(x_train,1)):
        if x_train[0,i] != 0:
            count += 1
    print(count)

## Computing $w^{T}x +b$ 

In [None]:
category = 0

for i in range(len(y_train)):
    y_train[i] = (1) if y_train_org[i] == category else (-1)
    
supp_alpha = svm_oneVsAll[0].dual_coef_
supp_vecs = svm_oneVsAll[0].support_vectors_
supp_ind = svm_oneVsAll[0].support_
b = svm_oneVsAll[0].intercept_
# w = np.zeros( (1,np.size(supp_vecs,1)), np.float32)
w = sp.csr_matrix((1, np.size(supp_vecs,1)))

counter = 0
for i in supp_ind:
    w = w + (supp_alpha[0,counter]*y_train[i])*supp_vecs[counter,:]
    counter +=1

In [None]:
np.sum(pred_maj)

In [None]:

for j in range(np.size(x_train,0)):
    count = 0
    for i in np.arange(np.size(x_train,1)):
        if x_train[0,i] != 0:
            count += 1
    print(count)

In [None]:
supp_vecs[1,:].shape

In [None]:
supp_vecs.shape