In [1]:
import os
import pdb
import numpy as np
import argparse
from collections import Counter
from sklearn import svm
import matplotlib.pyplot as plt

# Data Loading and preprocessing

### Loading split information

In [2]:
# Read the split flag (second comma-separated field) of every row after the header.
with open('../Datasets/SST1_dataset/datasetSplit.txt') as f:
    f.readline()  # skip the header row
    split_ind = [int(row.split(',')[1]) for row in f]

print(len(split_ind))

# Merging validation set to training data: flag 3 is rewritten to flag 1
split_ind = [1 if flag == 3 else flag for flag in split_ind]

N_train = split_ind.count(1)
N_test = split_ind.count(2)
N_category = 5

11855


### Phrase -> Index

In [3]:
# Map each phrase (text before the first '|') to its integer id (the field after it).
with open('../Datasets/SST1_dataset/dictionary.txt') as f:
    phr_to_ind = {
        fields[0]: int(fields[1])
        for fields in (row.split('|') for row in f)
    }

keys = phr_to_ind.keys()

print(len(phr_to_ind), phr_to_ind['Good'])

239232 14058


### Loading sentences

In [10]:
# Read every sentence, route it to the train or test list according to
# split_ind, and record the phrase id used later to look up its sentiment.
# NOTE(review): the original comment here said the stored output could be
# loaded instead of running this computation; no such loader is visible.
x_train_sent = []
x_test_sent = []
sentiment = []

with open('../Datasets/SST1_dataset/SentenceWithCorrection.txt') as f:
    for line_no, line in enumerate(f):
        # Strip only the line terminator. Slicing with [:-1] would remove a
        # real character from a final line that has no trailing newline and
        # make the phr_to_ind lookup below fail.
        sent = line.rstrip('\n')

        # Flag 1 is training; every other flag goes to the test list.
        bucket = x_train_sent if split_ind[line_no] == 1 else x_test_sent
        bucket.append(sent)

        sentiment.append(phr_to_ind[sent])

print(len(x_train_sent), len(x_test_sent))

9645 2210


### Loading sentiment information 

In [11]:
def _score_to_class(val):
    """Map a sentiment score to one of five class ids (0-4).

    Buckets are [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8]; any value
    outside those ranges falls through to class 4.
    """
    if 0.0 <= val <= 0.2:
        return 0
    if 0.2 < val <= 0.4:
        return 1
    if 0.4 < val <= 0.6:
        return 2
    if 0.6 < val <= 0.8:
        return 3
    return 4


# phrase id -> sentiment score, read from the '|'-separated label file
ind_to_senti = {}
with open('../Datasets/SST1_dataset/sentiment_labels.txt') as f:
    f.readline()  # skip the header row
    for row in f:
        fields = row.split('|')
        ind_to_senti[int(fields[0])] = float(fields[1])

# One class id per sentence, in the same order as `sentiment`
y_label = [_score_to_class(ind_to_senti[ind]) for ind in sentiment]

# Split the labels the same way the sentences were split: flag 1 -> train
y_train = [label for pos, label in enumerate(y_label) if split_ind[pos] == 1]
y_test = [label for pos, label in enumerate(y_label) if split_ind[pos] != 1]

print(len(y_train), len(y_test))

9645 2210


# Training model

In [12]:
# Tokenize operation
def tokenize(sentence, grams):
    """Return the word n-grams of `sentence` for every size in `grams`.

    The sentence is split on whitespace. Each n-gram is its words joined
    with "_*_". Tokens are ordered by n-gram size (in the order given by
    `grams`), then by position in the sentence. Sizes larger than the word
    count contribute nothing.
    """
    words = sentence.split()
    return [
        "_*_".join(words[start:start + size])
        for size in grams
        for start in range(len(words) - size + 1)
    ]


def compute_ratio(poscounts, negcounts, alpha=1):
    """Compute smoothed log-count ratios between two token-count mappings.

    Parameters
    ----------
    poscounts, negcounts : mapping of token -> count
        Token counts for the two classes. A `Counter` or a plain dict both
        work; a token missing from one mapping counts as 0 there.
    alpha : number, default 1
        Smoothing value added to every token's count in both classes.

    Returns
    -------
    dic : dict
        token -> column index. Tokens are indexed in sorted order, so the
        mapping is the same on every run.
    r : numpy.ndarray
        r[dic[t]] = log(p_t / q_t), where p and q are the smoothed counts
        of each class normalised to sum to 1.
    """
    # Sort the vocabulary: iterating a set of strings has an order that can
    # change between interpreter runs (hash randomization), which would make
    # the token -> index mapping, and anything ordered by it, irreproducible.
    alltokens = sorted(set(poscounts) | set(negcounts))
    dic = {t: i for i, t in enumerate(alltokens)}
    d = len(dic)

    p, q = np.ones(d) * alpha, np.ones(d) * alpha
    for t, i in dic.items():
        # .get keeps this working for plain dicts, not only Counter
        p[i] += poscounts.get(t, 0)
        q[i] += negcounts.get(t, 0)

    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r

### Creating train and test input data

In [15]:
ngrams = [1, 2, 3]

# Count n-gram tokens separately for the two groups used by compute_ratio.
# Here the "positive" group is the sentences labelled 0 and the "negative"
# group is every other label.
poscounts = Counter()
negcounts = Counter()

for pos, sent in enumerate(x_train_sent):
    target = poscounts if y_train[pos] == 0 else negcounts
    target.update(tokenize(sent, ngrams))

dic, r = compute_ratio(poscounts, negcounts)

In [17]:
# Arrange train and test data with one shared routine instead of two copies
def _sentence_features(sentences, grams, token_index, ratios):
    """Return, per sentence, the ratio values of its known distinct tokens.

    For each sentence the tokens are looked up in `token_index`; tokens not
    in the index are skipped. The distinct indices are sorted ascending and
    replaced by the corresponding entries of `ratios`.
    """
    rows = []
    for sentence in sentences:
        known = {token_index[t] for t in tokenize(sentence, grams) if t in token_index}
        rows.append([ratios[i] for i in sorted(known)])
    return rows


def _pad_rows(rows, width):
    """Pack variable-length rows into a (len(rows), width) float64 array, zero-padded on the right."""
    padded = np.zeros((len(rows), width), np.float64)
    for row_no, row in enumerate(rows):
        padded[row_no, :len(row)] = np.asarray(row, dtype=np.float64)
    return padded


x_train = _sentence_features(x_train_sent, ngrams, dic, r)
x_test = _sentence_features(x_test_sent, ngrams, dic, r)

# Get max sentence length (number of distinct known tokens) over both splits.
# default=0 keeps the width valid when there are no sentences at all; the
# previous -1 starting value would make np.zeros raise in that case.
max_sent_len = max(
    (len(row) for rows in (x_train, x_test) for row in rows),
    default=0,
)

print('Max sentence length', max_sent_len)

X_train = _pad_rows(x_train, max_sent_len)
X_test = _pad_rows(x_test, max_sent_len)

print(X_train.shape, X_test.shape)

Max sentence length 147
(9645, 147) (2210, 147)


In [18]:
# Collapse y_label in place to two values: 0 stays 0, everything else becomes 1.
# NOTE(review): y_train and y_test were built from y_label in an earlier cell
# and are separate lists, so this rewrite does not change the labels the SVM
# below is fit on. Confirm whether a binary fit was intended.
y_label[:] = [0 if label == 0 else 1 for label in y_label]

# Fit an SVM with default settings on the padded feature matrix.
svm_class = svm.SVC()
svm_class.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
def _accuracy(predicted, expected):
    """Return the fraction of entries in `predicted` equal to `expected`."""
    return np.sum(predicted == expected) / len(expected)


# Score the fitted classifier on the data it was trained on, then on the held-out split.
pred = svm_class.predict(X_train)
print('Train Accuracy', _accuracy(pred, y_train))

pred_test = svm_class.predict(X_test)
print('Test Accuracy', _accuracy(pred_test, y_test))