In [1]:
import numpy as np 

onehots = {}
onehots['cat'] = np.array([1, 0, 0, 0])
onehots['the'] = np.array([0, 1, 0, 0])
onehots['dog'] = np.array([0, 0, 1, 0])
onehots['sat'] = np.array([0, 0, 0, 1])

sentence = ['the', 'cat', 'sat']
x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]
print(f'Sent Encoding:{x}')

Sent Encoding:[1 1 0 1]


In [2]:
import sys
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

In [3]:
f = open('labels.txt')
raw_labels = f.readlines()
f.close()

In [4]:
tokens = list(map(lambda x: set(x.split(" ")), raw_reviews))

In [5]:
vocab = set()

for sent in tokens:
    for word in sent:
        if(len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

In [6]:
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

In [9]:
target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)
    

In [7]:
import numpy as np 
np.random.seed(1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

alpha = 0.01
iterations = 2
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct = 0
total = 0


In [10]:
for epoch in range(iterations):
    for i in range(len(input_dataset) - 1000):
        x, y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x]-= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('Iter:'+str(epoch)+' Progress:'+progress[2:4]+'.'+progress[4:6]+'% Training Accuracy:'+ str(correct/float(total)) + '%')
        print()

    correct,total = (0,0)
    for i in range(len(input_dataset)-1000,len(input_dataset)):
        x = input_dataset[i]
        y = target_dataset[i]
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2))

        if(np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1
    print("Test Accuracy:" + str(correct / float(total)))










Iter:0 Progress:00.03% Training Accuracy:0.0%









Iter:0 Progress:00.07% Training Accuracy:0.2%









Iter:0 Progress:00.11% Training Accuracy:0.23333333333333334%









Iter:0 Progress:00.15% Training Accuracy:0.225%









Iter:0 Progress:00.19% Training Accuracy:0.2%









Iter:0 Progress:00.23% Training Accuracy:0.21666666666666667%









Iter:0 Progress:00.27% Training Accuracy:0.2%









Iter:0 Progress:00.31% Training Accuracy:0.1875%









Iter:0 Progress:00.35% Training Accuracy:0.2%









Iter:0 Progress:00.39% Training Accuracy:0.21%









Iter:0 Progress:00.43% Training Accuracy:0.23636363636363636%









Iter:0 Progress:00.47% Training Accuracy:0.24166666666666667%









Iter:0 Progress:00.51% Training Accuracy:0.23076923076923078%









Iter:0 Progress:00.55% Training Accuracy:0.24285714285714285%









Iter:0 Progress:00.59% Training Accuracy:0.25333333333333335%









Iter:0 Progress:00.63% Training Accuracy:0.26875

In [11]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scorec = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

In [None]:
similar()