In [1]:
# One-hot encoding

In [2]:
import numpy as np
np.random.seed(1)

In [3]:
onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

In [4]:
sentence = ['the', 'cat', 'sat']

In [5]:
encoding = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]

In [6]:
print("Sent encoding: %s" % str(encoding))

Sent encoding: [1 1 0 1]


In [7]:
# Preprocessing

In [8]:
f = open("data/imdb/reviews.txt")
raw_reviews = f.readlines()
f.close()

In [9]:
f = open("data/imdb/labels.txt")
raw_labels = f.readlines()
f.close()

In [10]:
token_sets = [set(filter(lambda token: len(token) > 0, review.split(" "))) for review in raw_reviews]

In [11]:
vocab_set = set()
for token_set in token_sets:
    for word in token_set:
        vocab_set.add(word)
vocab = list(vocab_set)

In [12]:
word2index = {word: i for i, word in enumerate(vocab)}

In [13]:
input_dataset = [list(set([word2index[token] for token in token_set])) for token_set in token_sets]

In [14]:
target_dataset = [int(label == "positive\n") for label in raw_labels]

In [15]:
# Embedding layer

In [16]:
# from random import randint
# num_reviews = 6000
# vocab = range(666)
# input_dataset = [[randint(0, 1) for _ in range(len(vocab))] for _ in range(num_reviews)]
# input_dataset[0:2]
# target_dataset = [randint(0, 1) for _ in range(num_reviews)]

In [17]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [18]:
alpha = 0.01
iterations = 2
hidden_size = 100
weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

In [19]:
correct = 0
total = 0
for iteration in range(iterations):
    for i in range(len(input_dataset) - 1000):
        x = input_dataset[i]
        y = target_dataset[i]

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(layer_1.dot(weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if np.abs(layer_2_delta) < 0.5:
            correct += 1
        total += 1

    if iteration % 1 == 0:
        accuracy = correct/total
        print("I:%d Training accuracy:%f" % (i, accuracy))

I:23999 Training accuracy:0.832000
I:23999 Training accuracy:0.866354


In [20]:
correct = 0
total = 0

for i in range(len(input_dataset) - 1000, len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
    
    if np.abs(layer_2 - y) < 0.5:
        correct += 1
    total += 1

accuracy = correct/total
print("Test accuracy: %f" % accuracy)

Test accuracy: 0.852000


In [21]:
# Visualising weight similarity

In [22]:
from collections import Counter
import math

In [23]:
def similar(target):
    target_index = word2index[target]
    scores = Counter()
    for word, i in word2index.items():
        raw_difference = weights_0_1[i] - weights_0_1[target_index]
        squared_difference = raw_difference ** 2
        scores[word] = -math.sqrt(sum(squared_difference))
    
    return scores.most_common(10)

In [24]:
print(similar('beautiful'))

[('beautiful', -0.0), ('delightful', -0.7133502715095305), ('appreciated', -0.7310171593681725), ('fun', -0.7369190245267592), ('beautifully', -0.7399257218009643), ('sent', -0.7422146366246041), ('highly', -0.7452677172326826), ('available', -0.7466535150483408), ('cry', -0.7511882418835696), ('hilarious', -0.7574218781546053)]


In [25]:
print(similar('terrible'))

[('terrible', -0.0), ('fails', -0.7513469884730295), ('dull', -0.7712501481783576), ('avoid', -0.7733593396719871), ('annoying', -0.7985918300886892), ('lacks', -0.8075376224077705), ('mess', -0.8075708896763902), ('redeeming', -0.8236089249033156), ('poor', -0.8240048422924088), ('disappointing', -0.8259852293379486)]


In [26]:
# Filling in the blank

In [27]:
import random
random.seed(1)

In [28]:
tokens = list(map(lambda x:x.split(" "), raw_reviews))
wordcount = Counter()

for sent in tokens:
    for word in sent:
        wordcount[word] -= 1

vocab = list(set(map(lambda x:x[0], wordcount.most_common())))
word2index = {word: i for i, word in enumerate(vocab)}

In [29]:
concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        sent_indices.append(word2index[word])
        concatenated.append(word2index[word])
    input_dataset.append(sent_indices)
    
concatenated = np.array(concatenated)

In [30]:
random.shuffle(input_dataset)

In [31]:
alpha = 0.05
iterations = 2
hidden_size = 50
window = 2
negative = 5

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size) * 0

In [32]:
layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

In [33]:
for review_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]] + list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        left_context = review[max(0, target_i-window):target_i]
        right_context = review[target_i+1:min(len(review), target_i+window)]
    
        layer_1 = np.mean(weights_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha
        
    if review_i % 250 == 0:
        print("Progress:%f %s\n" % (review_i / len(input_dataset * iterations), str(similar('terrible'))))

Progress:0.000000 [('terrible', -0.0), ('dadaist', -0.3560406276674067), ('counterbalancing', -0.3611067459837245), ('headley', -0.3679147406942775), ('ennobling', -0.36878247314345514), ('nicholette', -0.3703847083028422), ('wendel', -0.3726105382130416), ('duryea', -0.37400762631273726), ('squabble', -0.38176151418056453), ('rwtd', -0.38336877562774613)]

Progress:0.005000 [('terrible', -0.0), ('vicenzo', -0.49165400965934686), ('prince', -0.4927664327351607), ('seven', -0.5106935792474491), ('duryea', -0.5138560214349656), ('individual', -0.5194230738053703), ('pulled', -0.5252411518413801), ('wars', -0.5291864961731348), ('baby', -0.531591489776913), ('veteran', -0.5337380163272933)]

Progress:0.010000 [('terrible', -0.0), ('believable', -0.6333339508611329), ('batman', -0.6404136335462441), ('exciting', -0.6533683766809056), ('hilarious', -0.6651275291768081), ('c', -0.6678931578238068), ('visually', -0.6715307394413343), ('mattei', -0.6865790310020363), ('martin', -0.692221011353

Progress:0.120000 [('terrible', -0.0), ('brilliant', -2.2021540788774594), ('horrible', -2.5257699482885037), ('remarkable', -2.5277558184234974), ('dreadful', -2.528285746006027), ('weak', -2.531349589283113), ('thin', -2.5528756414292935), ('lame', -2.574693807591052), ('magnificent', -2.6844705454800524), ('fantastic', -2.7218752124814487)]

Progress:0.125000 [('terrible', -0.0), ('brilliant', -2.2585154168396455), ('weak', -2.6617593556890187), ('dreadful', -2.6755377476955693), ('remarkable', -2.699447941925009), ('horrible', -2.704491938735244), ('lame', -2.7239266357986893), ('terrific', -2.807788714540714), ('fantastic', -2.8622967622774804), ('magnificent', -2.8724651705247797)]

Progress:0.130000 [('terrible', -0.0), ('brilliant', -2.2569867699078534), ('weak', -2.5220177383295246), ('lame', -2.5401986247474735), ('dreadful', -2.542781614438868), ('remarkable', -2.6049532373074387), ('magnificent', -2.7191035728989643), ('fantastic', -2.7312793337945145), ('thin', -2.73653660

Progress:0.240000 [('terrible', -0.0), ('brilliant', -2.861632525766515), ('horrible', -3.1148963035302413), ('remarkable', -3.1897623798132875), ('superb', -3.3139769658872598), ('fantastic', -3.3558033806009155), ('dreadful', -3.3638740961812394), ('stupid', -3.40018759044737), ('wonderful', -3.4331927708636014), ('predictable', -3.5155978495260753)]

Progress:0.245000 [('terrible', -0.0), ('horrible', -2.8651397078641696), ('brilliant', -3.1427442243402255), ('stupid', -3.210268938324467), ('remarkable', -3.292648570972057), ('fantastic', -3.3125482926161993), ('dreadful', -3.5183327703924565), ('superb', -3.544780503204791), ('lame', -3.5526172299267063), ('stunning', -3.6116139451248626)]

Progress:0.250000 [('terrible', -0.0), ('horrible', -3.0472017365365898), ('brilliant', -3.2481353475241748), ('stupid', -3.3363302846247525), ('remarkable', -3.4976510379432684), ('fantastic', -3.530975540418879), ('lame', -3.5824029456370265), ('dreadful', -3.6065057061012045), ('stunning', -3

Progress:0.360000 [('terrible', -0.0), ('horrible', -2.9359366348042513), ('brilliant', -3.4652629903891774), ('dreadful', -3.632766241392901), ('fabulous', -3.9068607583646493), ('stunning', -3.9239192784697154), ('superb', -3.933881312967876), ('horrendous', -4.07275751927277), ('lame', -4.100466544524692), ('breathtaking', -4.116290140540972)]

Progress:0.365000 [('terrible', -0.0), ('horrible', -2.9337008759990213), ('brilliant', -3.4236990868553714), ('dreadful', -3.608118018940969), ('stunning', -3.8571570713496026), ('fabulous', -3.880587033418197), ('superb', -3.9662764261067487), ('bad', -4.002211070546931), ('breathtaking', -4.0783909926422925), ('horrendous', -4.111257083027296)]

Progress:0.370000 [('terrible', -0.0), ('horrible', -2.698261508186241), ('brilliant', -3.4359370439927877), ('bad', -3.6224051902153995), ('dreadful', -3.635475137972518), ('fabulous', -3.85830344068321), ('stunning', -3.9402125434669593), ('ridiculous', -3.9775882645258527), ('breathtaking', -4.0

Progress:0.480000 [('terrible', -0.0), ('horrible', -3.259992366169905), ('brilliant', -3.8934822334465196), ('superb', -3.9312050066971422), ('wonderful', -4.077101670490208), ('terrific', -4.130954158177945), ('dreadful', -4.164684284834005), ('fantastic', -4.170503756061025), ('fabulous', -4.232033532864423), ('tremendous', -4.250464203701715)]

Progress:0.485000 [('terrible', -0.0), ('horrible', -3.4169914146844937), ('brilliant', -3.8083804113748614), ('superb', -3.837398752891373), ('wonderful', -4.066016091566484), ('terrific', -4.101548955950487), ('tremendous', -4.139998998255404), ('fine', -4.173197769237539), ('fantastic', -4.175590032844427), ('fabulous', -4.238798894830774)]

Progress:0.490000 [('terrible', -0.0), ('horrible', -3.3168217239574873), ('superb', -3.7228259528393184), ('brilliant', -3.7548098758102197), ('terrific', -3.9989316457762247), ('fine', -4.043352530307692), ('fantastic', -4.0777276982446296), ('tremendous', -4.086040975119331), ('wonderful', -4.10620

Progress:0.600000 [('terrible', -0.0), ('horrible', -3.3607822144964214), ('fantastic', -3.673288901049267), ('brilliant', -3.783926995286968), ('horrendous', -3.8817491538439164), ('dreadful', -3.9619811526306563), ('superb', -4.064688135077285), ('haunting', -4.083449991706081), ('ridiculous', -4.144323762141075), ('spectacular', -4.14996186955877)]

Progress:0.605000 [('terrible', -0.0), ('horrible', -3.2710706653039785), ('fantastic', -3.5488322244715858), ('brilliant', -3.7103788920274954), ('horrendous', -3.761083637646685), ('dreadful', -3.820967923181581), ('haunting', -3.955443867099345), ('superb', -4.034141330036329), ('ridiculous', -4.090567390072655), ('spectacular', -4.119313990974403)]

Progress:0.610000 [('terrible', -0.0), ('horrible', -3.242777090029577), ('fantastic', -3.634720637348726), ('brilliant', -3.8008379923578945), ('dreadful', -3.942622265066894), ('horrendous', -3.943749681199538), ('superb', -4.0117322229272885), ('haunting', -4.125957711400188), ('specta

Progress:0.720000 [('terrible', -0.0), ('horrible', -2.8855944962404183), ('spectacular', -3.8834590129967683), ('horrendous', -3.92247035528734), ('dreadful', -3.9771446166144337), ('superb', -3.9907195128815953), ('fantastic', -4.000139507802868), ('brilliant', -4.0008219822751165), ('hilarious', -4.063114885980841), ('stupid', -4.1037695528944464)]

Progress:0.725000 [('terrible', -0.0), ('horrible', -2.8731559135017397), ('brilliant', -3.8409612829144915), ('dreadful', -3.8984702946269834), ('superb', -4.040989923358224), ('horrendous', -4.055932957143123), ('spectacular', -4.062242934353335), ('bad', -4.069361993298025), ('fantastic', -4.177156548412016), ('stupid', -4.211837836938103)]

Progress:0.730000 [('terrible', -0.0), ('horrible', -2.8691299440281606), ('dreadful', -3.805130176456905), ('brilliant', -3.8700994233927046), ('horrendous', -4.02533959280766), ('spectacular', -4.062133222350337), ('bad', -4.13208339294299), ('fantastic', -4.145027522331349), ('great', -4.215235

Progress:0.840000 [('terrible', -0.0), ('horrible', -3.0256887045622687), ('brilliant', -3.3456563663543983), ('horrendous', -3.553864814269606), ('dreadful', -3.666016632316138), ('marvelous', -3.857608660969116), ('horrid', -3.9090385961159075), ('fabulous', -3.9410196930501664), ('laughable', -3.9901844465787084), ('superb', -4.0316132515586105)]

Progress:0.845000 [('terrible', -0.0), ('brilliant', -3.1522139740424113), ('horrible', -3.166809384937746), ('horrendous', -3.474359718917541), ('dreadful', -3.549923275148204), ('horrid', -3.9030750330545145), ('marvelous', -3.9150371417026526), ('lame', -3.9650394233015684), ('fabulous', -3.9866896503054763), ('laughable', -3.995331673221079)]

Progress:0.850000 [('terrible', -0.0), ('horrible', -3.0010292027461793), ('brilliant', -3.1301214193666613), ('horrendous', -3.4976995845415453), ('dreadful', -3.5512304350013317), ('marvelous', -3.7439129708659333), ('bad', -3.7520143564519595), ('fabulous', -3.8985393596748206), ('lame', -3.96

Progress:0.960000 [('terrible', -0.0), ('horrible', -2.9068219176652836), ('superb', -3.4556773995446264), ('brilliant', -3.5671589172756866), ('dreadful', -3.7654143651920746), ('horrendous', -3.818250090766798), ('spectacular', -3.9882322804288135), ('great', -4.048646132672695), ('fantastic', -4.07495775970486), ('fabulous', -4.117765675338482)]

Progress:0.965000 [('terrible', -0.0), ('horrible', -3.0190358590503528), ('superb', -3.4705936333839507), ('brilliant', -3.580379147984176), ('dreadful', -3.783985717103459), ('horrendous', -3.94469007435039), ('spectacular', -4.019232416175871), ('fantastic', -4.078416517019524), ('fabulous', -4.089831402580275), ('great', -4.127467113755128)]

Progress:0.970000 [('terrible', -0.0), ('horrible', -2.9373519924313265), ('superb', -3.526493440448034), ('brilliant', -3.644421086101592), ('dreadful', -3.718101164448732), ('horrendous', -3.9814539459602294), ('great', -3.9965062179118513), ('spectacular', -4.065799726088611), ('fabulous', -4.11

In [34]:
# Word analogies

In [35]:
def analogy(positive, negative):
    norms = np.sum(weights_0_1 ** 2, axis=1)
    norms.resize(norms.shape[0], 1)
    normed_weights = weights_0_1 * norms
    
    query_vector = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vector += normed_weights[word2index[word]]
    for word in negative:
        query_vector -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word, i in word2index.items():
        raw_difference = weights_0_1[i] - query_vector
        squared_difference = raw_difference ** 2
        scores[word] = -math.sqrt(sum(squared_difference))
    
    return scores.most_common(10)

In [36]:
analogy(['terrible', 'good'], ['bad'])

[('decent', -189.24497218630245),
 ('good', -189.40364931412063),
 ('worth', -189.477782315034),
 ('fine', -189.67257301754123),
 ('worthy', -189.7716826248772),
 ('pleasant', -189.85499939007755),
 ('superb', -189.95415512757486),
 ('perfect', -189.98373768423164),
 ('great', -189.99122355254102),
 ('nice', -190.08211662179244)]

In [37]:
analogy(['elizabeth', 'he'], ['she'])

[('he', -206.749151932014),
 ('she', -209.28216527020243),
 ('it', -209.4725124569001),
 ('role', -209.6277016546557),
 ('ms', -209.65087721617894),
 ('been', -209.73320552006405),
 ('william', -209.77147411228708),
 ('someone', -209.89069281525587),
 ('lee', -209.89793083449314),
 ('heston', -209.91020568842728)]