In [1]:
import torch.nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from collections import Counter
# import string library function 
import string 
from nltk.corpus import stopwords

In [2]:
# Storing the sets of punctuation in variable result 
result = string.punctuation

file1 = open('data/dialogues_text.txt', 'r')
lines = file1.readlines()
sentences = []
for line in lines:
    sentences.extend(line.split("__eou__")[:-1])


In [3]:
punc = string.punctuation + "’"
def clear_punctuation(str):    
    return ''.join([char if char not in punc else " " for char in str])

def clear_the(raw_words):
    return [word.lower() for word in raw_words if word.lower() not in nonsense_words and word!='']

nonsense_words = set(stopwords.words('english'))  

def generate_word_pairs(words, window):
    pairs = []
    for i in range(len(words)):
        for j in range(i-window,i+window+1):
            if j<0 or j>len(words)-1 or i==j:
                continue
            pairs.append((word_dict[words[i]],word_dict[words[j]]))
    return pairs


word_dict = {}
all_word_pairs = []
word_freq = {}
index = 0
for i in range(len(sentences)):
    punc_cleared_sentence = clear_punctuation(sentences[i])
    raw_words = punc_cleared_sentence.split(" ")
    clean_words = clear_the(raw_words)
    for word in clean_words:
        if word not in word_freq:
            word_freq[word] = 1
        else:
            word_freq[word] +=1
        
        if word not in word_dict:
            word_dict[word] = index
            index +=1
    pairs = generate_word_pairs(clean_words,5)
    all_word_pairs.extend(pairs)
index_dict = {}
for word in word_dict:
    index_dict[word_dict[word]] = word 


In [4]:
c=Counter(word_freq)
word_freq = c.most_common()

In [5]:
vocabulary_size = len(word_dict)
default  = torch.zeros(vocabulary_size).float()
def get_input_layer(word_index):
    x = default.clone()
    x[word_index] = 1.0
    return x

input_array = []
output_array = []
for pair in all_word_pairs:
    input_array.append(get_input_layer(pair[0]))
    output_array.append(pair[1])


KeyboardInterrupt: 

In [6]:
from tqdm import tqdm
data_length = len(input_array)

dimension = 10
W1 = Variable(torch.randn(dimension, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, dimension).float(), requires_grad=True)
learning_rate = 0.02
    
one_chunk = data_length//100

for epoch in range(13):
    loss_sum = 0
    for i in tqdm(range(data_length), desc = 'Processing epoch: '+str(epoch)):
        x = Variable(input_array[i]).float()
        y = Variable(torch.from_numpy(np.array([output_array[i]])).long())


        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1,-1), y)
    #     print(loss.data.item())
        loss_sum += loss.data.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    print(loss_sum/data_length)
print(W1)


Processing epoch: 0: 100%|█████████| 2861124/2861124 [1:25:15<00:00, 559.35it/s]


8.222001760141342


Processing epoch: 1: 100%|█████████| 2861124/2861124 [1:12:00<00:00, 662.29it/s]


7.5165397971763515


Processing epoch: 2: 100%|█████████| 2861124/2861124 [1:30:55<00:00, 524.49it/s]


7.388479605108581


Processing epoch: 3: 100%|█████████| 2861124/2861124 [1:51:30<00:00, 427.67it/s]


7.312445376945671


Processing epoch: 4: 100%|█████████| 2861124/2861124 [1:30:35<00:00, 526.37it/s]


7.260239420898745


Processing epoch: 5: 100%|█████████| 2861124/2861124 [1:07:50<00:00, 702.93it/s]


7.22188111991355


Processing epoch: 6: 100%|█████████| 2861124/2861124 [1:12:04<00:00, 661.67it/s]


7.192294333783426


Processing epoch: 7: 100%|█████████| 2861124/2861124 [1:37:21<00:00, 489.76it/s]


7.1686314587735245


Processing epoch: 8: 100%|█████████| 2861124/2861124 [1:22:21<00:00, 579.01it/s]


7.14920322088424


Processing epoch: 9: 100%|█████████| 2861124/2861124 [1:46:08<00:00, 449.26it/s]


7.132946383747314


Processing epoch: 10: 100%|████████| 2861124/2861124 [1:10:23<00:00, 677.39it/s]


7.119139361590082


Processing epoch: 11: 100%|████████| 2861124/2861124 [1:35:57<00:00, 496.95it/s]


7.107270161331997


Processing epoch: 12: 100%|████████| 2861124/2861124 [1:20:52<00:00, 589.67it/s]

7.0969633216696995
tensor([[ 0.2975, -0.2267,  0.4397,  ...,  0.4035, -0.7002, -0.0247],
        [-0.3683, -0.3885,  0.1371,  ..., -0.3022, -1.6193,  0.9082],
        [ 0.5209, -0.0021, -0.4142,  ..., -0.5322, -0.5676, -0.9239],
        ...,
        [-0.6067, -0.4563, -1.0164,  ..., -0.1613, -0.4850, -0.5942],
        [ 1.0261,  0.2941,  0.5465,  ..., -1.2781,  2.0119,  0.4807],
        [ 0.3779,  0.1140, -0.5060,  ...,  1.9119,  0.7990,  0.5038]],
       requires_grad=True)





In [7]:
torch.save(W1, 'w1.pt')
torch.save(W2, 'w2.pt')

In [6]:
W1 = torch.load('w1.pt')
W2 = torch.load('w2.pt')

tensor([1., 0., 0.,  ..., 0., 0., 0.])

In [8]:


def get_similarity_score(word1, word2):
    i1 = word_dict[word1]
    i2 = word_dict[word2]
    
    vec1 = word_vector_list[i1]
    vec2 = word_vector_list[i2]
    
    sum = 0
    for i in range(5):
        sum += (vec1[i] - vec2[i]) ** 2
        
    return sum
        
    
    
def torch_to_list(W):
    return_list = list()
    dim_list = W.tolist()
    
    for j in range(len(word_dict)):
        word_list = list()
        for i in range(5):
            word_list.append(dim_list[i][j])
            
        return_list.append(word_list)
        
    return return_list


def calculate_pairs_sim(word_dict):
    pair_sim_list = list()
    for word1 in word_dict:
        for word2 in word_dict:
            if (word1 == word2):
                continue
            temp_tuple = (word1[0] + " & " + word2[0], get_similarity_score(word1[0], word2[0]))
            pair_sim_list.append(temp_tuple)
    
    return sorted(pair_sim_list, key=lambda x: x[1], reverse=False)
     
    
    
word_vector_list = (torch_to_list(W1))

            
most_similar_word_pair = calculate_pairs_sim(word_freq[:1000])
print(most_similar_word_pair[:100])



            


[('goes & difference', 0.004742417475597505), ('difference & goes', 0.004742417475597505), ('start & customer', 0.009076565474090359), ('customer & start', 0.009076565474090359), ('date & l', 0.010038878181638344), ('l & date', 0.010038878181638344), ('days & within', 0.0109128325244673), ('within & days', 0.0109128325244673), ('ah & gave', 0.011112362147402553), ('gave & ah', 0.011112362147402553), ('within & receive', 0.011531409256095762), ('receive & within', 0.011531409256095762), ('bad & decided', 0.01299104835599274), ('decided & bad', 0.01299104835599274), ('since & worked', 0.013050215747005406), ('worked & since', 0.013050215747005406), ('found & key', 0.013273393608428023), ('key & found', 0.013273393608428023), ('cup & warm', 0.013475382132097424), ('warm & cup', 0.013475382132097424), ('everyone & fact', 0.014992380073078253), ('fact & everyone', 0.014992380073078253), ('nothing & bit', 0.016134372239288656), ('bit & nothing', 0.016134372239288656), ('stay & suggest', 0.01

In [38]:
[char if char not in punc else "v" for char in "abc"]

['a', 'b', 'c']

In [40]:
import nltk

In [42]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/yiwenluo/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [44]:
nltk.wordpunct_tokenize("So Dick , how about getting some coffee for tonight ? ")

['So',
 'Dick',
 ',',
 'how',
 'about',
 'getting',
 'some',
 'coffee',
 'for',
 'tonight',
 '?']

In [45]:
set(nltk.corpus.words.words())

{'commensality',
 'Chaenomeles',
 'hunchback',
 'osteogenous',
 'successorship',
 'tricipital',
 'metahydroxide',
 'fervid',
 'triumphator',
 'Tartarized',
 'hugely',
 'lacker',
 'paunchful',
 'Unionidae',
 'reforestization',
 'sulforicinoleate',
 'practicalism',
 'prase',
 'Covarecas',
 'stertorous',
 'baryton',
 'Thesmophoria',
 'scribbleable',
 'headmistressship',
 'esplees',
 'definition',
 'emergent',
 'inflexibleness',
 'overentreat',
 'retract',
 'unlocking',
 'underside',
 'chemitypy',
 'topographically',
 'untailorlike',
 'circumagitate',
 'ureal',
 'masterdom',
 'dragsman',
 'subsemitone',
 'demisecond',
 'Lophopoda',
 'neurilema',
 'merogenic',
 'nondeliberate',
 'monoscope',
 'repreparation',
 'unshackled',
 'uroschesis',
 'nontransgression',
 'unprincipledly',
 'lactarene',
 'unmorality',
 'bobjerom',
 'cosmetical',
 'ignipuncture',
 'personalistic',
 'nondisposal',
 'preinform',
 'Dicksonia',
 'autohypnotization',
 'displayable',
 'transferrer',
 'chepster',
 'insertive',

In [46]:
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yiwenluo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def predict_context_word(word):
    input_index = word_dict[word]
    
    ## prediction
    predicted_input = get_input_layer(input_index)

    ## Using W1 * W2
    hidden_output = torch.matmul(W1, predicted_input)
    predicted_output = torch.matmul(W2, hidden_output)
    softmax_func = torch.nn.Softmax(dim=0)
    softmax_output = softmax_func(predicted_output)

    ## Transfer to real word
    output_list = softmax_output.tolist()
    for i in range(len(output_list)):
        output_list[i] = (index_dict[i], output_list[i])

    sorted_output = sorted(output_list, key=lambda x: x[1], reverse=True)
    
    return sorted_output[:5]


print(predict_context_word('say'))
