# PREPROCESSING

In [None]:
# Processes the text files according to the skip-gram algorithm.
# Produces a list in the following format: [[7,4],[42,44], ...]
from preprocessing import Preprocess
from utils import createWordPairs
import pickle

# Variables:
#     threshold: how many emojis count as a sequence
#     window_size: how far to the left and right of the center word the
#                  skip-gram algorithm looks when forming word pairs
threshold = 2
window_size = 8

indexes = Preprocess('./data/initializationSet.txt', threshold)
trainingCorpus = Preprocess('./data/trainingSet.txt', threshold)
validationCorpus = Preprocess('./data/validationSet.txt', threshold)
trainingPairs = createWordPairs(indexes, trainingCorpus, window_size)
validationPairs = createWordPairs(indexes, validationCorpus, window_size)

# Persist every artifact so the training cell can reload it from disk.
# Each file is opened in its own `with` block: the handle is closed even if
# pickling raises, and no file is truncated before its data is about to be
# written.
preprocess_artifacts = (
    ("indexes", indexes),
    ("trainingCorpus", trainingCorpus),
    ("validationCorpus", validationCorpus),
    ("trainingPairs", trainingPairs),
    ("validationPairs", validationPairs),
)
for artifact_name, artifact in preprocess_artifacts:
    with open(f"./Preprocess_Files/{artifact_name}.pickle", "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

# TRAINING

In [2]:
# Training works best after raising the notebook's iopub data rate limit.
# On a Mac, start Jupyter from the terminal with: "jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000"

import pickle
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

from dataset import LoadedDataSet
from model import Word2Vec
from utils import EarlyStopping

# importing preprocessing_files
def _load_pickle(path):
    """Return the object unpickled from the file at `path`.

    The file is opened in a `with` block so the handle is closed even when
    unpickling raises.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # point this at files written by the preprocessing cell of this notebook.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


indexes = _load_pickle("./Preprocess_Files/indexes.pickle")
trainingCorpus = _load_pickle("./Preprocess_Files/trainingCorpus.pickle")
validationCorpus = _load_pickle("./Preprocess_Files/validationCorpus.pickle")
trainingPairs = _load_pickle("./Preprocess_Files/trainingPairs.pickle")
validationPairs = _load_pickle("./Preprocess_Files/validationPairs.pickle")

# Hyperparameters
dimensionSize = 300        # second constructor argument of Word2Vec
num_epochs = 600           # upper bound on the number of training epochs
lr = 0.01                  # learning rate handed to Adam
batchSize = 150            # batch size of both DataLoaders
patience = 5               # forwarded to EarlyStopping
save_name = "noNumpy.w2v"  # forwarded to EarlyStopping
verbose = True             # forwarded to EarlyStopping

# Datasets first, then the loaders that batch and shuffle them
trainingDataset = LoadedDataSet(trainingPairs)
validationDataset = LoadedDataSet(validationPairs)
trainingLoader = DataLoader(trainingDataset, batch_size=batchSize, shuffle=True)
validationLoader = DataLoader(validationDataset, batch_size=batchSize, shuffle=True)

model = Word2Vec(indexes.vocabulary_size, dimensionSize)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Optional learning-rate scheduler (currently disabled):
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(trainingLoader), eta_min=0, last_epoch=-1)

# Early-stopping helper plus the per-epoch loss history that the plotting cell reads
early_stopping = EarlyStopping(patience, verbose, save_name)
avg_train_losses, avg_valid_losses = [], []

# Train for at most `num_epochs` epochs. Each epoch runs one pass over the
# training loader, one pass over the validation loader, records the mean of
# the per-batch losses, prints a progress line and lets `early_stopping`
# decide whether to stop.

# Width used to right-align the epoch counter; it does not change between epochs.
epoch_len = len(str(num_epochs))

for e in range(1, num_epochs + 1):
    train_losses = []
    valid_losses = []

    ###################
    # train the model #
    ###################
    model.train()  # switch mode once per epoch instead of once per batch
    for data, target in trainingLoader:
        optimizer.zero_grad()
        # Call the module itself rather than `.forward` so that any registered
        # hooks run (assumes Word2Vec is a torch.nn.Module).
        loss = model(data, target)
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

    ######################
    # validate the model #
    ######################
    model.eval()
    with torch.no_grad():  # no gradients are needed while validating
        for data, target in validationLoader:
            loss = model(data, target)
            valid_losses.append(loss.item())

    # Compute each epoch average once and reuse it below.
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)

    # scheduler.step()

    #################################################
    #   visual update regarding the current epoch   #
    #################################################
    print_msg = (f'[{e:>{epoch_len}}/{num_epochs:>{epoch_len}}] ' +
                 f'train_loss: {train_loss:.5f} ' +
                 f'valid_loss: {valid_loss:.5f}')
    print(print_msg)

    ##################
    # Early Stopping #
    ##################
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("EARLY STOPPING!")
        break

[  1/600] train_loss: 6.50044 valid_loss: 6.03725
Validation loss decreased (inf --> 6.037249).  Saving model ...


KeyboardInterrupt: 

# Visualizing the Loss and the Early Stopping Checkpoint

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation loss curves
fig, ax = plt.subplots(figsize=(10, 8))
train_epochs = range(1, len(avg_train_losses) + 1)
valid_epochs = range(1, len(avg_valid_losses) + 1)
ax.plot(train_epochs, avg_train_losses, label='Training Loss')
ax.plot(valid_epochs, avg_valid_losses, label='Validation Loss')

# Mark the (1-based) epoch with the lowest validation loss
minposs = avg_valid_losses.index(min(avg_valid_losses)) + 1
ax.axvline(minposs, linestyle='--', color='r', label='Early Stopping Checkpoint')

ax.set_xlabel('epochs')
ax.set_ylabel('loss')
ax.set_ylim(3.5, 7)  # consistent scale
ax.set_xlim(0, len(avg_train_losses) + 1)  # consistent scale
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()
fig.savefig('./SOFTMAX_trained/plots/loss_plot.png', bbox_inches='tight')

# LOADING & EVALUATING TRAINED MODEL

In [None]:
# Loads the weights of a saved model, then calculates and prints the following metrics: Spearman rank correlation, MSE, MAE.
# Look at the data with TensorBoardX: "tensorboard --logdir runs"

import torch
import os
from evaluation import Metrics

# Load the saved "attempt#4" model and hand it to Metrics together with `indexes`.
attempt4_path = os.path.join("SOFTMAX_trained", "attempt#4.w2v")
loadedModel = torch.load(attempt4_path)
Metrics(loadedModel, indexes)

In [None]:
import torch
import os
# Load the object saved under the name "noNumpy.w2v"
noNumpy_path = os.path.join("SOFTMAX_trained", "noNumpy.w2v")
loadedModel = torch.load(noNumpy_path)


In [None]:
# Show what kind of object `parameters()` hands back
parameter_source = loadedModel.parameters()
print(type(parameter_source))

In [None]:
# Wrap the loaded weights in a Parameter and install them on `embedding`.
# NOTE(review): `embedding` is not defined in any visible cell, so this raises
# NameError on a fresh kernel — confirm where it is supposed to come from.
embedding.weight = torch.nn.Parameter(loadedModel.weight)


In [None]:
# Print every parameter tensor of the loaded model, then how many there were.
x = 0  # stays 0 when the model exposes no parameters
for x, param in enumerate(loadedModel.parameters(), start=1):
    print(param.data)
print(x)

In [None]:

# Print element [1][0] of the loaded weights after converting them to NumPy
weights_array = loadedModel.weight.data.cpu().numpy()
print(weights_array[1][0])

In [None]:
# Export the loaded weights as a NumPy array.
# NOTE(review): this rebinds `model`, the name the training cell uses for the Word2Vec network.
cpu_weights = loadedModel.weight.data.cpu()
model = cpu_weights.numpy()

In [None]:
# Peek at element [1][0] of the exported weight array
model[1, 0]

In [None]:
# Display the dimensions of the exported weight array
model.shape

In [None]:
import matplotlib.pyplot as plt
# Render the contents of `model` as an image; the returned handle is kept in `imgplot`
imgplot = plt.imshow(model)

In [None]:
import matplotlib.pyplot as pp
# NOTE(review): `ar` is not used anywhere in the visible cells — confirm before removing
ar = [1, 2, 3, 8, 4, 5]
# Plot the contents of `model` (pyplot is imported as `pp` in this cell)
pp.plot(model)
pp.show()