In [None]:
import torch
import torch.nn as nn
#import torch.nn.functional as F
import torch.optim as optim
#import torchvision.models as models
#from torch.profiler import profile, record_function, ProfilerActivity
#import torch.autograd.profiler as profiler

import time
import math
import random

import numpy as np
from tqdm import tqdm # adding a process bar

# check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("GPU")
else:
    device = torch.device("cpu")
    print("CPU")

torch.manual_seed(1)

CPU


<torch._C.Generator at 0x7f4b72944730>

In [None]:
WINDOW_SIZE = 2

# subject to later update
EMBEDDING_DIM = 100 

# negative-to-positive ratio
k = 4 

# constant for subsampling
t = 1e-5

# generate sentence heads and endings according to window size
sent_head = [f'*d{i}*' for i in range(1, WINDOW_SIZE+1)]
sent_ending = [f'*f{i}'for i in range(1, WINDOW_SIZE+1)]

# initialize vocab set
vocab = set(sent_head[0:WINDOW_SIZE] + sent_ending[0:WINDOW_SIZE])

In [None]:
def create_examples(tok_sents, w2i, i2w, word2nbocc):

    pos_examples = []

    # neg_examples = []
    
    ls_neg_example = [] # list of array to be concatenated

    ls_all_tokens2i = [w2i[tok] for tok in vocab]
    set_ls_all_tokens2i = set(ls_all_tokens2i)

    # consumed time for each step 
    t_pos = 0
    t_neg = 0
    t_remove_tokens = 0
    t_unigram = 0
    t_neg_array = 0
    t_neg_append = 0

    # sum of all occurences
    Z = sum(word2nbocc.values())

    # probability of each word unigram: method 2
    p_dict = {key: (val/Z)**0.75 for key, val in word2nbocc.items()}

    processed_token = 0

    for sent in tok_sents:
        sent2i = [w2i[tok] for tok in sent]

        #print(len(sent2i))

        # create positive and negative examples around each word of a sentence
        for i in range(WINDOW_SIZE, len(sent2i)-WINDOW_SIZE):
            processed_token +=1

            for j in range(1, WINDOW_SIZE+1):
                t_create_start = time.time()

                target = sent2i[i]

                t_create_pos_start = time.time()

                # positive examples
                pos_examples.append((target, sent2i[i-j], 1))
                pos_examples.append((target, sent2i[i+j], 1))


                t_create_neg_start = time.time()
                t_pos += (t_create_neg_start - t_create_pos_start)

                # negative examples

                t_remove_token_start = time.time()

                ## remove tokens in current sentence
                ls_complementary_tokens = list(set_ls_all_tokens2i - set(sent2i))

                ## unigram distribution according to Word2Vec
                t_unigram_start = time.time()

                #method 1 (works alone without the above line that generates p_dict):
                #p = np.array([(word2nbocc[i2w[token]]/Z)**0.75 for token in ls_complementary_tokens])

                #method 2:
                p = np.array([p_dict[i2w[token]] for token in ls_complementary_tokens])
                
                # normalize probabilities to make sum = 1
                p = p / p.sum()

                t_neg_array_start = time.time()
                # create pool with 2k negative words
                neg_words = np.random.choice(ls_complementary_tokens, size=2*k, p=p)
                # neg_example = [(target, neg, 0) for neg in neg_words]
                # neg_examples += neg_example

                t_neg_append_start = time.time()
                ls_neg_example.append(
                    np.array([(target, neg, 0) for neg in neg_words])
                )

                t_create_neg_end = time.time()
                
                
                # calculation of consumed time for each step 
                t_neg += (t_create_neg_end - t_create_neg_start)
                t_remove_tokens += (t_unigram_start - t_remove_token_start)
                t_unigram += (t_neg_array_start - t_unigram_start)
                t_neg_array += (t_neg_append_start- t_neg_array_start)
                t_neg_append += (t_create_neg_end - t_neg_append_start)

    print(f"Pos ex creation finished in {t_pos} second(s)")
    print(f"Neg ex creation finished in {t_neg} second(s)")

    print()
    print(f"Remove tokens finished in {t_remove_tokens} second(s)")
    print(f"Unigram finished in {t_unigram} second(s)")
    print(f"Array finished in {t_neg_array} second(s)")
    print(f"Append finished in {t_neg_append} second(s)")

    print()
    print(f"Processed {processed_token} tokens.")


    pos_examples = np.array(pos_examples)
    #print(pos_examples)

    #print()
    #print(ls_neg_example)

    # concatenate neg examples so that it has the same shape as pos examples
    neg_examples = np.vstack(ls_neg_example)
    #print(neg_examples)

    return pos_examples, neg_examples

In [None]:
def subsampling(tok_sents, word2nbocc):

    subsampled_sents = []
    sum_occ = sum(word2nbocc.values())
    nb_subsamples = 0

    t_sub_start = time.time()

    for tok_sent in tok_sents:
        sample = []

        # ignore tokens that have a greater probability than randomly chosen one
        # probability is calculated based on the equation proposed by word2vec
        for i in range(WINDOW_SIZE, len(tok_sent)-WINDOW_SIZE):
            rel_freq = word2nbocc[tok_sent[i]]/sum_occ
            prob = 1 - math.sqrt(t / rel_freq)
            if (random.random() > prob): 
                sample.append(tok_sent[i])

        # add sentence head and ending markers
        nb_subsamples += len(sample)
        sample = sent_head + sample + sent_ending
        subsampled_sents.append(sample)

    t_sub_end = time.time()
    print(f"Subsampling finished in {t_sub_end - t_sub_start} second(s)")

    return subsampled_sents, nb_subsamples

## Lecture du corpus

In [None]:
i2w = []
w2i = {}
word2nbocc = {}
tok_sents = []
subsampled_sents = []
 
t0 = time.time()
data_file_path = "train_3000.txt"
#data_file_path = "EP.tcs.melt.utf8.split-aa"
    

nb_tokens = 0

t_data = 0
t_token = 0
t_vocab = 0
t_dict = 0


with open(data_file_path,'r',encoding='utf-8') as f:
    for line in f:

        t_token_start = time.time()

        tok_sent = []
        line = line.split()

        for elem in line:
            s = elem.split("/",1)[0]
            if len(s) > 0:  # avoid empty string 
                tok_sent.append(s)

        tok_sents.append(tok_sent)
        nb_tokens += len(tok_sent)

        t_vocab_start = time.time()
        t_token += t_vocab_start - t_token_start

        vocab = vocab.union(tok_sent)

        t_dict_start = time.time()
        t_vocab += t_dict_start - t_vocab_start

        # generate occurrence dictionary
        for tok in tok_sent:
            if tok not in word2nbocc.keys():
                word2nbocc[tok] = 1
            else:
                word2nbocc[tok] += 1

        #t_dict_end = time.time()
        #t_dict += t_dict_end - t_dict_start

#print(f"Tokenization finished in {t_token} second(s)")
#print(f"Vocab finished in {t_vocab} second(s)")
#print(f"Dictionary finished in {t_dict} second(s)")

vocab_size = len(vocab)
#EMBEDDING_DIM = len(vocab)   
i2w = list(vocab)
w2i = {w:i for i,w in enumerate(i2w)}

#subsampling
subsampled_sents, nb_subsamples = subsampling(tok_sents, word2nbocc)

#generate pos and neg examples
pos_examples, neg_examples = create_examples(subsampled_sents, w2i, i2w,  word2nbocc)

print()
print(f"{data_file_path} has {nb_tokens} tokens, {nb_subsamples} subsampled tokens.")
print(f"{data_file_path} has {len(pos_examples)} positive examples and {len(neg_examples)} negative examples.")
t1 = time.time()

print()
print(f"Preprocessing finished in {t1-t0} second(s)")

Subsampling finished in 0.026306629180908203 second(s)
Pos ex creation finished in 0.020664691925048828 second(s)
Neg ex creation finished in 73.98966479301453 second(s)

Remove tokens finished in 5.711493015289307 second(s)
Unigram finished in 51.483500719070435 second(s)
Array finished in 16.217626333236694 second(s)
Append finished in 0.5727570056915283 second(s)

Processed 8619 tokens.

train_3000.txt has 53862 tokens, 8619 subsampled tokens.
train_3000.txt has 34476 positive examples and 137904 negative examples.

Preprocessing finished in 74.91899228096008 second(s)


In [None]:
class SGNS(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SGNS, self).__init__()
        self.net = nn.Sequential(
            nn.Embedding(vocab_size, EMBEDDING_DIM),            
            nn.Embedding(vocab_size, EMBEDDING_DIM)            
        )        
    
    def forward(self, target_words, inputs_context):
        # mat_word = self.net[0](inputs_word)
        # mat_context = self.net[1](inputs_context)
        # pred = torch.bmm(mat_word, mat_context) 
        #score = F.sigmoid(pred, dim=1)
        # score = torch.sigmoid(pred) #

        target_embeddings = self.net[0](target_words)       # [batch_size, emb_size]
        context_embeddings = self.net[1](inputs_context) # [batch_size, emb_size]

        mul = target_embeddings * context_embeddings   # [batch_size, emb_size]
        scores = torch.sum(mul, dim = 1)   # batch_size

        return scores
        

losses = []
#loss_function = nn.NLLLoss() # calc and grab the loss value

loss_function = nn.BCEWithLogitsLoss(reduction = 'sum')

model = SGNS(len(vocab), EMBEDDING_DIM).to(device) # add the selected device to the model

"""step lr"""
#lrn_rate = 100 # start very big
#optimizer = optim.SGD(model.parameters(), lr=lrn_rate)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99) # gamma=0.99

"""ASGD lr"""
# optimizer = torch.optim.ASGD(model.parameters(), lr=lrn_rate, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0) # mplements Averaged Stochastic Gradient Descent.

"""lambda lr"""
# optimizer = torch.optim.SGD(model.parameters(), lr=100)
# lambda1 = lambda epoch: 0.65 ** epoch
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

"""Multiplicative LR"""
#optimizer = torch.optim.SGD(model.parameters(), lr=100)
#lmbda = lambda epoch: 0.65 ** epoch
#scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)

"""Cyclic LR"""
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.001 , max_lr=0.10)

## Training

In [None]:
from random import shuffle

#def train(pos_examples, neg_examples):

BATCH_SIZE =8
NB_EPOCHS=5 

print(f"Nb of examples = {len(pos_examples) + len(neg_examples)}")

# concatenate all examples
examples = np.vstack([pos_examples, neg_examples])

losses = []
loss = 0
# creates once at the beginning of training
# scaler = torch.cuda.amp.GradScaler()

# time consumed for each step 
t_score = 0
t_loss = 0
t_backward = 0
t_totalloss = 0

t2 = time.time()

# for epoch in range(NB_EPOCHS):
for i in tqdm(range(NB_EPOCHS)): # processing bar applied here
    
    shuffle(examples)
    total_loss = 0
    batch_head = 0
    
    # forward propagation with mini-batch 

    while batch_head < len(examples):
        t_batch_start = time.time()

        with profiler.record_function("load input batch"):
          batch = examples[batch_head:batch_head + BATCH_SIZE]        
          # batch_gpu = [torch.as_tensor(t, device=torch.device('cuda')) for t in batch]        
          (targets, contexts, labels) = zip(*batch)

          #torch.cuda.empty_cache()
          #torch.cuda.synchronize()
          
          targets = torch.tensor(targets)
          contexts = torch.tensor(contexts)
          labels = torch.tensor(labels) # TAnh ajoute
          #targets = torch.tensor(targets, device=torch.device('cuda'))
          #contexts = torch.tensor(contexts, device=torch.device('cuda'))
          #labels = torch.tensor(labels, device=torch.device('cuda'))

        with profiler.record_function("zero out"):
          for param in model.parameters(): # to zero out gradients
              param.grad = None
        
        t_score_start = time.time()
        with profiler.record_function("forward - score"):
          # Run the forward propagation and getting scores for each mini-batch 
          score = model(targets, contexts)
        
        t_score_end = time.time()
        #print(f"Score calculation finished in {t_score_end - t_score_start} second(s)")
            
        # Compute loss function
        t_loss_start = time.time()

        with profiler.record_function("forward - loss"):
          loss = loss_function(score, labels.float())
        # with torch.cuda.amp.autocast():
        #    loss = loss_function(score, batch_contexts)
        
        t_loss_end = time.time()
        #print(f"Loss calculation finished in {t_loss_end - t_loss_start} second(s)")

        # Do backward propagation and update gradients
        t_backward_start = time.time()
        with profiler.record_function("backward"):
          loss.backward() # apply this loss backwards through the network's parameters

        # Scales the loss, and calls backwards()
        # to create scaled gradients
        # scaler.scale(loss).backward()
              
        with profiler.record_function("backward - optimizer"):
          optimizer.step() # attempt to optimize weights to account for loss/gradients
        
        # Unscales gradients and calls
        # or skips optimizer().step()
        # scaler.step(optimizer)

        # Updates the scale for next iteration
        # scaler.update()

        with profiler.record_function("backward - scheduler"):
          scheduler.step() # adjust LR
        # current_lr = scheduler.get_last_lr()
        t_backward_end = time.time()

        print(f"Back propagation finished in {t_backward_end - t_backward_start} second(s)")

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        
        # Calculate total loss for each epoch
        t_totalloss_start = time.time()
        
        with profiler.record_function("total loss"):
          total_loss += loss.item()          

        t_totalloss_end = time.time()    

        print(f"Total loss calculation finished in {t_totalloss_end - t_totalloss_start} second(s)")

        batch_head += 1
        #print(f"batch no.{i} : loss = {loss.item()}")
        t_batch_end = time.time()
        #print(f"Batch {i} finished in {t_batch_end - t_batch_start} second(s)")

    # print('epoch : {} \tLoss : {}'.format(epoch, total_loss))
    losses.append(total_loss)     
    
    #print(f"Score calculation finished in {t_batch_end - t_batch_start} second(s)")

print()
print(losses)  # The loss decreased every iteration over the training data!

t3 = time.time()

print(f"Training of {data_file_path} finished in {t3-t2} second(s)")

Nb of examples = 172380


  0%|          | 0/5 [00:00<?, ?it/s]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Back propagation finished in 0.008534908294677734 second(s)
Total loss calculation finished in 5.245208740234375e-05 second(s)
Back propagation finished in 0.0035097599029541016 second(s)
Total loss calculation finished in 0.00026345252990722656 second(s)
Back propagation finished in 0.0035326480865478516 second(s)
Total loss calculation finished in 0.0007059574127197266 second(s)
Back propagation finished in 0.004691600799560547 second(s)
Total loss calculation finished in 5.555152893066406e-05 second(s)
Back propagation finished in 0.005053997039794922 second(s)
Total loss calculation finished in 5.245208740234375e-05 second(s)
Back propagation finished in 0.0037763118743896484 second(s)
Total loss calculation finished in 0.0003523826599121094 second(s)
Back propagation finished in 0.003740549087524414 second(s)
Total loss calculation finished in 0.0006878376007080078 second(s)
Back propagatio

 20%|██        | 1/5 [15:37<1:02:28, 937.00s/it]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Back propagation finished in 0.0035657882690429688 second(s)
Total loss calculation finished in 6.318092346191406e-05 second(s)
Back propagation finished in 0.003469228744506836 second(s)
Total loss calculation finished in 0.0006377696990966797 second(s)
Back propagation finished in 0.0034058094024658203 second(s)
Total loss calculation finished in 0.0005652904510498047 second(s)
Back propagation finished in 0.0035829544067382812 second(s)
Total loss calculation finished in 0.0008268356323242188 second(s)
Back propagation finished in 0.0034830570220947266 second(s)
Total loss calculation finished in 0.0006506443023681641 second(s)
Back propagation finished in 0.0034716129302978516 second(s)
Total loss calculation finished in 0.0006806850433349609 second(s)
Back propagation finished in 0.003448963165283203 second(s)
Total loss calculation finished in 0.0006117820739746094 second(s)
Back propagati

 40%|████      | 2/5 [32:31<48:01, 960.33s/it]  

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Back propagation finished in 0.003782033920288086 second(s)
Total loss calculation finished in 0.0007257461547851562 second(s)
Back propagation finished in 0.004225015640258789 second(s)
Total loss calculation finished in 0.0008375644683837891 second(s)
Back propagation finished in 0.0038907527923583984 second(s)
Total loss calculation finished in 6.4849853515625e-05 second(s)
Back propagation finished in 0.003802061080932617 second(s)
Total loss calculation finished in 0.0007297992706298828 second(s)
Back propagation finished in 0.0037603378295898438 second(s)
Total loss calculation finished in 0.0007328987121582031 second(s)
Back propagation finished in 0.0036773681640625 second(s)
Total loss calculation finished in 0.0006823539733886719 second(s)
Back propagation finished in 0.003799915313720703 second(s)
Total loss calculation finished in 0.0004570484161376953 second(s)
Back propagation fini

 60%|██████    | 3/5 [49:44<32:44, 982.12s/it]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Back propagation finished in 0.004935026168823242 second(s)
Total loss calculation finished in 4.4345855712890625e-05 second(s)
Back propagation finished in 0.00513458251953125 second(s)
Total loss calculation finished in 4.482269287109375e-05 second(s)
Back propagation finished in 0.004555225372314453 second(s)
Total loss calculation finished in 0.0006518363952636719 second(s)
Back propagation finished in 0.005011320114135742 second(s)
Total loss calculation finished in 5.650520324707031e-05 second(s)
Back propagation finished in 0.004253864288330078 second(s)
Total loss calculation finished in 3.5762786865234375e-05 second(s)
Back propagation finished in 0.004143476486206055 second(s)
Total loss calculation finished in 3.5762786865234375e-05 second(s)
Back propagation finished in 0.004039287567138672 second(s)
Total loss calculation finished in 4.0531158447265625e-05 second(s)
Back propagation

 80%|████████  | 4/5 [1:06:55<16:36, 996.84s/it]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Back propagation finished in 0.004009246826171875 second(s)
Total loss calculation finished in 0.0006039142608642578 second(s)
Back propagation finished in 0.0035619735717773438 second(s)
Total loss calculation finished in 0.0006663799285888672 second(s)
Back propagation finished in 0.0035400390625 second(s)
Total loss calculation finished in 0.0009443759918212891 second(s)
Back propagation finished in 0.0035331249237060547 second(s)
Total loss calculation finished in 0.0006668567657470703 second(s)
Back propagation finished in 0.0035486221313476562 second(s)
Total loss calculation finished in 0.0006268024444580078 second(s)
Back propagation finished in 0.003509998321533203 second(s)
Total loss calculation finished in 4.172325134277344e-05 second(s)
Back propagation finished in 0.0035550594329833984 second(s)
Total loss calculation finished in 3.9577484130859375e-05 second(s)
Back propagation fi

100%|██████████| 5/5 [1:24:10<00:00, 1010.02s/it]

Back propagation finished in 0.004791975021362305 second(s)
Total loss calculation finished in 5.340576171875e-05 second(s)
Back propagation finished in 0.004561901092529297 second(s)
Total loss calculation finished in 6.127357482910156e-05 second(s)
Back propagation finished in 0.004107236862182617 second(s)
Total loss calculation finished in 4.506111145019531e-05 second(s)
Back propagation finished in 0.004109621047973633 second(s)
Total loss calculation finished in 3.266334533691406e-05 second(s)
Back propagation finished in 0.00512385368347168 second(s)
Total loss calculation finished in 3.790855407714844e-05 second(s)
Back propagation finished in 0.004289150238037109 second(s)
Total loss calculation finished in 3.8623809814453125e-05 second(s)
Back propagation finished in 0.005165815353393555 second(s)
Total loss calculation finished in 3.719329833984375e-05 second(s)
Back propagation finished in 0.004687070846557617 second(s)
Total loss calculation finished in 0.00040006637573242




In [None]:
# learning curve (a posteriori)
import matplotlib.pyplot as plt

x = np.arange(0,NB_EPOCHS,1)

plt.xticks(np.arange(0,NB_EPOCHS+1,1)) # largir la distance de l'abscisse X
plt.plot(x, losses)

# see visdom module to vizualize learning curve during training

In [8]:
a = ['f', 'f', 'g']

neg_examples = []

for neg in a:
  neg_examples.append((neg, neg, 0)) 


In [9]:
neg_examples

[('f', 'f', 0), ('f', 'f', 0), ('g', 'g', 0)]