Environnement
=============

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import os
print(os.listdir("../input"))

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field,ReversibleField,TabularDataset,Iterator,BucketIterator



['glove-global-vectors-for-word-representation', 'sentence-relatedness']


Lecture des données
===================

In [2]:
def tokenize(text):
    """Split a raw sentence into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

def proc_float(value):
    """Convert a raw field value to a Python float (used as a Field preprocessing hook)."""
    as_float = float(value)
    return as_float

def proc_int(value):
    """Convert a raw field value to a Python int (used as a Field preprocessing hook)."""
    as_int = int(value)
    return as_int

# Field definitions.
# NOTE: lower=False means the corpus keeps its original casing (no lowercasing is applied).
# NOTE(review): the GloVe 6B vectors loaded later are presumably keyed by lowercase tokens;
# consider lower=True so that sentence-initial words such as "A"/"The" can be matched -- confirm.
# One might alternatively specify cuda data types to get the dataset to live permanently on the GPU.
TEXT = Field(sequential=True, lower=False, tokenize=tokenize)
FLOAT = Field(sequential=False, use_vocab=False, dtype=torch.float, preprocessing=proc_float)
INTEGER = Field(sequential=False, use_vocab=False, preprocessing=proc_int)

# Labelled data: pair id, the two sentences and the relatedness target.
df = TabularDataset(
    "../input/sentence-relatedness/SICK_train_logistic.txt",
    "tsv",
    skip_header=True,
    fields=[('idx', INTEGER), ('sentA', TEXT), ('sentB', TEXT), ('Relatedness', FLOAT)],
)

# 80/20 train/dev split; the vocabulary is built from the training part only.
df_train, df_dev = df.split(split_ratio=0.8)
TEXT.build_vocab(df_train)

# Print the first few examples of the train set.
for example in df_train[:5]:
    print(example.idx, ' '.join(example.sentA), '||', ' '.join(example.sentB), example.Relatedness)
print()

# Load the test set (same columns, without the relatedness target).
df_test = TabularDataset(
    "../input/sentence-relatedness/SICK_test.txt",
    "tsv",
    skip_header=True,
    fields=[('idx', INTEGER), ('sentA', TEXT), ('sentB', TEXT)],
)

# Print the first few examples of the test set.
for example in df_test[:5]:
    print(example.idx, ' '.join(example.sentA), '||', ' '.join(example.sentB))
print()
print(df_test[0].idx)

1112 A man is opening a package that contains headphones || There is no man singing and playing the guitar 0.24
8601 Little boys are playing in a water fountain in front of lots of people || Children are playing in a fountain that is spraying water from the ground 0.74
588 The children are playing in front of a large door || A group of boys are quiet in front of a large door made of wood 0.6799999999999999
5012 A man is cutting an onion || An onion is being sliced by a woman 0.76
9051 A pair of kids are sticking out blue and green colored tongues || Two kids are sticking out blue and green colored tongues 0.96

6 There is no boy playing outdoors and there is no man smiling || A group of kids is playing in a yard and an old man is standing in the background
7 A group of boys in a yard is playing and a man is standing in the background || The young boys are playing outdoors and the man is smiling nearby
8 A group of children is playing in the house and there is no man standing in the bac

Classification
==============

In [3]:
class ParaphraseClassifier(nn.Module):
    """Siamese LSTM model scoring the relatedness of two sentences.

    Both sentences are embedded and encoded by the same LSTM. The two final
    hidden states are merged through their element-wise product and absolute
    difference, and a sigmoid output unit yields a relatedness value in (0, 1).
    """

    # Default location of the pretrained vectors read by use_glove_embeddings.
    GLOVE_PATH = "../input/glove-global-vectors-for-word-representation/glove.6B.50d.txt"

    def __init__(self,hidden_dim,embedding_dim):
        """
        Args:
            hidden_dim: size of the LSTM hidden state and of the merge layers.
            embedding_dim: size of the word embeddings (also the LSTM input size).
        """
        super(ParaphraseClassifier, self).__init__()

        self.hidden_dim    = hidden_dim
        self.embedding_dim = embedding_dim
        # The vocabulary size comes from the module-level TEXT field.
        self.embedding     = nn.Embedding(len(TEXT.vocab), embedding_dim)
        self.lstm          = nn.LSTM(embedding_dim, hidden_dim, num_layers=1,bidirectional=False)
        self.Wadd          = nn.Linear(hidden_dim,hidden_dim)
        self.Wtimes        = nn.Linear(hidden_dim,hidden_dim)
        self.Wout          = nn.Linear(hidden_dim,1)

    def use_glove_embeddings(self, glove_path=None, freeze=True):
        """Replace the embedding layer with one initialised from GloVe vectors.

        Each word of TEXT.vocab is looked up as-is and, failing that, in
        lowercase (the corpus may keep its casing while the vector file may be
        lowercase-keyed). Words still missing get a random normal vector.

        Args:
            glove_path: path to a GloVe text file; defaults to GLOVE_PATH.
            freeze: forwarded to nn.Embedding.from_pretrained. The default
                (True) keeps the vectors fixed during training, which is what
                from_pretrained does when called without this argument.

        Returns:
            The number of vocabulary words for which a pretrained vector was found.

        Raises:
            ValueError: if the file holds no vectors, or if their dimension
                differs from the embedding_dim the LSTM was built for.
        """
        if glove_path is None:
            glove_path = self.GLOVE_PATH

        # Load GloVe: one "<word> <v1> ... <vd>" entry per line.
        embeddings_dict = {}
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if len(values) < 2:
                    continue  # skip blank / malformed lines
                embeddings_dict[values[0]] = np.asarray(values[1:], "float32")

        if not embeddings_dict:
            raise ValueError("no embedding vectors found in %r" % (glove_path,))

        glove_dim = len(next(iter(embeddings_dict.values())))
        if glove_dim != self.embedding_dim:
            # The LSTM input size was fixed to embedding_dim in __init__.
            raise ValueError("pretrained vectors have dimension %d but the model was built "
                             "with embedding_dim=%d" % (glove_dim, self.embedding_dim))

        # Adapt to the corpus vocabulary (row i holds the vector of word index i).
        vocab = list(TEXT.vocab.itos)
        weights_matrix = np.empty((len(vocab), glove_dim), dtype=np.float32)
        words_found = 0

        for i, word in enumerate(vocab):
            vector = embeddings_dict.get(word)
            if vector is None:
                vector = embeddings_dict.get(word.lower())
            if vector is None:
                # For the rare words that are not in GloVe, keep a random vector.
                vector = np.random.normal(scale=0.6, size=(glove_dim, ))
            else:
                words_found += 1
            weights_matrix[i] = vector

        # Create the layer.
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(weights_matrix), freeze=freeze)
        return words_found

    def _encode(self, xinput):
        """Embed a batch of word indexes and return the LSTM final hidden state."""
        xembedded = self.embedding(xinput)  # looks up the embedding vectors
        # -1 is a wildcard (here we let pytorch guess the batch size)
        _, (hidden, _) = self.lstm(xembedded.view(len(xinput), -1, self.embedding_dim), None)
        return hidden

    def forward(self,xinputA,xinputB):
        """Score the relatedness of two (batches of) sentences.

        Args:
            xinputA: tensor of word indexes for the first sentence(s).
            xinputB: tensor of word indexes for the second sentence(s).

        The LSTM uses its default sequence-first layout and dimension 0 of each
        input is kept as the sequence axis, so inputs are expected to be shaped
        (seq_length, batch_size) -- presumably what the torchtext iterators
        yield for these fields; TODO confirm. The method also works for batched
        input.

        Returns:
            Sigmoid outputs in (0, 1). With (seq_length, batch_size) inputs the
            shape is (1, batch_size, 1), the leading axis being the LSTM layer axis.
        """
        hiddenA = self._encode(xinputA)
        hiddenB = self._encode(xinputB)

        # Merge the two sentence representations.
        hiddenT = hiddenA * hiddenB
        hiddenD = torch.abs(hiddenA - hiddenB)
        hidden  = torch.tanh(self.Wtimes(hiddenT) + self.Wadd(hiddenD))
        return torch.sigmoid(self.Wout(hidden))

    def train(self,train_set=True,dev_set=None,epochs=None,learning_rate=0.001):
        """Train the model, or toggle training mode like nn.Module.train.

        This method shadows nn.Module.train. So that the standard module API
        keeps working (nn.Module.eval() calls self.train(False)), a boolean
        first argument is forwarded to nn.Module.train and the module is
        returned. Any other first argument is treated as a training set and
        delegated to fit().

        Args:
            train_set: training dataset, or a bool selecting the module mode.
            dev_set: development dataset (currently unused).
            epochs: number of passes over the training set.
            learning_rate: Adam learning rate.
        """
        if isinstance(train_set, bool):
            return super(ParaphraseClassifier, self).train(train_set)
        if epochs is None:
            raise TypeError("train() requires `epochs` when given a training set")
        return self.fit(train_set, dev_set, epochs, learning_rate=learning_rate)

    def fit(self,train_set,dev_set,epochs,learning_rate=0.001):
        """Optimise binary cross entropy on train_set with Adam.

        Prints the mean batch loss after every epoch and the total wall-clock
        time at the end. dev_set is accepted for interface compatibility but
        is not used.
        """
        super(ParaphraseClassifier, self).train(True)

        loss_func  = nn.BCELoss()
        optimizer  = optim.Adam(self.parameters(), lr=learning_rate)

        # Training is about ten times faster with batches of 64 than with batches of 1.
        train_iterator   = BucketIterator(train_set, batch_size=64, device=-1, sort_key=lambda x: len(x.sentA), sort=False, sort_within_batch=False, repeat=False)

        t=time.time()
        for e in range(epochs):
            global_logloss = 0
            nb_batch = 0
            for batch in train_iterator:
                nb_batch += 1
                optimizer.zero_grad()
                # view(-1) rather than squeeze(): keeps a 1-d tensor even when the
                # last batch holds a single example, so it matches the target shape.
                prob = self(batch.sentA,batch.sentB).view(-1)
                loss = loss_func(prob,batch.Relatedness)
                loss.backward()
                optimizer.step()
                global_logloss += loss.item()

            # Guard against an iterator that yields no batch at all.
            average_loss = global_logloss/nb_batch if nb_batch else float('nan')
            print("Epoch %d, mean cross entropy = %f"%(e+1,average_loss))

        print(epochs,"epochs,",time.time()-t,'s')

    def run_test(self,test_set,output_path='results.csv'):
        """Predict a score for every pair of test_set and write them to a CSV file.

        The model output in (0, 1) is rescaled to a score in (1, 5). The
        resulting table (columns pairID, Relatedness) is printed and saved
        to output_path without the index.
        """
        # Make predictions one example at a time, in dataset order.
        test_iterator   = Iterator(test_set, batch_size=1, device=-1, sort=False, sort_within_batch=False, repeat=False, shuffle=False)

        predictions = []
        pair_ids = []
        with torch.no_grad():  # inference only: no autograd graph needed
            for elt in test_iterator:
                relness = self(elt.sentA,elt.sentB).squeeze()
                predictions.append(relness.item() *4 +1)
                pair_ids.append(elt.idx.item())

        df = pd.DataFrame({'pairID':pair_ids,'Relatedness': predictions})
        print(df)
        df.to_csv(output_path,index=False)

In [4]:
# Build the model (LSTM hidden size 150, 50-dimensional word embeddings)
# and swap in the pretrained GloVe vectors.
pc = ParaphraseClassifier(hidden_dim=150, embedding_dim=50)
pc.use_glove_embeddings()

# Train for 50 epochs, then write the test predictions.
pc.train(df_train, df_dev, epochs=50)

pc.run_test(df_test)

# best score so far:
# emb_sz=30, epoch=150, lr=0.001
# opt=Adam

# V8
# - batch size in run_test set back to 1
# - Relatedness in [0,1] converted to a score in [1,5]

Epoch 1, mean cross entropy = 0.616256
Epoch 2, mean cross entropy = 0.607720
Epoch 3, mean cross entropy = 0.607006
Epoch 4, mean cross entropy = 0.605743
Epoch 5, mean cross entropy = 0.605694
Epoch 6, mean cross entropy = 0.601661
Epoch 7, mean cross entropy = 0.583989
Epoch 8, mean cross entropy = 0.576904
Epoch 9, mean cross entropy = 0.572215
Epoch 10, mean cross entropy = 0.568487
Epoch 11, mean cross entropy = 0.564155
Epoch 12, mean cross entropy = 0.560519
Epoch 13, mean cross entropy = 0.557589
Epoch 14, mean cross entropy = 0.553696
Epoch 15, mean cross entropy = 0.551427
Epoch 16, mean cross entropy = 0.549519
Epoch 17, mean cross entropy = 0.546295
Epoch 18, mean cross entropy = 0.543770
Epoch 19, mean cross entropy = 0.541558
Epoch 20, mean cross entropy = 0.540912
Epoch 21, mean cross entropy = 0.538156
Epoch 22, mean cross entropy = 0.536383
Epoch 23, mean cross entropy = 0.534205
Epoch 24, mean cross entropy = 0.531019
Epoch 25, mean cross entropy = 0.530796
Epoch 26,