In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import numpy as np
import gensim.downloader
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss

In [2]:
device = torch.device('cuda')

In [3]:
wv = gensim.downloader.load('word2vec-google-news-300')

# Load data

In [4]:
data = pd.read_csv("Doyle_Christie_dataset/train.csv")
data = data.drop(columns=['Unnamed: 0',])
data['labels'] = data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)

In [5]:
test_data = pd.read_csv("Doyle_Christie_dataset/test.csv")
test_data = test_data.drop(columns=['Unnamed: 0',])
test_data['labels'] = test_data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)

# Functions

In [6]:
def make_pairs(data, labels, column='author'):
    doyle = data[data[column] == labels[0]]
    christie = data[data[column] == labels[1]]
    
    doyle_true = doyle.sample(frac = 0.5)
    doyle_false = doyle.drop(doyle_true.index)
    
    christie_true = christie.sample(frac = 0.5)
    christie_false = christie.drop(christie_true.index)
    
    doyle_left = doyle_true.sample(frac=0.5)
    doyle_right = doyle_true.drop(doyle_left.index)
    
    christie_left = christie_true.sample(frac=0.5)
    christie_right = christie_true.drop(christie_left.index)
    
    false_pair_data = pd.DataFrame({'sentence1':list(doyle_false['sentence']), 
                                'sentence2':list(christie_false['sentence']),
                               'labels':['0' for i in range(len(doyle_false))]})
    
    doyle_pair_data = pd.DataFrame({'sentence1':list(doyle_left['sentence']), 
                                'sentence2':list(doyle_right['sentence']),
                               'labels':['1' for i in range(len(doyle_right))]})
    
    christie_pair_data = pd.DataFrame({'sentence1':list(christie_left['sentence']), 
                                'sentence2':list(christie_right['sentence']),
                               'labels':['1' for i in range(len(christie_right))]})
    
    pairs = pd.concat([false_pair_data, doyle_pair_data, christie_pair_data], ignore_index=True)
    
    return pairs

In [7]:
# это код из статьи

def process_dataset(dataset, model, vocabulary, inverse_vocabulary):
    
    processed_dataset = []
    
    for index, row in tqdm(dataset.iterrows()):
        pair = []
        questions = [row['sentence1'], row['sentence2']]
        for question in questions:

            q2n = []  # q2n -> question numbers representation
            for word in question.lower().split():

                # Check for unwanted words
                if word not in model.key_to_index.keys():
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            
            pair.append(torch.tensor(q2n))
        processed_dataset.append(pair)
                    
    return vocabulary, inverse_vocabulary, processed_dataset


def prepare_embeddings(model, train, valid, test):
    vocabulary = dict()
    inverse_vocabulary = ['<unk>']

    # Iterate over the questions only of both training and test datasets
    vocabulary, inverse_vocabulary, processed_train = process_dataset(train, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_valid = process_dataset(valid, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_test = process_dataset(test, model, vocabulary, inverse_vocabulary)

    embedding_dim = model.vector_size
    embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabulary.items():
        if word in model.key_to_index.keys():
            embeddings[index] = model.word_vec(word)

    return embeddings, embedding_dim, processed_train, processed_valid, processed_test

In [8]:
# это код из тетрадки Семена Сорокина?

def padding(sequence, sequence_length=60):

    if len(sequence)< sequence_length:
        add_pad = sequence_length - len(sequence)
        return torch.cat((sequence, torch.tensor([0,]*add_pad)), 0)
    else:
        return sequence[:sequence_length]

In [9]:
class PairDataset(Dataset):
    def __init__(self, data, labels):
        self.labels = torch.tensor([int(l) for l in labels], dtype=torch.int64)
        self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
        self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        label = self.labels[idx]
        left = self.left[idx]
        right = self.right[idx]

        return left, right, label

In [10]:
class SiameseClassifier(torch.nn.Module):
    def __init__(self, matrix, lstm_size): 
        super().__init__()
        
        self.emb_layer = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix))
        self.LSTM = torch.nn.LSTM(300, lstm_size, num_layers=2, bidirectional=False)
        
        
    def forward(self, left, right):

        encoded_left = self.emb_layer(left)
        encoded_right = self.emb_layer(right)
        
        out_left, _ = self.LSTM(encoded_left)
        out_right, _ = self.LSTM(encoded_right)
        
        distance = torch.abs(torch.add(out_left, out_right.neg())).sum(axis=(1, 2)).neg().exp()

        return distance

In [11]:
# это код с ниса Семена Сорокина

def train_model(model, criterion, optimizer, epochs, train_loader, valid_loader):
    
    model = model.to(device)    
    criterion = criterion.to(device)

    losses = []
    best_test_loss = 10.

    test_f1 = []

    for n_epoch in tqdm(range(epochs)):

        train_losses = []
        test_losses = []

        model.train()

        for left, right, y in train_loader:

            left = left.to(device)
            right = right.to(device)
            y = torch.tensor(y, dtype=torch.float32)
            y = y.to(device)

            optimizer.zero_grad()

            pred = model(left, right)
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

        model.eval()

        for left, right, y in valid_loader:

            left = left.to(device)
            right = right.to(device)

            with torch.no_grad():

                pred = model(left, right)
                pred = pred.cpu()
                
                y = y.cpu()

                loss = criterion(pred, y)
                test_losses.append(loss.item())

        mean_test_loss = np.mean(test_losses)

        print()
        print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

In [12]:
def test_accuracy(model, test_loader):
    test_targets = []
    test_losses = []
    criterion = torch.nn.MSELoss().to(device)
    logits = []

    for left, right, y in test_loader:

        left = left.to(device)
        right = right.to(device)
        y = torch.tensor(y, dtype=torch.float32)

        with torch.no_grad():
            
            pred = torch.squeeze(model(left, right))
            pred = pred.cpu()
            logits.append(pred)
            test_targets.append(y)

            loss = criterion(pred, y)
            test_losses.append(loss.item())

            
    double_pred_scores = [[1-s, s] for s in np.concatenate(logits).squeeze()]
    print('logloss: ', log_loss(y_pred = double_pred_scores, y_true = torch.cat(test_targets)))
    
    accuracy = [round(s) == y for s, y in zip(np.concatenate(logits).squeeze(), torch.cat(test_targets))]
    print('accuracy: ', accuracy.count(True)/len(accuracy))

In [13]:
def siamese_model(data, test_data, labels, wv):
    
    # turn into dataset with pairs
    train_pairs = make_pairs(data=data, labels=labels)
    test_pairs = make_pairs(data=test_data, labels=labels)
    valid_pairs = train_pairs.sample(frac = 0.1)
    train_pairs = train_pairs.drop(valid_pairs.index)
    
    # embeddings
    embeddings, embedding_dim, proc_train, proc_valid, proc_test = prepare_embeddings(model=wv, 
                                                                                 train=train_pairs, 
                                                                                 valid=valid_pairs,
                                                                                 test=test_pairs)
    # padding examples
    proc_train = [[padding(row[0]), padding(row[1])] for row in proc_train]
    proc_valid = [[padding(row[0]), padding(row[1])] for row in proc_valid]
    proc_test = [[padding(row[0]), padding(row[1])] for row in proc_test]
    
    
    # prepare dataloaders
    train_dataset = PairDataset(proc_train, train_pairs['labels'])
    train_loader = DataLoader(train_dataset, batch_size=64)

    valid_dataset = PairDataset(proc_valid, valid_pairs['labels'])
    valid_loader = DataLoader(valid_dataset, batch_size=64)

    test_dataset = PairDataset(proc_test, test_pairs['labels'])
    test_loader = DataLoader(test_dataset, batch_size=64)
    
    model = SiameseClassifier(embeddings, 100)
    criterion = torch.nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(params=model.parameters())
    
    train_model(model=model, 
                criterion=criterion, 
                optimizer=optimizer, 
                epochs=30, 
                train_loader=train_loader,
                valid_loader=valid_loader)
    
    test_accuracy(model=model, test_loader=test_loader)

# main

In [14]:
siamese_model(data=data, test_data=test_data, labels=['Doyle', 'Christie'], wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
  self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]


  0%|          | 0/30 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.373, test - 0.353

Losses: train - 0.519, test - 0.481

Losses: train - 0.237, test - 0.421

Losses: train - 0.529, test - 0.474

Losses: train - 0.185, test - 0.447

Losses: train - 0.539, test - 0.425

Losses: train - 0.262, test - 0.448

Losses: train - 0.324, test - 0.430

Losses: train - 0.277, test - 0.430

Losses: train - 0.265, test - 0.463

Losses: train - 0.264, test - 0.471

Losses: train - 0.223, test - 0.443

Losses: train - 0.291, test - 0.386

Losses: train - 0.241, test - 0.417

Losses: train - 0.207, test - 0.491

Losses: train - 0.637, test - 0.466

Losses: train - 0.344, test - 0.471

Losses: train - 0.520, test - 0.459

Losses: train - 0.484, test - 0.454

Losses: train - 0.394, test - 0.398

Losses: train - 0.188, test - 0.469

Losses: train - 0.361, test - 0.438

Losses: train - 0.354, test - 0.423

Losses: train - 0.217, test - 0.459

Losses: train - 0.247, test - 0.450

Losses: train - 0.340, test - 0.433

Losses: train - 0.367, test - 0.429



  y = torch.tensor(y, dtype=torch.float32)
