In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import numpy as np
import gensim.downloader
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import log_loss

In [2]:
device = torch.device('cuda')

In [3]:
wv = gensim.downloader.load('word2vec-google-news-300')

# Load data

## Doyle & Christie

In [4]:
def process_Doyle_Christie():
    data = pd.read_csv("Doyle_Christie_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    test_data = pd.read_csv("Doyle_Christie_dataset/test.csv")
    test_data = test_data.drop(columns=['Unnamed: 0',])
    test_data['labels'] = test_data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    return data, test_data

In [5]:
train_dc, test_dc = process_Doyle_Christie()

## Letters

In [6]:
def process_Letters():
    old_eng = pd.read_csv('old_english_dataset.csv')
    old_eng = old_eng.drop(columns=['Unnamed: 0',])
    old_eng['labels'] = old_eng['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    equal = old_eng[old_eng['labels'] == 1].sample(n = 2806)
    train_old_eng = pd.concat([equal, old_eng[old_eng['labels'] == 0]], ignore_index=True)
    
    women = train_old_eng[train_old_eng['labels'] == 0]
    men = train_old_eng[train_old_eng['labels'] == 1]
    
    test_women = women.sample(frac=0.1)
    train_women = women.drop(test_women.index)
    test_men = men.sample(frac=0.1)
    train_men = men.drop(test_men.index)
    
    train_old_eng = pd.concat([train_women, train_men], ignore_index=True)
    test_old_eng = pd.concat([test_women, test_men], ignore_index=True)
    
    return train_old_eng, test_old_eng

In [7]:
train_letters, test_letters = process_Letters()

## Modern

In [4]:
def process_Modern():
    data = pd.read_csv("Modern_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    train_data = pd.concat([data[data['labels'] == 1].sample(n=30941), data[data['labels']==0]], 
                       ignore_index=True)
    
    test = pd.read_csv("Modern_dataset/test.csv")
    test = test.drop(columns=['Unnamed: 0',])
    test['labels'] = test['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    test_data = pd.concat([test[test['labels'] == 0].sample(n=5000), test[test['labels'] == 1].sample(n=5000)], 
                      ignore_index=True)
    
    return train_data, test_data

In [5]:
train_modern, test_modern = process_Modern()

# Functions

In [6]:
def make_pairs(data, labels, column):
    doyle = data[data[column] == labels[0]]
    christie = data[data[column] == labels[1]]
    
    doyle_true = doyle.sample(frac = 0.5)
    doyle_false = doyle.drop(doyle_true.index)
    
    christie_true = christie.sample(frac = 0.5)
    christie_false = christie.drop(christie_true.index)
    
    doyle_left = doyle_true.sample(frac=0.5)
    doyle_right = doyle_true.drop(doyle_left.index)
    
    christie_left = christie_true.sample(frac=0.5)
    christie_right = christie_true.drop(christie_left.index)
    
    false_pair_data = pd.DataFrame({'sentence1':list(doyle_false['sentence']), 
                                'sentence2':list(christie_false['sentence']),
                               'labels':['0' for i in range(len(doyle_false))]})
    
    doyle_pair_data = pd.DataFrame({'sentence1':list(doyle_left['sentence']), 
                                'sentence2':list(doyle_right['sentence']),
                               'labels':['1' for i in range(len(doyle_right))]})
    
    christie_pair_data = pd.DataFrame({'sentence1':list(christie_left['sentence']), 
                                'sentence2':list(christie_right['sentence']),
                               'labels':['1' for i in range(len(christie_right))]})
    
    pairs = pd.concat([false_pair_data, doyle_pair_data, christie_pair_data], ignore_index=True)
    
    return pairs

In [7]:
# это код из статьи

def process_dataset(dataset, model, vocabulary, inverse_vocabulary):
    
    processed_dataset = []
    
    for index, row in tqdm(dataset.iterrows()):
        pair = []
        questions = [row['sentence1'], row['sentence2']]
        for question in questions:

            q2n = []  # q2n -> question numbers representation
            for word in question.lower().split():

                # Check for unwanted words
                if word not in model.key_to_index.keys():
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            
            pair.append(torch.tensor(q2n))
        processed_dataset.append(pair)
                    
    return vocabulary, inverse_vocabulary, processed_dataset


def prepare_embeddings(model, train, valid, test):
    vocabulary = dict()
    inverse_vocabulary = ['<unk>']

    # Iterate over the questions only of both training and test datasets
    vocabulary, inverse_vocabulary, processed_train = process_dataset(train, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_valid = process_dataset(valid, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_test = process_dataset(test, model, vocabulary, inverse_vocabulary)

    embedding_dim = model.vector_size
    embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabulary.items():
        if word in model.key_to_index.keys():
            embeddings[index] = model.word_vec(word)

    return embeddings, embedding_dim, processed_train, processed_valid, processed_test

In [8]:
# это код из тетрадки Семена Сорокина?

def padding(sequence, sequence_length=60):

    if len(sequence)< sequence_length:
        add_pad = sequence_length - len(sequence)
        return torch.cat((sequence, torch.tensor([0,]*add_pad)), 0)
    else:
        return sequence[:sequence_length]

In [9]:
class PairDataset(Dataset):
    def __init__(self, data, labels):
        self.labels = torch.tensor([int(l) for l in labels], dtype=torch.int64)
        self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
        self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        label = self.labels[idx]
        left = self.left[idx]
        right = self.right[idx]

        return left, right, label

In [10]:
class SiameseClassifier(torch.nn.Module):
    def __init__(self, matrix, lstm_size): 
        super().__init__()
        
        self.emb_layer = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix))
        self.LSTM = torch.nn.LSTM(300, lstm_size, num_layers=2, bidirectional=False)
        
        
    def forward(self, left, right):

        encoded_left = self.emb_layer(left)
        encoded_right = self.emb_layer(right)
        
        out_left, _ = self.LSTM(encoded_left)
        out_right, _ = self.LSTM(encoded_right)
        
        distance = torch.abs(torch.add(out_left, out_right.neg())).sum(axis=(1, 2)).neg().exp()

        return distance

In [11]:
# это код с ниса Семена Сорокина

def train_model(model, criterion, optimizer, epochs, train_loader, valid_loader):
    
    model = model.to(device)    
    criterion = criterion.to(device)

    losses = []
    best_test_loss = 10.

    test_f1 = []

    for n_epoch in tqdm(range(epochs)):

        train_losses = []
        test_losses = []

        model.train()

        for left, right, y in train_loader:

            left = left.to(device)
            right = right.to(device)
            y = torch.tensor(y, dtype=torch.float32)
            y = y.to(device)

            optimizer.zero_grad()

            pred = model(left, right)
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

        model.eval()

        for left, right, y in valid_loader:

            left = left.to(device)
            right = right.to(device)

            with torch.no_grad():

                pred = model(left, right)
                pred = pred.cpu()
                
                y = y.cpu()

                loss = criterion(pred, y)
                test_losses.append(loss.item())

        mean_test_loss = np.mean(test_losses)

        print()
        print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

In [12]:
def test_accuracy(model, test_loader):
    test_targets = []
    test_losses = []
    criterion = torch.nn.MSELoss().to(device)
    logits = []

    for left, right, y in test_loader:

        left = left.to(device)
        right = right.to(device)
        y = torch.tensor(y, dtype=torch.float32)

        with torch.no_grad():
            
            pred = torch.squeeze(model(left, right))
            pred = pred.cpu()
            logits.append(pred)
            test_targets.append(y)

            loss = criterion(pred, y)
            test_losses.append(loss.item())

            
    double_pred_scores = [[1-s, s] for s in np.concatenate(logits).squeeze()]
    print('logloss: ', log_loss(y_pred = double_pred_scores, y_true = torch.cat(test_targets)))
    
    accuracy = [round(s) == y for s, y in zip(np.concatenate(logits).squeeze(), torch.cat(test_targets))]
    print('accuracy: ', accuracy.count(True)/len(accuracy))

In [13]:
def Siamese_pipeline(data, test_data, labels, column, wv):
    
    # turn into dataset with pairs
    train_pairs = make_pairs(data=data, labels=labels, column=column)
    test_pairs = make_pairs(data=test_data, labels=labels, column=column)
    valid_pairs = train_pairs.sample(frac = 0.1)
    train_pairs = train_pairs.drop(valid_pairs.index)
    
    # embeddings
    embeddings, embedding_dim, proc_train, proc_valid, proc_test = prepare_embeddings(model=wv, 
                                                                                 train=train_pairs, 
                                                                                 valid=valid_pairs,
                                                                                 test=test_pairs)
    # padding examples
    proc_train = [[padding(row[0]), padding(row[1])] for row in proc_train]
    proc_valid = [[padding(row[0]), padding(row[1])] for row in proc_valid]
    proc_test = [[padding(row[0]), padding(row[1])] for row in proc_test]
    
    
    # prepare dataloaders
    train_dataset = PairDataset(proc_train, train_pairs['labels'])
    train_loader = DataLoader(train_dataset, batch_size=64)

    valid_dataset = PairDataset(proc_valid, valid_pairs['labels'])
    valid_loader = DataLoader(valid_dataset, batch_size=64)

    test_dataset = PairDataset(proc_test, test_pairs['labels'])
    test_loader = DataLoader(test_dataset, batch_size=64)
    
    model = SiameseClassifier(embeddings, 100)
    criterion = torch.nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(params=model.parameters())
    
    train_model(model=model, 
                criterion=criterion, 
                optimizer=optimizer, 
                epochs=30, 
                train_loader=train_loader,
                valid_loader=valid_loader)
    
    test_accuracy(model=model, test_loader=test_loader)

# main

In [16]:
Siamese_pipeline(data=train_dc, test_data=test_dc, labels=['Doyle', 'Christie'], column='author', wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
  self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]


  0%|          | 0/30 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.339, test - 0.356

Losses: train - 0.515, test - 0.506

Losses: train - 0.232, test - 0.394

Losses: train - 0.517, test - 0.500

Losses: train - 0.236, test - 0.402

Losses: train - 0.510, test - 0.418

Losses: train - 0.260, test - 0.399

Losses: train - 0.520, test - 0.477

Losses: train - 0.225, test - 0.410

Losses: train - 0.501, test - 0.332

Losses: train - 0.297, test - 0.401

Losses: train - 0.407, test - 0.364

Losses: train - 0.367, test - 0.364

Losses: train - 0.598, test - 0.501

Losses: train - 0.349, test - 0.334

Losses: train - 0.301, test - 0.374

Losses: train - 0.269, test - 0.411

Losses: train - 0.378, test - 0.387

Losses: train - 0.307, test - 0.422

Losses: train - 0.285, test - 0.429

Losses: train - 0.426, test - 0.348

Losses: train - 0.252, test - 0.383

Losses: train - 0.227, test - 0.435

Losses: train - 0.372, test - 0.376

Losses: train - 0.259, test - 0.408

Losses: train - 0.375, test - 0.379

Losses: train - 0.262, test - 0.412



  y = torch.tensor(y, dtype=torch.float32)


In [17]:
Siamese_pipeline(data=train_letters, test_data=test_letters, labels=['f', 'm'], column='gender', wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
  self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]


  0%|          | 0/30 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.509, test - 0.484

Losses: train - 0.508, test - 0.473

Losses: train - 0.437, test - 0.263

Losses: train - 0.519, test - 0.480

Losses: train - 0.410, test - 0.253

Losses: train - 0.479, test - 0.302

Losses: train - 0.473, test - 0.294

Losses: train - 0.507, test - 0.440

Losses: train - 0.328, test - 0.269

Losses: train - 0.537, test - 0.460

Losses: train - 0.345, test - 0.259

Losses: train - 0.516, test - 0.415

Losses: train - 0.304, test - 0.280

Losses: train - 0.519, test - 0.410

Losses: train - 0.310, test - 0.276

Losses: train - 0.523, test - 0.407

Losses: train - 0.314, test - 0.274

Losses: train - 0.535, test - 0.438

Losses: train - 0.346, test - 0.263

Losses: train - 0.516, test - 0.395

Losses: train - 0.318, test - 0.273

Losses: train - 0.503, test - 0.384

Losses: train - 0.328, test - 0.272

Losses: train - 0.517, test - 0.355

Losses: train - 0.327, test - 0.261

Losses: train - 0.439, test - 0.258

Losses: train - 0.346, test - 0.254



  y = torch.tensor(y, dtype=torch.float32)


In [14]:
Siamese_pipeline(data=train_modern, test_data=test_modern, labels=['f', 'm'], column='gender', wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.left = [torch.tensor(row[0], dtype=torch.int64) for row in data]
  self.right = [torch.tensor(row[1], dtype=torch.int64) for row in data]


  0%|          | 0/30 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.159, test - 0.470

Losses: train - 0.276, test - 0.469

Losses: train - 0.224, test - 0.477

Losses: train - 0.150, test - 0.485

Losses: train - 0.091, test - 0.479

Losses: train - 0.134, test - 0.484

Losses: train - 0.137, test - 0.486

Losses: train - 0.141, test - 0.484

Losses: train - 0.116, test - 0.485

Losses: train - 0.165, test - 0.490

Losses: train - 0.092, test - 0.484

Losses: train - 0.113, test - 0.483

Losses: train - 0.102, test - 0.486

Losses: train - 0.090, test - 0.488

Losses: train - 0.140, test - 0.487

Losses: train - 0.183, test - 0.485

Losses: train - 0.076, test - 0.488

Losses: train - 0.071, test - 0.489

Losses: train - 0.084, test - 0.489

Losses: train - 0.055, test - 0.491

Losses: train - 0.095, test - 0.488

Losses: train - 0.063, test - 0.486

Losses: train - 0.074, test - 0.487

Losses: train - 0.097, test - 0.483

Losses: train - 0.070, test - 0.485

Losses: train - 0.099, test - 0.486

Losses: train - 0.075, test - 0.487



  y = torch.tensor(y, dtype=torch.float32)
