In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import gensim.downloader
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda')

In [3]:
wv = gensim.downloader.load('word2vec-google-news-300')

# Load data

In [4]:
data = pd.read_csv("Doyle_Christie_dataset/train.csv")
data = data.drop(columns=['Unnamed: 0',])
data['labels'] = data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)

In [5]:
test_data = pd.read_csv("Doyle_Christie_dataset/test.csv")
test_data = test_data.drop(columns=['Unnamed: 0',])
test_data['labels'] = test_data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)

# Functions

In [6]:
# это код из статьи

def process_dataset(dataset, model, vocabulary, inverse_vocabulary):
    
    processed_dataset = []
    
    for index, row in tqdm(dataset.iterrows()):
        sentence = row['sentence']
        q2n = []  # q2n -> question numbers representation
        for word in sentence.lower().split():

            # Check for unwanted words
            if word not in model.key_to_index.keys():
                continue

            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                q2n.append(len(inverse_vocabulary))
                inverse_vocabulary.append(word)
            else:
                q2n.append(vocabulary[word])
            
        processed_dataset.append(torch.tensor(q2n))
                    
    return vocabulary, inverse_vocabulary, processed_dataset


def prepare_embeddings(model, train, valid, test):
    vocabulary = dict()
    inverse_vocabulary = ['<unk>']

    # Iterate over the questions only of both training and test datasets
    vocabulary, inverse_vocabulary, processed_train = process_dataset(train, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_valid = process_dataset(valid, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_test = process_dataset(test, model, vocabulary, inverse_vocabulary)

    embedding_dim = model.vector_size
    embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabulary.items():
        if word in model.key_to_index.keys():
            embeddings[index] = model.word_vec(word)

    return embeddings, embedding_dim, processed_train, processed_valid, processed_test

In [7]:
# это код из тетрадки Семена Сорокина?

def padding(sequence, sequence_length=60):

    if len(sequence)< sequence_length:
        add_pad = sequence_length - len(sequence)
        return torch.cat((sequence, torch.tensor([0,]*add_pad)), 0)
    else:
        return sequence[:sequence_length]

In [8]:
class GendersDataset(Dataset):
    def __init__(self, data, labels):
        self.labels = torch.tensor([int(l) for l in labels], dtype=torch.int64)
        self.data = [torch.tensor(row, dtype=torch.int64) for row in data]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        label = self.labels[idx]
        data = self.data[idx]

        return label, data

In [9]:
class CNN(nn.Module):
    def __init__(self, matrix, embedding_dim, n_filters, filter_sizes, output_dim, dropout_proba):
        super().__init__()
        
        self.embedding = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix))
        
        self.conv_0 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[0], embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[1], embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout_proba)
        
    def forward(self, x):
        #x = [batch, sent_len]
        embedded = self.embedding(x) # [batch, sent_len, emb_dim]

        embedded = embedded.unsqueeze(1) # [batch, 1, sent_len, emb]
        
        # self.conv_0(embedded).shape # [batch, n_filters, sent_len-1, 1]
               
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3)) # [batch, n_filters, sent_len-1]
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3)) # [batch, n_filters, sent_len-2]
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3)) # [batch, n_filters, sent_len-3]
            
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2) # [batch, n_filters]
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2) # [batch, n_filters]
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2) # [batch, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1)) # [batch, 3*n_filters]
        
        return self.fc(cat)

In [15]:
# это код с ниса Семена Сорокина

def train_model(model, criterion, optimizer, epochs, train_loader, valid_loader):
    
    model = model.to(device)    
    criterion = criterion.to(device)

    losses = []
    best_test_loss = 10.

    test_f1 = []

    for n_epoch in tqdm(range(epochs)):

        train_losses = []
        test_losses = []

        model.train()

        for y, data in train_loader:

            data = data.to(device)
            y = torch.tensor(y, dtype=torch.float32)
            y = y.to(device) # [batch]
            
            optimizer.zero_grad()

            pred = torch.squeeze(model(data))
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

        model.eval()

        for y, data in valid_loader:

            data = data.to(device)
            y = torch.tensor(y, dtype=torch.float32)

            with torch.no_grad():


                pred = torch.squeeze(model(data))
                pred = pred.cpu()
                y = y.cpu()

                loss = criterion(pred, y)
                test_losses.append(loss.item())

        mean_test_loss = np.mean(test_losses)

        print()
        print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

In [16]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def test_accuracy(model, test_loader):
    test_targets = []
    test_losses = []
    criterion = torch.nn.BCEWithLogitsLoss().to(device)
    preds = []

    for y, d in test_loader:

        d = d.to(device)
        y = torch.tensor(y, dtype=torch.float32)

        with torch.no_grad():

            pred = torch.squeeze(model(d))
            pred = pred.cpu()
            preds.append(pred)
            test_targets.append(y)

            loss = criterion(pred, y)
            test_losses.append(loss.item())
        
        
    print('accuracy: ', binary_accuracy(torch.cat(preds), torch.cat(test_targets)))

In [17]:
def main(data, test_data, wv):
    
    # split data into train and valid
    valid_data = data.sample(frac = 0.12)
    data = data.drop(valid_data.index)
    
    # embedding
    embeddings, embedding_dim, proc_train, proc_valid, proc_test = prepare_embeddings(model=wv, 
                                                                                     train=data, 
                                                                                     valid=valid_data,
                                                                                     test=test_data)
    
    # padding
    proc_train = [padding(row) for row in proc_train]
    proc_valid = [padding(row) for row in proc_valid]
    proc_test = [padding(row) for row in proc_test]
    
    # make dataloaders
    train_dataset = GendersDataset(proc_train, data['labels'])
    train_loader = DataLoader(train_dataset, batch_size=64)

    valid_dataset = GendersDataset(proc_valid, valid_data['labels'])
    valid_loader = DataLoader(valid_dataset, batch_size=64)

    test_dataset = GendersDataset(proc_test, test_data['labels'])
    test_loader = DataLoader(test_dataset, batch_size=64)
    
    
    # training
    model = CNN(embeddings, 300, 13, [2, 3, 4], 1, 0.25)
    train_model(model=model, 
                criterion=torch.nn.BCEWithLogitsLoss(),
                optimizer=torch.optim.Adam(params=model.parameters()),
                epochs=50,
                train_loader=train_loader,
                valid_loader=valid_loader)
    
    # accuracy
    test_accuracy(model, test_loader)

In [18]:
main(data=data, test_data=test_data, wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.data = [torch.tensor(row, dtype=torch.int64) for row in data]


  0%|          | 0/50 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.464, test - 3.139

Losses: train - 1.011, test - 1.483

Losses: train - 0.770, test - 1.520

Losses: train - 0.835, test - 0.744

Losses: train - 0.686, test - 0.739

Losses: train - 0.655, test - 1.141

Losses: train - 0.728, test - 0.747

Losses: train - 0.656, test - 0.763

Losses: train - 0.650, test - 0.782

Losses: train - 0.639, test - 0.830

Losses: train - 0.635, test - 0.775

Losses: train - 0.605, test - 1.204

Losses: train - 0.643, test - 0.820

Losses: train - 0.607, test - 0.807

Losses: train - 0.595, test - 0.803

Losses: train - 0.588, test - 0.880

Losses: train - 0.584, test - 0.837

Losses: train - 0.583, test - 0.819

Losses: train - 0.566, test - 0.875

Losses: train - 0.571, test - 0.989

Losses: train - 0.563, test - 0.841

Losses: train - 0.545, test - 0.838

Losses: train - 0.538, test - 0.871

Losses: train - 0.531, test - 0.869

Losses: train - 0.525, test - 0.908

Losses: train - 0.516, test - 0.941

Losses: train - 0.513, test - 0.882



  y = torch.tensor(y, dtype=torch.float32)
