In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import gensim.downloader
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda')

In [3]:
wv = gensim.downloader.load('word2vec-google-news-300')

# Load data

## Doyle & Christie

In [4]:
def process_Doyle_Christie():
    data = pd.read_csv("Doyle_Christie_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    test_data = pd.read_csv("Doyle_Christie_dataset/test.csv")
    test_data = test_data.drop(columns=['Unnamed: 0',])
    test_data['labels'] = test_data['author'].apply(lambda x: 0 if x == 'Doyle' else 1)
    
    return data, test_data

In [5]:
train_dc, test_dc = process_Doyle_Christie()

## Letters

In [6]:
def process_Letters():
    old_eng = pd.read_csv('old_english_dataset.csv')
    old_eng = old_eng.drop(columns=['Unnamed: 0',])
    old_eng['labels'] = old_eng['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    equal = old_eng[old_eng['labels'] == 1].sample(n = 2806)
    train_old_eng = pd.concat([equal, old_eng[old_eng['labels'] == 0]], ignore_index=True)
    
    women = train_old_eng[train_old_eng['labels'] == 0]
    men = train_old_eng[train_old_eng['labels'] == 1]
    
    test_women = women.sample(frac=0.1)
    train_women = women.drop(test_women.index)
    test_men = men.sample(frac=0.1)
    train_men = men.drop(test_men.index)
    
    train_old_eng = pd.concat([train_women, train_men], ignore_index=True)
    test_old_eng = pd.concat([test_women, test_men], ignore_index=True)
    
    return train_old_eng, test_old_eng

In [7]:
train_letters, test_letters = process_Letters()

## Modern

In [4]:
def process_Modern():
    data = pd.read_csv("Modern_dataset/train.csv")
    data = data.drop(columns=['Unnamed: 0',])
    data['labels'] = data['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    train_data = pd.concat([data[data['labels'] == 1].sample(n=30941), data[data['labels']==0]], 
                       ignore_index=True)
    
    test = pd.read_csv("Modern_dataset/test.csv")
    test = test.drop(columns=['Unnamed: 0',])
    test['labels'] = test['gender'].apply(lambda x: 0 if x == 'f' else 1)
    
    test_data = pd.concat([test[test['labels'] == 0].sample(n=5000), test[test['labels'] == 1].sample(n=5000)], 
                      ignore_index=True)
    
    return train_data, test_data

In [5]:
train_modern, test_modern = process_Modern()

# Functions

In [6]:
# это код из статьи

def process_dataset(dataset, model, vocabulary, inverse_vocabulary):
    
    processed_dataset = []
    
    for index, row in tqdm(dataset.iterrows()):
        sentence = row['sentence']
        q2n = []  # q2n -> question numbers representation
        for word in sentence.lower().split():

            # Check for unwanted words
            if word not in model.key_to_index.keys():
                continue

            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                q2n.append(len(inverse_vocabulary))
                inverse_vocabulary.append(word)
            else:
                q2n.append(vocabulary[word])
            
        processed_dataset.append(torch.tensor(q2n))
                    
    return vocabulary, inverse_vocabulary, processed_dataset


def prepare_embeddings(model, train, valid, test):
    vocabulary = dict()
    inverse_vocabulary = ['<unk>']

    # Iterate over the questions only of both training and test datasets
    vocabulary, inverse_vocabulary, processed_train = process_dataset(train, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_valid = process_dataset(valid, model, vocabulary, inverse_vocabulary)
    vocabulary, inverse_vocabulary, processed_test = process_dataset(test, model, vocabulary, inverse_vocabulary)

    embedding_dim = model.vector_size
    embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabulary.items():
        if word in model.key_to_index.keys():
            embeddings[index] = model.word_vec(word)

    return embeddings, embedding_dim, processed_train, processed_valid, processed_test

In [7]:
# это код из тетрадки Семена Сорокина?

def padding(sequence, sequence_length=60):

    if len(sequence)< sequence_length:
        add_pad = sequence_length - len(sequence)
        return torch.cat((sequence, torch.tensor([0,]*add_pad)), 0)
    else:
        return sequence[:sequence_length]

In [8]:
class GendersDataset(Dataset):
    def __init__(self, data, labels):
        self.labels = torch.tensor([int(l) for l in labels], dtype=torch.int64)
        self.data = [torch.tensor(row, dtype=torch.int64) for row in data]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        
        label = self.labels[idx]
        data = self.data[idx]

        return label, data

In [9]:
class CNN(nn.Module):
    def __init__(self, matrix, embedding_dim, n_filters, filter_sizes, output_dim, dropout_proba):
        super().__init__()
        
        self.embedding = torch.nn.Embedding.from_pretrained(torch.Tensor(matrix))
        
        self.conv_0 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[0], embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[1], embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, 
                                out_channels=n_filters, 
                                kernel_size=(filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout_proba)
        
    def forward(self, x):
        #x = [batch, sent_len]
        embedded = self.embedding(x) # [batch, sent_len, emb_dim]

        embedded = embedded.unsqueeze(1) # [batch, 1, sent_len, emb]
        
        # self.conv_0(embedded).shape # [batch, n_filters, sent_len-1, 1]
               
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3)) # [batch, n_filters, sent_len-1]
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3)) # [batch, n_filters, sent_len-2]
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3)) # [batch, n_filters, sent_len-3]
            
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2) # [batch, n_filters]
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2) # [batch, n_filters]
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2) # [batch, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1)) # [batch, 3*n_filters]
        
        return self.fc(cat)

In [10]:
# это код с ниса Семена Сорокина

def train_model(model, criterion, optimizer, epochs, train_loader, valid_loader):
    
    model = model.to(device)    
    criterion = criterion.to(device)

    losses = []
    best_test_loss = 10.

    test_f1 = []

    for n_epoch in tqdm(range(epochs)):

        train_losses = []
        test_losses = []

        model.train()

        for y, data in train_loader:

            data = data.to(device)
            y = torch.tensor(y, dtype=torch.float32)
            y = y.to(device) # [batch]
            
            optimizer.zero_grad()

            pred = torch.squeeze(model(data))
            loss = criterion(pred, y)

            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            losses.append(loss.item())

        model.eval()

        for y, data in valid_loader:

            data = data.to(device)
            y = torch.tensor(y, dtype=torch.float32)

            with torch.no_grad():


                pred = torch.squeeze(model(data))
                pred = pred.cpu()
                y = y.cpu()

                loss = criterion(pred, y)
                test_losses.append(loss.item())

        mean_test_loss = np.mean(test_losses)

        print()
        print('Losses: train - {:.3f}, test - {:.3f}'.format(np.mean(train_losses), mean_test_loss))

In [11]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def test_accuracy(model, test_loader):
    test_targets = []
    test_losses = []
    criterion = torch.nn.BCEWithLogitsLoss().to(device)
    preds = []

    for y, d in test_loader:

        d = d.to(device)
        y = torch.tensor(y, dtype=torch.float32)

        with torch.no_grad():

            pred = torch.squeeze(model(d))
            pred = pred.cpu()
            preds.append(pred)
            test_targets.append(y)

            loss = criterion(pred, y)
            test_losses.append(loss.item())
        
        
    print('accuracy: ', binary_accuracy(torch.cat(preds), torch.cat(test_targets)))

In [12]:
def CNN_pipeline(data, test_data, wv):
    
    # split data into train and valid
    valid_data = data.sample(frac = 0.12)
    data = data.drop(valid_data.index)
    
    # embedding
    embeddings, embedding_dim, proc_train, proc_valid, proc_test = prepare_embeddings(model=wv, 
                                                                                     train=data, 
                                                                                     valid=valid_data,
                                                                                     test=test_data)
    
    # padding
    proc_train = [padding(row) for row in proc_train]
    proc_valid = [padding(row) for row in proc_valid]
    proc_test = [padding(row) for row in proc_test]
    
    # make dataloaders
    train_dataset = GendersDataset(proc_train, data['labels'])
    train_loader = DataLoader(train_dataset, batch_size=64)

    valid_dataset = GendersDataset(proc_valid, valid_data['labels'])
    valid_loader = DataLoader(valid_dataset, batch_size=64)

    test_dataset = GendersDataset(proc_test, test_data['labels'])
    test_loader = DataLoader(test_dataset, batch_size=64)
    
    
    # training
    model = CNN(embeddings, 300, 13, [2, 3, 4], 1, 0.25)
    train_model(model=model, 
                criterion=torch.nn.BCEWithLogitsLoss(),
                optimizer=torch.optim.Adam(params=model.parameters()),
                epochs=50,
                train_loader=train_loader,
                valid_loader=valid_loader)
    
    # accuracy
    test_accuracy(model, test_loader)

In [15]:
CNN_pipeline(data=train_dc, test_data=test_dc, wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.data = [torch.tensor(row, dtype=torch.int64) for row in data]


  0%|          | 0/50 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.484, test - 3.020

Losses: train - 0.960, test - 1.244

Losses: train - 0.832, test - 0.708

Losses: train - 0.637, test - 2.845

Losses: train - 0.856, test - 2.111

Losses: train - 0.848, test - 1.002

Losses: train - 0.694, test - 0.755

Losses: train - 0.661, test - 0.791

Losses: train - 0.641, test - 1.260

Losses: train - 0.702, test - 0.845

Losses: train - 0.639, test - 0.770

Losses: train - 0.623, test - 0.774

Losses: train - 0.616, test - 0.775

Losses: train - 0.606, test - 0.923

Losses: train - 0.610, test - 0.817

Losses: train - 0.623, test - 0.916

Losses: train - 0.595, test - 0.769

Losses: train - 0.583, test - 0.787

Losses: train - 0.619, test - 1.070

Losses: train - 0.581, test - 0.794

Losses: train - 0.564, test - 0.841

Losses: train - 0.551, test - 1.004

Losses: train - 0.551, test - 0.848

Losses: train - 0.537, test - 0.836

Losses: train - 0.533, test - 0.841

Losses: train - 0.522, test - 0.872

Losses: train - 0.531, test - 0.917



  y = torch.tensor(y, dtype=torch.float32)


In [16]:
CNN_pipeline(data=train_letters, test_data=test_letters, wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.data = [torch.tensor(row, dtype=torch.int64) for row in data]


  0%|          | 0/50 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.878, test - 0.692

Losses: train - 0.700, test - 0.690

Losses: train - 0.696, test - 0.683

Losses: train - 0.705, test - 0.677

Losses: train - 0.678, test - 0.715

Losses: train - 0.728, test - 0.663

Losses: train - 0.693, test - 0.648

Losses: train - 0.671, test - 0.634

Losses: train - 0.660, test - 0.629

Losses: train - 0.647, test - 0.625

Losses: train - 0.642, test - 0.625

Losses: train - 0.633, test - 0.623

Losses: train - 0.630, test - 0.609

Losses: train - 0.612, test - 0.606

Losses: train - 0.608, test - 0.597

Losses: train - 0.588, test - 0.595

Losses: train - 0.583, test - 0.619

Losses: train - 0.584, test - 0.588

Losses: train - 0.565, test - 0.581

Losses: train - 0.548, test - 0.576

Losses: train - 0.536, test - 0.575

Losses: train - 0.532, test - 0.574

Losses: train - 0.521, test - 0.568

Losses: train - 0.508, test - 0.572

Losses: train - 0.499, test - 0.570

Losses: train - 0.487, test - 0.567

Losses: train - 0.477, test - 0.570



  y = torch.tensor(y, dtype=torch.float32)


In [13]:
CNN_pipeline(data=train_modern, test_data=test_modern, wv=wv)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  embeddings[index] = model.word_vec(word)
  self.data = [torch.tensor(row, dtype=torch.int64) for row in data]


  0%|          | 0/50 [00:00<?, ?it/s]

  y = torch.tensor(y, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32)



Losses: train - 0.205, test - 5.804

Losses: train - 0.495, test - 6.624

Losses: train - 0.515, test - 4.577

Losses: train - 0.481, test - 3.858

Losses: train - 0.461, test - 2.693

Losses: train - 0.429, test - 2.746

Losses: train - 0.425, test - 4.291

Losses: train - 0.429, test - 3.921

Losses: train - 0.402, test - 3.353

Losses: train - 0.391, test - 4.085

Losses: train - 0.377, test - 3.793

Losses: train - 0.457, test - 2.875

Losses: train - 0.411, test - 2.539

Losses: train - 0.415, test - 3.595

Losses: train - 0.373, test - 3.069

Losses: train - 0.475, test - 3.246

Losses: train - 0.400, test - 2.672

Losses: train - 0.334, test - 4.056

Losses: train - 0.397, test - 2.790

Losses: train - 0.372, test - 3.472

Losses: train - 0.361, test - 4.894

Losses: train - 0.415, test - 3.114

Losses: train - 0.345, test - 2.728

Losses: train - 0.347, test - 2.780

Losses: train - 0.379, test - 3.329

Losses: train - 0.365, test - 2.759

Losses: train - 0.342, test - 3.082



  y = torch.tensor(y, dtype=torch.float32)
