# Multi Author Writing Style Analysis

The following notebook presents three different approaches to the problem of detecting where, in a sequence of paragraphs,
the author changes. The first approach disregards the order of the paragraphs, opting instead to view samples as pairs of consecutive paragraphs.
It processes the paragraphs with a siamese network, which is a neural network that takes two inputs and outputs a single value.
The second approach adds a recurrent layer to the siamese network, allowing it to take into account a sequence of paragraphs.
The third approach builds on the second by augmenting the input with a manually engineered feature vector.

In [1]:
import os
import pickle
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

from src.utils import get_data

In [None]:
# Silence every warning for the rest of the session so library chatter does not
# clutter the cell outputs.
# NOTE(review): a blanket 'ignore' also hides genuine problems (e.g. deprecation
# or metric warnings a library may raise) — consider narrowing the filter.
warnings.filterwarnings('ignore')


## data

We load the data (uncomment to reconstruct the data from the raw files),
and make two data batch loaders:

1. Pairs of paragraphs, which will be used for our baseline siamese network.
2. Sequences of paragraphs, which will be used for our recurrent siamese network.


In [None]:
# To rebuild the cache from the raw files, uncomment the following lines:
# data = { str(i): get_data(i) for i in range(1, 4) }
# with open('data/data.pkl', 'wb') as data_file:
#     pickle.dump(data, data_file)

# NOTE: unpickling can execute arbitrary code — only load a data.pkl that you
# generated yourself with the snippet above.
# The context manager guarantees the file handle is closed after loading.
with open('data/data.pkl', 'rb') as data_file:
    data = pickle.load(data_file)
# The cache is keyed by dataset number as a string ('1', '2', '3').
dataset_1, dataset_2, dataset_3 = data['1'], data['2'], data['3']

In [None]:
def paired_samples(data_split):
    """Flatten every multi-paragraph problem into shuffled consecutive-paragraph pairs.

    Args:
        data_split: mapping of problem id -> dict holding the paragraphs under
            'text' and the per-boundary labels under ['truth']['changes'].

    Returns:
        A list of ``(paragraph_i, paragraph_i_plus_1, change_label)`` tuples in
        random order (shuffled with the global ``random`` generator).
    """
    flattened = []
    for problem in data_split.values():
        paragraphs = problem['text']
        changes = problem['truth']['changes']
        # TODO: fix. a few of the samples have more than one paragraph, making .readlines() wrong
        # Until then, only problems with exactly one label per paragraph boundary are kept.
        if len(changes) + 1 == len(paragraphs):
            flattened.extend(zip(paragraphs[:-1], paragraphs[1:], changes))
    random.shuffle(flattened)
    return flattened

In [None]:
def get_pair_batches(data_split, device, batch_size=32):
    """Endlessly yield shuffled mini-batches of consecutive-paragraph pairs.

    Args:
        data_split: one split of a dataset, in the format accepted by
            ``paired_samples``.
        device: torch device the batch tensors are moved to.
        batch_size: maximum number of pairs per batch (the last batch of an
            epoch may be smaller).

    Yields:
        ``((x1, x2), y)`` where ``x1``/``x2`` hold the first/second paragraph of
        each pair and ``y`` the corresponding change labels.
        Assumes each paragraph is a fixed-size numeric vector (presumably a
        precomputed embedding) so that ``np.array`` can stack them — verify
        against ``get_data``.

    Raises:
        ValueError: on the first ``next()`` if the split has no usable pairs.
            Previously the generator looped forever without yielding.
    """
    pairs = paired_samples(data_split)
    if not pairs:
        raise ValueError('data_split contains no usable paragraph pairs')
    # The tensors are identical for every epoch, so build and transfer them once
    # instead of on every pass through the endless loop.
    x1 = torch.tensor(np.array([p[0] for p in pairs])).to(device)
    x2 = torch.tensor(np.array([p[1] for p in pairs])).to(device)
    y = torch.tensor(np.array([p[2] for p in pairs])).to(device)
    while True:
        # A fresh permutation per epoch gives a new batch composition each pass.
        perm = torch.randperm(len(pairs))
        for start in range(0, len(pairs), batch_size):
            batch = perm[start:start + batch_size]
            yield (x1[batch], x2[batch]), y[batch]

In [None]:
def get_sequence_batches(data_split, device, batch_size=32):
    """Endlessly yield shuffled mini-batches of whole paragraph sequences.

    Args:
        data_split: mapping of problem id -> dict holding the paragraphs under
            'text' and the per-boundary labels under ['truth']['changes'].
        device: torch device the batch tensors are moved to.
        batch_size: maximum number of problems per batch.

    Yields:
        ``(x_batch, y_batch)``. ``x_batch`` is the zero-padded stack of the
        selected sequences with the batch dimension first. ``y_batch`` is the
        concatenation of their label vectors in the same order, so it holds one
        label per paragraph boundary and is NOT padded.

    Raises:
        ValueError: on the first ``next()`` if the split has no usable problem.
            Previously the generator looped forever without yielding.
    """
    x, y = [], []
    for problem in data_split.values():
        texts = problem['text']
        targets = problem['truth']['changes']
        # Skip problems whose label count does not match the paragraph boundaries.
        if len(texts) - 1 != len(targets):
            continue
        x.append(torch.tensor(texts))
        y.append(torch.tensor(targets))
    if not x:
        raise ValueError('data_split contains no usable paragraph sequences')
    while True:
        perm = torch.randperm(len(x)).tolist()
        for start in range(0, len(x), batch_size):
            batch = perm[start:start + batch_size]
            # Pad shorter sequences with zero vectors; the model detects padding
            # by looking for all-zero rows.
            x_batch = pad_sequence([x[j] for j in batch], batch_first=True, padding_value=0).to(device)
            y_batch = torch.cat([y[j] for j in batch], dim=0).to(device)
            yield x_batch, y_batch

## models

We make our two models, the siamese network and the recurrent siamese network.

In [None]:
class SiameseNet(torch.nn.Module):
    """Baseline siamese classifier for a pair of paragraph vectors.

    Both paragraphs go through the same first layer; the absolute difference of
    the two encodings is then mapped to a single probability.

    Args:
        config: dict providing 'hidden_dim' (layer width) and 'dropout'
            (dropout probability).
        embed_dim: size of each input paragraph vector.
    """

    def __init__(self, config, embed_dim=384):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.linear1 = torch.nn.Linear(self.embed_dim, self.hidden_dim)
        self.linear2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear3 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])
        self.sigmoid = torch.nn.Sigmoid()

    def _encode(self, paragraph):
        """Shared branch: dropout -> linear1 -> GELU."""
        return F.gelu(self.linear1(self.dropout(paragraph)))

    def forward(self, x, y=None):
        """Score a batch of pairs.

        Args:
            x: tuple ``(x1, x2)`` holding the two paragraphs of every pair.
            y: optional labels, one per pair.

        Returns:
            The probabilities with a trailing dimension of size 1, or
            ``(probabilities, loss)`` when ``y`` is given, where ``loss`` is the
            binary cross-entropy against ``y``.
        """
        first, second = x
        # The first paragraph is encoded before the second (left operand first).
        distance = torch.abs(self._encode(first) - self._encode(second))
        hidden = self.dropout(F.gelu(self.linear2(distance)))
        probs = self.sigmoid(self.linear3(hidden))
        if y is None:
            return probs
        # Labels get a trailing dimension so they line up with the output column.
        target = y.float().unsqueeze(1)
        return probs, F.binary_cross_entropy(probs, target)

    def predict(self, x):
        """Return hard 0/1 predictions, one per pair, thresholded at 0.5."""
        return (self.forward(x).squeeze(1) > 0.5).int()

In [None]:
class RecurrentSiameseNet(torch.nn.Module):
    """GRU-based classifier that scores every paragraph boundary of a sequence.

    Args:
        config: dict providing 'hidden_dim' (GRU / layer width) and 'dropout'
            (dropout probability).
        embed_dim: size of each input paragraph vector.
    """

    def __init__(self, config, embed_dim=384):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.gru = torch.nn.GRU(self.embed_dim, self.hidden_dim, batch_first=True)
        self.linear1 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear2 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])

    def forward(self, x, y=None):
        """Score every non-first, non-padding paragraph of every sequence.

        Args:
            x: tensor of shape (batch_size, seq_len, embed_dim); padding
                positions must be all-zero vectors.
            y: optional flat tensor with one label per scored position, in
                batch-major order.

        Returns:
            A flat tensor of probabilities (one per scored position), or
            ``(probabilities, loss)`` when ``y`` is given.

        Raises:
            ValueError: if ``y`` does not match the number of scored positions.
        """
        # Derive the mask from the RAW input. Dropout zeroes entries at random,
        # so masking its output could mistake a real paragraph for padding and
        # leave fewer predictions than labels.
        mask = self.x_mask(x)
        hidden = self.gru(self.dropout(x))[0]
        # Flatten (batch, seq) so the flat mask selects the scored positions.
        hidden = hidden.reshape(-1, self.hidden_dim)[mask]
        hidden = F.gelu(hidden)
        hidden = self.linear1(hidden)
        hidden = self.dropout(hidden)
        hidden = F.gelu(hidden)
        y_hat = torch.sigmoid(self.linear2(hidden)).view(-1)
        if y is None:
            return y_hat
        if y_hat.shape != y.shape:
            # Same exception type binary_cross_entropy would raise, but with a
            # message that points at the likely cause.
            raise ValueError(
                f'got {tuple(y.shape)} targets for {tuple(y_hat.shape)} predictions; '
                'check that padding is all-zero and labels match paragraph boundaries'
            )
        loss = F.binary_cross_entropy(y_hat, y.float())
        return y_hat, loss

    def x_mask(self, x):
        """Return a flat boolean mask of shape (batch_size * seq_len,).

        A position is True when it holds a real paragraph (any non-zero entry)
        and is not the first paragraph of its sequence, which has no boundary
        before it to classify.
        """
        # `any` rather than `sum != 0`: a non-zero vector whose entries cancel
        # out must not be treated as padding.
        mask = (x != 0).any(dim=2)
        mask[:, 0] = False
        return mask.reshape(-1)

    def predict(self, x):
        """Return hard 0/1 predictions for every scored position (threshold 0.5)."""
        y_hat = self.forward(x)
        return (y_hat > 0.5).int()

## train

We define our training and evaluation functions, for use by both models, and all three datasets.

In [None]:
def training_curve(metrics):
    """Plot the recorded loss and F1 curves side by side.

    Args:
        metrics: dict with the lists 'train_loss', 'valid_loss', 'train_f1'
            and 'valid_f1', one entry per evaluation.
    """
    plt.style.use('dark_background')
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    # One panel per metric: (title, train series key, validation series key).
    panels = (
        ('Loss', 'train_loss', 'valid_loss'),
        ('F1', 'train_f1', 'valid_f1'),
    )
    for ax, (title, train_key, valid_key) in zip(axes, panels):
        ax.plot(metrics[train_key], label='train')
        ax.plot(metrics[valid_key], label='val')
        ax.set_title(title)
        ax.legend()
    plt.show()

In [None]:
def evaluate(metrics, model, train_batches, valid_batches, steps=10):
    """Append one loss/F1 measurement per split to ``metrics``.

    Both splits are scored in evaluation mode (dropout disabled) and without
    gradient tracking. Previously the model was switched to eval mode only
    after the train split had been scored, so the two splits were measured
    under different conditions.

    Args:
        metrics: dict with the lists 'train_loss', 'train_f1', 'valid_loss'
            and 'valid_f1'; it is mutated in place.
        model: module exposing ``model(x, y) -> (y_hat, loss)`` and ``predict``.
        train_batches: batch generator for the training split.
        valid_batches: batch generator for the validation split.
        steps: number of batches drawn from each generator.

    Returns:
        The same ``metrics`` dict, for convenience.
    """
    model.eval()
    try:
        with torch.no_grad():
            for batch_name, batches in [('train', train_batches), ('valid', valid_batches)]:
                loss, f1 = evaluate_split(model, batches, steps=steps)
                metrics[batch_name + '_loss'].append(loss)
                metrics[batch_name + '_f1'].append(f1)
    finally:
        # As before, the model is always handed back in training mode.
        model.train()
    return metrics

def evaluate_split(model, batches, steps):
    """Average loss and F1 of ``model`` over ``steps`` batches.

    Args:
        model: module exposing ``model(x, y) -> (y_hat, loss)`` and ``predict(x)``.
        batches: generator yielding ``(x, y)`` batches.
        steps: number of batches to draw.

    Returns:
        ``(mean_loss, mean_f1)`` where both means are taken over the batches.
    """
    losses, f1_scores = [], []
    for _ in range(steps):
        inputs, labels = next(batches)
        # The forward pass is needed only for the loss; hard predictions come
        # from a separate call to predict().
        _, batch_loss = model(inputs, labels)
        losses.append(batch_loss.item())
        predicted = model.predict(inputs).cpu().numpy().astype(int)
        expected = labels.cpu().numpy().astype(int)
        f1_scores.append(f1_score(expected, predicted))
    return np.mean(losses), np.mean(f1_scores)

def train(model, optimizer, train_batches, valid_batches, n_steps):
    """Run ``n_steps`` optimisation steps, evaluating about 100 times along the way.

    Args:
        model: module whose ``model(x, y)`` returns ``(y_hat, loss)``.
        optimizer: torch optimizer over the model's parameters.
        train_batches: endless generator of training batches (also used to
            measure the training metrics).
        valid_batches: endless generator of validation batches.
        n_steps: number of optimisation steps.

    Returns:
        ``(metrics, final)``: ``metrics`` maps 'train_loss', 'train_f1',
        'valid_loss' and 'valid_f1' to their recorded histories; ``final`` maps
        the same keys to the result of a last, longer evaluation (100 batches).
        Because ``evaluate`` appends to the dict it is given, that final value
        is also the last entry of each history.
    """
    metrics = {'train_loss': [], 'train_f1': [], 'valid_loss': [], 'valid_f1': []}
    # n_steps // 100 is 0 for runs shorter than 100 steps, which made the modulo
    # below raise ZeroDivisionError; evaluate at least every step instead.
    eval_every = max(1, n_steps // 100)
    for step in range(n_steps):
        x, y = next(train_batches)
        _, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % eval_every == 0:
            metrics = evaluate(metrics, model, train_batches, valid_batches)
    final = evaluate(metrics, model, train_batches, valid_batches, steps=100)
    return metrics, {k: v[-1] for k, v in final.items()}

### experiment

To tune hyperparameters and test the performance of our models on our three datasets, we define the experiment functions.

In [None]:
def hyper_params():
    """Draw one random hyperparameter configuration.

    Returns:
        dict with
        - 'lr': learning rate, log-uniform between 1e-5 and 1e-2,
        - 'dropout': dropout probability, uniform in [0, 0.5],
        - 'hidden_dim': integer layer width in [64, 256],
        - 'batch_size': one of 16, 32, 64, 128,
        - 'n_steps': number of training steps, one of 1000 ... 7000.

    Note:
        The previous version indexed both candidate lists with
        ``random.randint(0, 2)``, so only their first three entries could ever
        be drawn (batch size 128 and 4000-7000 steps were unreachable).
        ``random.choice`` samples from every listed value. Expect longer runs
        on average now that the larger step counts are reachable.
    """
    return {
        'lr': 10 ** random.uniform(-5, -2),
        'dropout': random.uniform(0, 0.5),
        'hidden_dim': random.randint(64, 256),
        'batch_size': random.choice([16, 32, 64, 128]),
        'n_steps': random.choice([1000, 2000, 3000, 4000, 5000, 6000, 7000]),
    }

In [None]:
def hyper_param_search(model_fn, batch_fn, dataset, n_trials=10):
    """Random-search hyperparameters for one model on one dataset.

    Args:
        model_fn: callable building a model from a config dict.
        batch_fn: callable ``(split, device, batch_size)`` returning a batch
            generator.
        dataset: dict with the splits 'train' and 'valid'.
        n_trials: number of random configurations to train.

    Returns:
        A DataFrame with one row per trial (the sampled config plus the final
        metrics), sorted by 'valid_f1' with the best trial first.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def run_trial():
        """Train one freshly sampled configuration and return its result row."""
        config = hyper_params()
        model = model_fn(config).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
        train_batches = batch_fn(dataset['train'], device, config['batch_size'])
        valid_batches = batch_fn(dataset['valid'], device, config['batch_size'])
        _, final = train(model, optimizer, train_batches, valid_batches, config['n_steps'])
        return {**config, **final}

    rows = [run_trial() for _ in tqdm(range(n_trials))]
    return pd.DataFrame(rows).sort_values('valid_f1', ascending=False)

In [None]:
def experiment(n_trials=10, results_dir='results'):
    """Run the hyperparameter search for both models on dataset_2 and dataset_3.

    Args:
        n_trials: number of random configurations per (model, dataset) pair.
        results_dir: directory receiving one CSV per (model, dataset) pair;
            created if it does not exist.

    Returns:
        List of result DataFrames in run order: for each dataset, first the
        recurrent model, then the pairwise model.
    """
    # Create the output directory up front so a long search cannot be lost to a
    # missing folder when the first CSV is written.
    os.makedirs(results_dir, exist_ok=True)
    # NOTE(review): dataset_1 is loaded but not part of this sweep — confirm
    # that is intentional.
    datasets = [(dataset_2, 'dataset_2'), (dataset_3, 'dataset_3')]
    runs = [(RecurrentSiameseNet, get_sequence_batches), (SiameseNet, get_pair_batches)]
    dfs = []
    for dataset, ds_name in datasets:
        for model, batch_fn in runs:
            print(f'running {model.__name__} on {ds_name}')
            df = hyper_param_search(model, batch_fn, dataset, n_trials)
            df.to_csv(os.path.join(results_dir, f'{model.__name__}_{ds_name}.csv'))
            dfs.append(df)
    return dfs

In [None]:
# Full sweep: 100 random-search trials for every (model, dataset) combination.
# This is long-running (the progress bar below shows roughly 14 s per trial);
# experiment() also writes each result table to a CSV file.
dfs = experiment(100)

running RecurrentSiameseNet on dataset_2


 29%|██▉       | 29/100 [07:19<16:52, 14.26s/it]

### analysis