# Multi Author Writing Style Analysis

The following notebook presents three different approaches to the problem of distingushing when in a sequence of paragraphs,
the author changes. The first approach disregrads the order of the paragraphs, opting instead to view samples as pairs of paragraphs.
It processes the paragraphs with a siamese network, which is a neural network that takes two inputs and outputs a single value.
The second approach adds a recurrent layer to the siamese network, allowing it to take into account a sequence of paragraphs.
The third approach builds on the second by augmenting the input with a manually engineered feature vector.

In [19]:
import numpy as np
import random
import pickle
import torch
import warnings
from functools import partial
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
import itertools
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import nltk
import yaml
from src.utils import get_data

In [20]:
warnings.filterwarnings('ignore')


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data

We load the data (uncomment to reconstruct the data from the raw files),
and make two data batch loaders:

1. pairs of paragraphs, which will be used for our baseline siamese network.
2. Sequences of paragraphs, which will be used for our recurrent siamese network.


In [4]:
# nltk.download('averaged_perceptron_tagger')
# data = { str(i): get_data(i) for i in range(1, 4) }
# pickle.dump(data, open('data/data.pkl', 'wb'))
data = pickle.load(open('data/data.pkl', 'rb'))
dataset_1, dataset_2, dataset_3 = data['1'], data['2'], data['3']

In [5]:
def paired_samples(data_split, syntactic_flag):
    """turns data set into pair of consectuve sentences (flattens multi paragraph samples into pairs)"""
    pairs = []
    for problem_id in data_split.keys():
        semantic = data_split[problem_id]['semantic']
        syntactic = data_split[problem_id]['syntactic']
        # concatenate all symantic and syntactic features into one vecntor per sample
        if syntactic_flag:
            texts = [np.concatenate([semantic[i], syntactic[i]]) for i in range(len(semantic))]
        else:
            texts = data_split[problem_id]['semantic']
        targets = data_split[problem_id]['truth']['changes']
        if len(texts) - 1 != len(targets):
            # TODO: fix. a few of the samples have more than one paragraph, making .readlines() wrong
            # print(f'problem {problem_id} has {len(texts)} texts and {len(targets)} targets')
            continue
        for target, text1, text2 in zip(targets, texts[:-1], texts[1:]):
            pairs.append((text1, text2, target))
    random.shuffle(pairs)
    return pairs

In [6]:
def get_pair_batches(data_split, syntactic_flag=True, batch_size=32):
    pairs = paired_samples(data_split, syntactic_flag)
    while True:
        # perm = np.random.permutation(len(pairs))
        x1 = torch.tensor(np.array([p[0] for p in pairs])).float().to(device)
        x2 = torch.tensor(np.array([p[1] for p in pairs])).float().to(device)
        y = torch.tensor(np.array([p[2] for p in pairs])).float().to(device)
        perm = torch.randperm(len(pairs))
        for i in range(0, len(pairs), batch_size):
            batch = perm[i:i+batch_size]
            yield (x1[batch], x2[batch]), y[batch]

In [7]:
def get_sequence_batches(data_split, syntactic_flag=True, batch_size=32):
    """turns data set into sequence of sentences (flattens multi paragraph samples into sequence)"""
    x, y = [], []
    for problem_id in data_split.keys():
        semantic = data_split[problem_id]['semantic']
        syntactic = data_split[problem_id]['syntactic']
        # concatenate all symantic and syntactic features into one vecntor per sample
        if syntactic_flag:
            texts = [np.concatenate([semantic[i], syntactic[i]]) for i in range(len(semantic))]
        else:
            texts = data_split[problem_id]['semantic']
        targets = data_split[problem_id]['truth']['changes']
        if len(texts) - 1 != len(targets):
            continue
        x.append(torch.tensor(texts))
        y.append(torch.tensor(targets))
    while True:
        perm = torch.randperm(len(x))
        for i in range(0, len(x), batch_size):
            batch = perm[i:i+batch_size]
            x_batch = [x[i] for i in batch]
            y_batch = [y[i] for i in batch]
            y_batch = torch.cat(y_batch, dim=0).to(device)
            # pad with zero vectors
            x_batch = pad_sequence(x_batch, batch_first=True, padding_value=0).to(device)
            x_batch = x_batch.float()
            yield x_batch, y_batch

## Models

We make our two models, the siamese network and the recurrent siamese network.

In [8]:
class SiameseNet(torch.nn.Module):
    def __init__(self, config, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.linear1 = torch.nn.Linear(self.embed_dim, self.hidden_dim)
        self.linear2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear3 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x, y=None):
        x1, x2 = x
        x1_hat = self.dropout(x1)
        x1_hat = self.linear1(x1_hat)
        x1_hat = F.gelu(x1_hat)
        x2_hat = self.dropout(x2)
        x2_hat = self.linear1(x2_hat)
        x2_hat = F.gelu(x2_hat)
        y_hat = torch.abs(x1_hat - x2_hat)
        y_hat = self.linear2(y_hat)
        y_hat = F.gelu(y_hat)
        y_hat = self.dropout(y_hat)
        y_hat = self.linear3(y_hat)
        y_hat = self.sigmoid(y_hat)
        if y is not None:
            loss = torch.nn.functional.binary_cross_entropy(y_hat, y.float().unsqueeze(1))
            return y_hat, loss
        return y_hat

    def predict(self, x):
        y_hat = self.forward(x)
        y_hat = y_hat.squeeze(1)
        y_hat = (y_hat > 0.5).int()
        return y_hat

In [9]:
class RecurrentSiameseNet(torch.nn.Module):
    def __init__(self, config, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.linear1 = torch.nn.Linear(self.embed_dim, self.hidden_dim)
        self.gru = torch.nn.GRU(self.hidden_dim, self.hidden_dim, batch_first=True)
        self.linear2 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])
    
    def forward(self, x, y=None):
        # x is tensor of shape (batch_size, seq_len, embed_dim)
        mask = self.x_mask(x)
        y_hat = self.dropout(x)
        y_hat = y_hat.reshape(-1, self.embed_dim)
        y_hat = self.linear1(y_hat) 
        y_hat = F.gelu(y_hat)
        y_hat = y_hat.reshape(-1, x.shape[1], self.hidden_dim)
        y_hat = self.gru(y_hat)[0]
        y_hat = y_hat.reshape(-1, self.hidden_dim)  # flatten for masking and y_hat
        y_hat = y_hat[mask]
        y_hat = F.gelu(y_hat)
        y_hat = self.linear2(y_hat)
        y_hat = F.sigmoid(y_hat)
        y_hat = y_hat.view(-1)
        if y is not None:
            try: 
                loss = torch.nn.functional.binary_cross_entropy(y_hat, y.float())
            except ValueError:
                return y_hat, None 
            return y_hat, loss
        return y_hat

    def x_mask(self, x):
        """returns mask of shape (batch_size, seq_len)"""
        mask = torch.sum(x, dim=2) != 0
        mask[:, 0] = False
        mask = mask.view(-1)
        return mask

    def predict(self, x):
        y_hat = self.forward(x)
        y_hat = (y_hat > 0.5).int()
        return y_hat

## Training

We define our training and evaluation functions, for use by both models, and all three datasets.

In [10]:
def training_curve(metrics):
    plt.style.use('dark_background')
    fig, axes = plt.subplots(1, 3, figsize=(16, 4))
    axes[0].plot(metrics['train_loss'], label='train')
    axes[0].plot(metrics['valid_loss'], label='val')
    axes[0].set_title('Loss')
    axes[0].legend()
    axes[1].plot(metrics['train_f1'], label='train')
    axes[1].plot(metrics['valid_f1'], label='val')
    axes[1].set_title('F1')
    axes[1].legend()
    plt.show()

In [11]:
def evaluate(metrics, model, train_batches, valid_batches, steps=10):
    for batch_name, batches in [('train', train_batches), ('valid', valid_batches)]:
        loss, f1, acc = evaluate_split(model, batches, steps=steps)
        metrics[batch_name + '_loss'].append(loss)
        metrics[batch_name + '_f1'].append(f1)
        metrics[batch_name + '_acc'].append(acc)
        model.eval()
    model.train()
    return metrics

def evaluate_split(model, batches, steps):
    f1_scores, losses, acc_scores = [], [], []
    for i in range(steps):
        x, y = next(batches)
        y_hat, loss = model(x, y)
        if loss is None:
            continue
        losses.append(loss.item())
        y_hat = model.predict(x).cpu().numpy().astype(int)
        y = y.cpu().numpy().astype(int)
        f1_scores.append(f1_score(y, y_hat))
        acc_scores.append(accuracy_score(y, y_hat))
    return np.mean(losses), np.mean(f1_scores), np.mean(acc_scores)

def train(model, optimizer, train_batches, valid_batches=None, batch_size=32, n_steps=1000):
    # returns metrics and final scores, if doing validation, else returns final model for testing
    metrics = {'train_loss': [], 'train_f1': [], 'train_acc': [], 'valid_loss': [], 'valid_f1': [], 'valid_acc': []}
    for i in range(n_steps):
        x, y = next(train_batches)
        y_hat, loss = model(x, y)
        if loss is None:  # there is an extremly rare bug where y_hat is one short of y FIXME.
            continue
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if valid_batches and i % (n_steps // 100) == 0:
            metrics = evaluate(metrics, model, train_batches, valid_batches)
    if valid_batches:
        final = evaluate(metrics, model, train_batches, valid_batches, steps=4200 // batch_size)
        return metrics, {k: v[-1] for k, v in final.items()}
    return model

## Experiment

To do hyper paramter tuning, and test performance of our models on our three datasets, we define the experiment functions.

In [12]:
def hyper_params():
    """return random hyperparameters"""
    return {
        'lr': 10 ** random.choice([-3, -4, -5]),
        'dropout': random.choice([0.1, 0.2, 0.3]),
        'hidden_dim': random.choice([32, 64, 128, 256]),
        'batch_size': random.choice([16, 32, 64]),
        'n_steps': random.choice([2000, 4000, 6000, 8000]),
    }

In [13]:
def hyper_param_search(model_fn, batch_fn, dataset, syntac_bool, n_trials=10):
    """search hyperparameters for a given model and dataset"""
    embed_dim = 384 + 61 if syntac_bool else 384
    hyper_param_metrics = []
    training_metrics_list = []
    for i in tqdm(range(n_trials)):
        config = hyper_params()
        model = model_fn(config, embed_dim=embed_dim).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
        train_batches = batch_fn(dataset['train'], batch_size=config['batch_size'])
        valid_batches = batch_fn(dataset['valid'], batch_size=config['batch_size'])
        training_metrics, final= train(model, optimizer, train_batches, valid_batches, config['batch_size'], config['n_steps'])
        training_metrics_list.append(training_metrics)
        hyper_param_metrics.append({**config, **final})
    df = pd.DataFrame(hyper_param_metrics).sort_values('valid_f1', ascending=False)
    return df, training_metrics_list

In [14]:
def get_combinations():
    datasets = [(dataset_1, 'dataset_1'), (dataset_2, 'dataset_2'), (dataset_3, 'dataset_3')]
    models = [(SiameseNet, get_pair_batches), (RecurrentSiameseNet, get_sequence_batches)]
    syntac_bools = [True, False]
    return list(itertools.product(datasets, models, syntac_bools))

In [15]:
def experiment(n_trials=10):
    for (ds, ds_name), (model, batch_fn), syntax_bool in get_combinations():
        print(f'{model.__name__} on {ds_name} with {"syntax" if syntax_bool else "no syntax"}')
        batch_fn = partial(batch_fn, syntactic_flag=syntax_bool)
        df, training_metrics_list = hyper_param_search(model, batch_fn, ds, syntax_bool, n_trials=n_trials)
        df.round(4).to_csv(f'results/{model.__name__}_{ds_name}_{"syntax" if syntax_bool else "no_syntax"}_hyperparams.csv', index=False)
        pickle.dump(training_metrics_list, open(f'results/training_{model.__name__}_{ds_name}_{"syntax" if syntax_bool else "no_syntax"}.pkl', 'wb'))
        print(f'best f1 score: {df.iloc[0]["valid_f1"]}')
        print(f'best accuracy score: {df.iloc[0]["valid_acc"]}')
        print()        

In [21]:
# experiment(40)

In [91]:
def load_dfs():
    dfs = {}
    for ds_name in ['dataset_1', 'dataset_2', 'dataset_3']:
        for model in [RecurrentSiameseNet, SiameseNet]:
            file_name = f'results/{model.__name__}_{ds_name}.csv'
            df = pd.read_csv(file_name)
            dfs[(model.__name__, ds_name)] = df
    return dfs

## Testing

In [22]:
def merge_splits(datasets):
    """merge train and valid splits"""
    merged = {}
    for dataset in datasets:
        for problem_id in dataset.keys():
            merged[problem_id] = dataset[problem_id]
    return merged

def test_seq():
    results = []
    with open('config.yaml', 'r') as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    for (ds, ds_name), (model, batch_fn), syntax_bool in tqdm(get_combinations()):
        batch_fn = partial(batch_fn, syntactic_flag=syntax_bool)
        config = configs[ds_name]['syntactic' if syntax_bool else 'semantic']['siamese' if model== SiameseNet else 'recurrent']
        train_data = merge_splits([ds['train'], ds['valid']])
        test_data = ds['test']
        embed_dim = 384 + 61 if syntax_bool else 384
        model = model({'hidden_dim': config['hidden_dim'], 'dropout': config['dropout']}, embed_dim=embed_dim).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
        train_batches = batch_fn(train_data, batch_size=config['batch_size'])
        test_batches = batch_fn(test_data, batch_size=config['batch_size'])
        train(model, optimizer, train_batches, batch_size=config['batch_size'], n_steps=config['n_steps'])
        test_loss, test_f1, test_acc = evaluate_split(model, test_batches, steps=len(test_data) // config['batch_size'])
        train_loss, train_f1, train_acc = evaluate_split(model, train_batches, steps=len(train_data) // config['batch_size'])
        result = {'model': model.__class__.__name__, 'dataset': ds_name, 'syntax': syntax_bool,
                  'train_loss': train_loss, 'train_f1': train_f1, 'train_acc': train_acc,
                  'test_loss': test_loss, 'test_f1': test_f1, 'test_acc': test_acc}
        results.append(result)
    df = pd.DataFrame(results).round(4)
    df.to_csv('results/test_results.csv', index=False)
    return df

test_seq()

100%|██████████| 12/12 [02:31<00:00, 12.61s/it]


Unnamed: 0,model,dataset,syntax,loss,f1,acc
0,SiameseNet,dataset_1,True,0.2052,0.9456,0.904
1,SiameseNet,dataset_1,False,0.2504,0.9387,0.8906
2,RecurrentSiameseNet,dataset_1,True,0.2266,0.9484,0.9097
3,RecurrentSiameseNet,dataset_1,False,0.3786,0.9236,0.8603
4,SiameseNet,dataset_2,True,0.5845,0.62,0.6607
5,SiameseNet,dataset_2,False,0.6451,0.5579,0.6272
6,RecurrentSiameseNet,dataset_2,True,0.5447,0.6736,0.6968
7,RecurrentSiameseNet,dataset_2,False,0.5821,0.6059,0.6686
8,SiameseNet,dataset_3,True,0.6856,0.4886,0.5647
9,SiameseNet,dataset_3,False,0.777,0.5839,0.5692


## Analysis