# Multi Author Writing Style Analysis

The following notebook presents three different approaches to the problem of distingushing when in a sequence of paragraphs,
the author changes. The first approach disregrads the order of the paragraphs, opting instead to view samples as pairs of paragraphs.
It processes the paragraphs with a siamese network, which is a neural network that takes two inputs and outputs a single value.
The second approach adds a recurrent layer to the siamese network, allowing it to take into account a sequence of paragraphs.
The third approach builds on the second by augmenting the input with a manually engineered feature vector.

In [1]:
import numpy as np
import random
import pickle
import torch
import warnings
from functools import partial
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
import itertools
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import nltk
import yaml
from src.utils import get_data

In [2]:
warnings.filterwarnings('ignore')


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data

We load the data (uncomment to reconstruct the data from the raw files),
and make two data batch loaders:

1. pairs of paragraphs, which will be used for our baseline siamese network.
2. Sequences of paragraphs, which will be used for our recurrent siamese network.


In [4]:
# nltk.download('averaged_perceptron_tagger')
# data = { str(i): get_data(i) for i in range(1, 4) }
# pickle.dump(data, open('data/data.pkl', 'wb'))
data = pickle.load(open('data/data.pkl', 'rb'))
dataset_1, dataset_2, dataset_3 = data['1'], data['2'], data['3']

In [5]:
def paired_samples(data_split, modality):
    """turns data set into pair of consectuve sentences (flattens multi paragraph samples into pairs)"""
    pairs = []
    for problem_id in data_split.keys():
        semantic = data_split[problem_id]['semantic']
        syntactic = data_split[problem_id]['syntactic']
        # concatenate all symantic and syntactic features into one vecntor per sample
        if modality == 'both':
            texts = [np.concatenate([semantic[i], syntactic[i]]) for i in range(len(semantic))]
        elif modality == 'semantic':
            texts = data_split[problem_id]['semantic']
        elif modality == 'syntactic':
            texts = data_split[problem_id]['syntactic']
        else:
            raise ValueError(f'invalid modality {modality}')
        targets = data_split[problem_id]['truth']['changes']
        if len(texts) - 1 != len(targets):
            # TODO: fix. a few of the samples have more than one paragraph, making .readlines() wrong
            # print(f'problem {problem_id} has {len(texts)} texts and {len(targets)} targets')
            continue
        for target, text1, text2 in zip(targets, texts[:-1], texts[1:]):
            pairs.append((text1, text2, target))
    random.shuffle(pairs)
    return pairs

In [6]:
def get_pair_batches(data_split, modality, batch_size=32):
    pairs = paired_samples(data_split, modality)
    while True:
        # perm = np.random.permutation(len(pairs))
        x1 = torch.tensor(np.array([p[0] for p in pairs])).float().to(device)
        x2 = torch.tensor(np.array([p[1] for p in pairs])).float().to(device)
        y = torch.tensor(np.array([p[2] for p in pairs])).float().to(device)
        perm = torch.randperm(len(pairs))
        for i in range(0, len(pairs), batch_size):
            batch = perm[i:i+batch_size]
            yield (x1[batch], x2[batch]), y[batch]

In [7]:
def get_sequence_batches(data_split, modality, batch_size=32):
    """turns data set into sequence of sentences (flattens multi paragraph samples into sequence)"""
    x, y = [], []
    for problem_id in data_split.keys():
        semantic = data_split[problem_id]['semantic']
        syntactic = data_split[problem_id]['syntactic']
        # concatenate all symantic and syntactic features into one vecntor per sample
        if modality == 'both':
            texts = [np.concatenate([semantic[i], syntactic[i]]) for i in range(len(semantic))]
        elif modality == 'syntactic':
            texts = data_split[problem_id]['syntactic']
        elif modality == 'semantic':
            texts = data_split[problem_id]['semantic']
        else:
            raise ValueError(f'invalid modality {modality}')
        targets = data_split[problem_id]['truth']['changes']
        if len(texts) - 1 != len(targets):
            continue
        x.append(torch.tensor(texts))
        y.append(torch.tensor(targets))
    while True:
        perm = torch.randperm(len(x))
        for i in range(0, len(x), batch_size):
            batch = perm[i:i+batch_size]
            x_batch = [x[i] for i in batch]
            y_batch = [y[i] for i in batch]
            y_batch = torch.cat(y_batch, dim=0).to(device)
            # pad with zero vectors
            x_batch = pad_sequence(x_batch, batch_first=True, padding_value=0).to(device)
            x_batch = x_batch.float()
            yield x_batch, y_batch

## Models

We make our two models, the siamese network and the recurrent siamese network.

In [8]:
def soft_f1_loss(y_pred, y_true):
    """computes soft f1 loss"""
    tp = (y_true * y_pred).sum(dim=1)
    tn = ((1 - y_true) * (1 - y_pred)).sum(dim=1)
    fp = ((1 - y_true) * y_pred).sum(dim=1)
    fn = (y_true * (1 - y_pred)).sum(dim=1)
    epsilon = 1e-7
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * precision * recall / (precision + recall + epsilon)
    return 1 - f1.mean()

In [9]:
class SiameseNet(torch.nn.Module):
    def __init__(self, config, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.linear1 = torch.nn.Linear(self.embed_dim, self.hidden_dim)
        self.linear2 = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear3 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x, y=None):
        x1, x2 = x
        x1_hat = self.dropout(x1)
        x1_hat = self.linear1(x1_hat)
        x1_hat = F.gelu(x1_hat)
        x2_hat = self.dropout(x2)
        x2_hat = self.linear1(x2_hat)
        x2_hat = F.gelu(x2_hat)
        y_hat = torch.abs(x1_hat - x2_hat)
        y_hat = self.linear2(y_hat)
        y_hat = F.gelu(y_hat)
        y_hat = self.dropout(y_hat)
        y_hat = self.linear3(y_hat)
        y_hat = self.sigmoid(y_hat)
        if y is not None:
            # loss = torch.nn.functional.binary_cross_entropy(y_hat, y.float().unsqueeze(1))
            # switch to soft f1 loss
            # bce loss
            loss = torch.nn.functional.binary_cross_entropy(y_hat, y.float().unsqueeze(1))
            return y_hat, loss
        return y_hat

    def predict(self, x):
        y_hat = self.forward(x)
        y_hat = y_hat.squeeze(1)
        y_hat = (y_hat > 0.5).int()
        return y_hat

In [27]:
class RecurrentNet(torch.nn.Module):
    def __init__(self, config, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = config['hidden_dim']
        self.linear1 = torch.nn.Linear(self.embed_dim, self.hidden_dim)
        self.gru = torch.nn.GRU(self.hidden_dim, self.hidden_dim, batch_first=True)
        self.linear2 = torch.nn.Linear(self.hidden_dim, 1)
        self.dropout = torch.nn.Dropout(config['dropout'])
    
    def forward(self, x, y=None):
        # x is tensor of shape (batch_size, seq_len, embed_dim)
        mask = self.x_mask(x)
        y_hat = self.dropout(x)
        y_hat = y_hat.reshape(-1, self.embed_dim)
        y_hat = self.linear1(y_hat) 
        y_hat = F.gelu(y_hat)
        y_hat = y_hat.reshape(-1, x.shape[1], self.hidden_dim)
        # y_hat = y_hat[:, 1:, :] - y_hat[:, :-1, :]  # take difference between consecutive sentences (Siamese net)
        y_hat = self.gru(y_hat)[0]
        y_hat = y_hat.reshape(-1, self.hidden_dim)  # flatten for masking and y_hat
        y_hat = y_hat[mask]
        y_hat = F.gelu(y_hat)
        y_hat = self.linear2(y_hat)
        y_hat = F.sigmoid(y_hat)
        y_hat = y_hat.squeeze(1)
        if y is not None:
            try: 
                # soft f1 loss
                # loss = soft_f1_loss(y_hat, y.float())
                # bce loss
                loss = torch.nn.functional.binary_cross_entropy(y_hat, y.float())
            except ValueError:
                return y_hat, None 
            return y_hat, loss
        return y_hat

    def x_mask(self, x):
        """returns mask of shape (batch_size, seq_len)"""
        mask = torch.sum(x, dim=2) != 0
        mask[:, 0] = False
        mask = mask.view(-1)
        return mask

    def predict(self, x):
        y_hat = self.forward(x)
        y_hat = (y_hat > 0.5).int()
        return y_hat

## Training

We define our training and evaluation functions, for use by both models, and all three datasets.

In [28]:
def training_curve(metrics):
    plt.style.use('dark_background')
    fig, axes = plt.subplots(1, 3, figsize=(16, 4))
    axes[0].plot(metrics['train_loss'], label='train')
    axes[0].plot(metrics['valid_loss'], label='val')
    axes[0].set_title('Loss')
    axes[0].legend()
    axes[1].plot(metrics['train_f1'], label='train')
    axes[1].plot(metrics['valid_f1'], label='val')
    axes[1].set_title('F1')
    axes[1].legend()
    plt.show()

In [29]:
def evaluate(metrics, model, train_batches, valid_batches, steps=10):
    for batch_name, batches in [('train', train_batches), ('valid', valid_batches)]:
        loss, f1, acc = evaluate_split(model, batches, steps=steps)
        metrics[batch_name + '_loss'].append(loss)
        metrics[batch_name + '_f1'].append(f1)
        metrics[batch_name + '_acc'].append(acc)
        model.eval()
    model.train()
    return metrics

def evaluate_split(model, batches, steps):
    f1_scores, losses, acc_scores = [], [], []
    for i in range(steps):
        x, y = next(batches)
        y_hat, loss = model(x, y)
        if loss is None:
            continue
        losses.append(loss.item())
        y_hat = model.predict(x).cpu().numpy().astype(int)
        y = y.cpu().numpy().astype(int)
        f1_scores.append(f1_score(y, y_hat))
        acc_scores.append(accuracy_score(y, y_hat))
    return np.mean(losses), np.mean(f1_scores), np.mean(acc_scores)

def train(model, optimizer, train_batches, valid_batches=None, batch_size=32, n_steps=1000):
    # returns metrics and final scores, if doing validation, else returns final model for testing
    metrics = {'train_loss': [], 'train_f1': [], 'train_acc': [], 'valid_loss': [], 'valid_f1': [], 'valid_acc': []}
    for i in range(n_steps):
        x, y = next(train_batches)
        y_hat, loss = model(x, y)
        if loss is None:  # there is an extremly rare bug where y_hat is one short of y FIXME.
            continue
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if valid_batches and i % (n_steps // 100) == 0:
            metrics = evaluate(metrics, model, train_batches, valid_batches)
    if valid_batches:
        final = evaluate(metrics, model, train_batches, valid_batches, steps=4200 // batch_size)
        return metrics, {k: v[-1] for k, v in final.items()}
    return model

## Experiment

To do hyper paramter tuning, and test performance of our models on our three datasets, we define the experiment functions.

In [30]:
def hyper_params():
    """return random hyperparameters"""
    return {
        'lr': 10 ** random.choice([-3, -4, -5]),
        'dropout': random.choice([0.1, 0.2, 0.3]),
        'hidden_dim': random.choice([32, 64, 128, 256]),
        'batch_size': random.choice([16, 32, 64]),
        'n_steps': random.choice([2000, 4000, 6000, 8000]),
    }

In [31]:
def hyper_param_search(model_fn, batch_fn, dataset, modality, n_trials=10):
    """search hyperparameters for a given model and dataset"""
    if modality == 'both':
        embed_dim = 384 + 61
    elif modality == 'semantic':
        embed_dim = 384
    elif modality == 'syntactic':
        embed_dim = 61
    else:
        raise ValueError(f'invalid modality {modality}')
    hyper_param_metrics = []
    training_metrics_list = []
    for i in tqdm(range(n_trials)):
        config = hyper_params()
        model = model_fn(config, embed_dim=embed_dim).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
        train_batches = batch_fn(dataset['train'], batch_size=config['batch_size'])
        valid_batches = batch_fn(dataset['valid'], batch_size=config['batch_size'])
        training_metrics, final= train(model, optimizer, train_batches, valid_batches, config['batch_size'], config['n_steps'])
        training_metrics_list.append(training_metrics)
        hyper_param_metrics.append({**config, **final})
    df = pd.DataFrame(hyper_param_metrics).sort_values('valid_f1', ascending=False)
    return df, training_metrics_list

In [32]:
def get_combinations():
    datasets = [(dataset_2, 'dataset_2'), (dataset_3, 'dataset_3'), (dataset_1, 'dataset_1')]
    models = [(RecurrentNet, get_sequence_batches), (SiameseNet, get_pair_batches)]
    syntac_bools = ['both', 'semantic', 'syntactic']
    return list(itertools.product(datasets, models, syntac_bools))

In [35]:
def experiment(n_trials=10):
    for (ds, ds_name), (model, batch_fn), modality in get_combinations():
        print(f'{model.__name__} on {ds_name} with {modality}')
        batch_fn = partial(batch_fn, modality=modality)
        df, training_metrics_list = hyper_param_search(model, batch_fn, ds, modality, n_trials=n_trials)
        df.round(4).to_csv(f'results/true_siam_{model.__name__}_{ds_name}_{modality}.csv')
        pickle.dump(training_metrics_list, open(f'results/true_siam_training_{model.__name__}_{ds_name}_{modality}.pkl', 'wb'))
        print(f'best f1 score: {df.iloc[0]["valid_f1"]}')
        print(f'best accuracy score: {df.iloc[0]["valid_acc"]}')
        print()        

In [37]:
experiment(1)

RecurrentNet on dataset_2 with both


100%|██████████| 1/1 [00:59<00:00, 59.82s/it]


best f1 score: 0.5463287977288087
best accuracy score: 0.6253650212219702

RecurrentNet on dataset_2 with semantic


  0%|          | 0/1 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
def top_rows_for_each_dataset():
    out = {'dataset_1': [], 'dataset_2': [], 'dataset_3': []}
    for (ds, ds_name), (model, batch_fn), modality in get_combinations():
        df = pd.read_csv(f'results/true_siam_{model.__name__}_{ds_name}_{modality}.csv')
        row = df.iloc[0]
        meta = {'model': model.__name__, 'modality': modality}
        out[ds_name].append({**meta, **row})
    for ds_name in out.keys():
        df = pd.DataFrame(out[ds_name])
        df = df.sort_values('valid_f1', ascending=False)
        df.round(4).to_csv(f'results/true_siam_top_{ds_name}_hyperparams.csv', index=False)
    out = {k: pd.DataFrame(v) for k, v in out.items()}
    return out
hyperparam_search_result = top_rows_for_each_dataset()

        


In [105]:
def generate_config_file(best_row_for_each_mode_and_dataset):
    """generate a config file with the best hyper paramters for each model and dataset (have it all in one file)"""
    config = {}
    for ds_name in best_row_for_each_mode_and_dataset.keys():
        df = best_row_for_each_mode_and_dataset[ds_name]
        for i, row in df.iterrows():
            hyper_params = {'lr': row['lr'],
                            'dropout': row['dropout'],
                            'hidden_dim': int(row['hidden_dim']),
                            'batch_size': int(row['batch_size']),
                            'n_steps': int(row['n_steps'])}
            model = row['model']
            modality = row['modality']
            config[f'{ds_name}_{model}_{modality}'] = hyper_params
    with open('config.yaml', 'w') as f:
        yaml.dump(config, f)

generate_config_file(hyperparam_search_result)

## Testing

In [106]:
def merge_splits(datasets):
    """merge train and valid splits"""
    merged = {}
    for idx, dataset in enumerate(datasets):
        for problem_id in dataset.keys():
            merged[str(idx) + '_' + problem_id] = dataset[problem_id]
    return merged

def test_seq():
    results = []
    torch.manual_seed(42)
    with open('config.yaml', 'r') as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    for (ds, ds_name), (model, batch_fn), modality in tqdm(get_combinations()):
        batch_fn = partial(batch_fn, modality=modality)
        config = configs[f'{ds_name}_{model.__name__}_{modality}']
        train_data = merge_splits([ds['train'], ds['valid']])
        test_data = ds['test']

        if modality == 'both':
            embed_dim = 384 + 61
        elif modality == 'semantic':
            embed_dim = 384
        elif modality == 'syntactic':
            embed_dim = 61
        else:
            raise ValueError(f'invalid modality {modality}')

        model = model({'hidden_dim': config['hidden_dim'], 'dropout': config['dropout']}, embed_dim=embed_dim).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
        train_batches = batch_fn(train_data, batch_size=config['batch_size'])
        test_batches = batch_fn(test_data, batch_size=config['batch_size'])
        train(model, optimizer, train_batches, batch_size=config['batch_size'], n_steps=config['n_steps'])
        test_loss, test_f1, test_acc = evaluate_split(model, test_batches, steps=len(test_data) // config['batch_size'])
        train_loss, train_f1, train_acc = evaluate_split(model, train_batches, steps=len(train_data) // config['batch_size'])
        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        result = {'model': model.__class__.__name__, 'dataset': ds_name, 'modality': modality,
                  'train_loss': train_loss, 'train_f1': train_f1, 'train_acc': train_acc,
                  'test_loss': test_loss, 'test_f1': test_f1, 'test_acc': test_acc,
                'n_params': n_params}
        results.append(result)
    df = pd.DataFrame(results).round(4)
    df.to_csv('results/true_siam_test_results.csv', index=False)
    return df
test_seq()

100%|██████████| 18/18 [05:23<00:00, 17.97s/it]


Unnamed: 0,model,dataset,modality,train_loss,train_f1,train_acc,test_loss,test_f1,test_acc,n_params
0,RecurrentSiameseNet,dataset_2,both,0.4996,0.711,0.7344,0.5539,0.6496,0.6983,20641
1,RecurrentSiameseNet,dataset_2,semantic,0.5842,0.6588,0.6714,0.6066,0.615,0.6501,18689
2,RecurrentSiameseNet,dataset_2,syntactic,0.5615,0.6761,0.6911,0.5757,0.6486,0.6796,107137
3,SiameseNet,dataset_2,both,0.5632,0.6789,0.6627,0.5719,0.6677,0.6652,73729
4,SiameseNet,dataset_2,semantic,0.4926,0.688,0.7502,0.6327,0.5726,0.6585,28865
5,SiameseNet,dataset_2,syntactic,0.5765,0.6589,0.6668,0.5829,0.606,0.6406,24577
6,RecurrentSiameseNet,dataset_3,both,0.6453,0.5927,0.6248,0.6632,0.5774,0.6154,156289
7,RecurrentSiameseNet,dataset_3,semantic,0.6547,0.5506,0.6067,0.6731,0.5254,0.5801,18689
8,RecurrentSiameseNet,dataset_3,syntactic,0.6689,0.5301,0.5927,0.6752,0.5051,0.5772,410881
9,SiameseNet,dataset_3,both,0.6642,0.4908,0.5992,0.6668,0.4534,0.5826,32769


## Analysis