# Multi Author Writing Style Analysis
by: Noah Syrkis

## Setup

In [25]:
from src.utils import get_data, get_paired_dataset
import torch
from torch import nn, optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import numpy as np
from matplotlib import pyplot as plt
import warnings; warnings.simplefilter('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, balanced_accuracy_score, roc_auc_score
# make performance report
from sklearn.metrics import classification_report
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import wandb
api = wandb.Api()

In [26]:
# constants
dataset_names = [f'pan23-multi-author-analysis-dataset{i}' for i in range(1, 4)]
PAD = '<PAD>'  # id 0
UNK = '<UNK>'  # id 1
PAD_ID = 0
UNK_ID = 1
batch_size = 32
block_size = 8

In [3]:
# read in data
one_train_data = get_data(dataset_names[0], 'train')
one_valid_data = get_data(dataset_names[0], 'validation')
# two_train_data = get_data(dataset_names[1], 'train')
# two_valid_data = get_data(dataset_names[1], 'validation')
# three_train_data = get_data(dataset_names[2], 'train')
# three_valid_data = get_data(dataset_names[2], 'validation')

100%|██████████| 4200/4200 [00:03<00:00, 1355.93it/s]
100%|██████████| 900/900 [00:00<00:00, 1252.40it/s]


In [27]:
# functions
encode = lambda x, stoi: [stoi.get(w, UNK_ID) for w in x]
decode = lambda x, itos: ''.join([itos.get(i, UNK) for i in x])
flatten = lambda x: [i for j in x for i in j]

def make_word_vocab(dataset, min_count=50):
    # make a vocabulary from the training set
    vocab = Counter(flatten(flatten(dataset['text'])))
    vocab = [w for w, c in vocab.most_common() if c > min_count] + ['\n', ' '] + list('abcdefghijklmnopqrstuvwxyz') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vocab = [PAD, UNK] + sorted(list(set(vocab)))
    # return UNK if word is not in vocab
    stoi = defaultdict(lambda: 1, {w: i for i, w in enumerate(vocab)})
    itos = {i: w for i, w in enumerate(vocab)}
    return stoi, itos


def make_char_vocab(dataset, min_count=50):
    # make a vocabulary from the training set
    vocab = Counter(flatten(flatten(dataset['text'])))
    vocab = [w for w, c in vocab.most_common() if c > min_count] + ['\n', ' '] + list('abcdefghijklmnopqrstuvwxyz') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vocab = [PAD, UNK] + sorted(list(set(vocab)))
    # return UNK if word is not in vocab
    stoi = defaultdict(lambda: 1, {w: i for i, w in enumerate(vocab)})
    itos = {i: w for i, w in enumerate(vocab)}
    return stoi, itos

In [28]:
ctoi, itoc = make_char_vocab(one_train_data)  # character vocab
wtoi, itow = make_word_vocab(one_train_data)  # word vocab

## Baselines

In [29]:
base_one_train_data = get_paired_dataset(one_train_data)
base_one_valid_data = get_paired_dataset(one_valid_data)
chars = [PAD, UNK] + list('abcdefghijklmnopqrstuvwxyz') + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') + ['\n', ' ', '.', ',', '!', '?', ':', ';', '"', "'", '(', ')', '-', '_', '/', '\\', '|', '[', ']', '{', '}', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>', '`', '~']
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}
encode_base = lambda x: [ctoi.get(c, UNK_ID) for c in x]
base_one_train_data[['doc1_idx', 'doc2_idx']] = base_one_train_data[['doc1', 'doc2']].applymap(encode_base)
base_one_valid_data[['doc1_idx', 'doc2_idx']] = base_one_valid_data[['doc1', 'doc2']].applymap(encode_base)

In [30]:
tfidf = TfidfVectorizer(max_features=200)
tfidf.fit(base_one_train_data['doc1'] + base_one_train_data['doc2'])
base_one_train_data['doc1_tfidf'] = tfidf.transform(base_one_train_data['doc1']).toarray().tolist()
base_one_train_data['doc2_tfidf'] = tfidf.transform(base_one_train_data['doc2']).toarray().tolist()
base_one_valid_data['doc1_tfidf'] = tfidf.transform(base_one_valid_data['doc1']).toarray().tolist()
base_one_valid_data['doc2_tfidf'] = tfidf.transform(base_one_valid_data['doc2']).toarray().tolist()

In [31]:
X_train = np.array(base_one_train_data['doc1_tfidf'].tolist()) - np.array(base_one_train_data['doc2_tfidf'].tolist())
y_train = base_one_train_data['change']
X_valid = np.array(base_one_valid_data['doc1_tfidf'].tolist()) - np.array(base_one_valid_data['doc2_tfidf'].tolist())
y_valid = base_one_valid_data['change']

In [32]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    reg_alpha=0.1,
    reg_lambda=10,
    )
model.fit(X_train, y_train)
pred_train = model.predict(X_train)
pred_valid = model.predict(X_valid)

In [33]:
def evaluate_predictions(name, y_true, y_pred):
    print(name)
    print('Weigthed F1:', f1_score(y_true, y_pred, average='weighted'))
    print()

evaluate_predictions('Train XGBoost', y_train, pred_train)
evaluate_predictions('Valid XGBoost', y_valid, pred_valid)
evaluate_predictions('Train ones', y_train, np.ones_like(y_train))
evaluate_predictions('Valid ones', y_valid, np.ones_like(y_valid))

Train XGBoost
Weigthed F1: 0.9195433929494176

Valid XGBoost
Weigthed F1: 0.8235611190705914

Train ones
Weigthed F1: 0.8228830172296793

Valid ones
Weigthed F1: 0.8047955216326438



## Models

In [149]:
@torch.no_grad()
def evaluate(model, train_data, val_data, batch_size, prev_losses, batch_fn):
    model.eval()
    losses = {}
    f1s = {}
    for split, data in [('train', train_data), ('valid', val_data)]:
        losses[split] = []
        f1s[split] = []
        for _ in range(10):
            x1s, x2s, ys = batch_fn(data, batch_size)
            ys_hat, loss = model(x1s, x2s, ys)
            f1 = f1_score(ys.flatten().numpy(), ys_hat.argmax(1).flatten().numpy(), average='weighted')
            losses[split].append(loss.item())
            f1s[split].append(f1)
        f1s[split] = np.mean(f1s[split])
        losses[split] = np.mean(losses[split])
    model.train()
    if prev_losses is not None and prev_losses['valid'] > losses['valid'] and prev_losses['train'] > losses['train']:
        torch.save(model.state_dict(), 'lm.pth')
    return losses, f1s


def train(model, train_data, valid_data, opt, conf, batch_fn, losses=None):
    wandb.init(project='mawsa', entity='syrkis', config=conf)
    # experiment.add_pytorch_models({'model': model})
    for i in range(conf['n_iters']):
        x1s, x2s, ys = batch_fn(train_data, conf['batch_size'])
        _, loss = model(x1s, x2s, ys)
        opt.zero_grad()
        loss.backward()
        opt.step()
        if i % (conf['n_iters'] // 50) == 0:
            losses, f1s = evaluate(model, train_data, valid_data, conf['batch_size'], losses, batch_fn)
            wandb.log({'train_loss': losses['train'], 'valid_loss': losses['valid'], 'train_f1': f1s['train'], 'valid_f1': f1s['valid']})

### GRU

#### Seperate samples

In [39]:
train_data = get_paired_dataset(one_train_data)
valid_data = get_paired_dataset(one_valid_data)
train_data['doc1'] = train_data['doc1'].apply(lambda x: encode(x, ctoi))
train_data['doc2'] = train_data['doc2'].apply(lambda x: encode(x, ctoi))
valid_data['doc1'] = valid_data['doc1'].apply(lambda x: encode(x, ctoi))
valid_data['doc2'] = valid_data['doc2'].apply(lambda x: encode(x, ctoi))


In [150]:
def pad_sequence(xs, max_len):
    return torch.tensor([x + [PAD_ID] * (max_len - len(x)) for x in xs])

def get_batch(data, batch_size):
    idxs = np.random.choice(len(data), batch_size)
    xs = data.iloc[idxs]
    ys = xs['change'].values
    xs = xs[['doc1', 'doc2']].values
    max_len = max([len(x) for x in xs.flatten().tolist()])
    xs = xs.flatten()
    xs = pad_sequence(xs, max_len)
    xs = xs.view(batch_size, 2, -1)
    ys = torch.tensor(ys)
    x1s = xs[:, 0, :]
    x2s = xs[:, 1, :]
    return x1s, x2s, ys
# get_batch(train_data, 12).shape

In [155]:
class GRUSiam(nn.Module):

    def __init__(self, vocab_size, emb_size, hidden_size, n_layers, dropout):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, n_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x1, x2, y=None):
        x1 = self.emb(x1)
        x2 = self.emb(x2)
        x1, _ = self.gru(x1)
        x2, _ = self.gru(x2)
        x1 = x1[:, -1, :]
        x2 = x2[:, -1, :]
        x = torch.abs(x1 - x2)
        x = self.fc(x)
        if y is not None:
            loss = F.binary_cross_entropy_with_logits(x, y.view(-1, 1).float())
        else:
            loss = None
        return x, loss


In [156]:
conf = {'n_iters': 100, 'batch_size': 4, 'n_layers': 2, 'dropout': 0.1, 'emb_size': 32, 'hidden_size': 32}
model = GRUSiam(len(ctoi), conf['emb_size'], conf['hidden_size'], conf['n_layers'], conf['dropout']) 
opt = torch.optim.Adam(model.parameters(), lr=0.001)
train(model, train_data, valid_data, opt, conf, get_batch)

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_f1,▅▅▄▁▃▃█▄▃▃▄▃▃▄▅▄▄▃▃
train_loss,█▇▂▁▃▃▆▇▅▂▂▂▁▄▄▁▆▃▂
valid_f1,▁▃▃█▃▂▃▂▃▃▃▂▄▁▂▅▃▂▃
valid_loss,█▆▆▆▇▅▅▄▆▂▄▄▇▃▅▆▄▁█

0,1
train_f1,0.04
train_loss,0.55869
valid_f1,0.07
valid_loss,0.66059


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01673384159997416, max=1.0)…