# Statement Verification in Tabular Data

## Installing dependencies:

In [None]:
# Check the installed torch version and whether all required packages are already present; if any are missing, run the install cells below
!pip freeze | grep ^torch==
!pip freeze | grep ^transformers==
!pip freeze | grep ^datasets==
!pip freeze | grep ^torch-scatter==

In [None]:
! pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip install transformers==4.1.1 
!pip install datasets

In [None]:
# Ensure that the torch-scatter wheel matches both the installed PyTorch version and its CUDA build (cu101)
!pip install --no-index --verbose torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu101.html

## Connecting to Google Drive:

In [None]:
# Mount Google Drive at /content/drive (needs the Colab runtime); PATH_ROOT, defined further down, points inside this mount
from google.colab import drive
drive.mount("/content/drive")

## Reset

In [None]:
%reset -f

## Set TAPAS Version

In [None]:
# Folder on the mounted Drive that holds the merged statements CSV and the per-table CSV files.
# Keep the trailing slash: file paths are built from this value by plain string concatenation.
PATH_ROOT = "/content/drive/MyDrive/SemTabFact/csv_aug_data/"
PATH_CSV = PATH_ROOT

In [None]:
# Identifiers of the TabFact-finetuned TAPAS checkpoints that can be passed to from_pretrained (sizes as noted by the author)
TAPAS_SMALL = "google/tapas-small-finetuned-tabfact" # 117 MB
TAPAS_MEDIUM = "google/tapas-medium-finetuned-tabfact" # 168 MB
TAPAS_BASE  = "google/tapas-base-finetuned-tabfact"  # 443 MB
TAPAS_LARGE = "google/tapas-large-finetuned-tabfact" # 1.35 GB

TAPAS_VERSION = TAPAS_BASE # The version used in the rest of the notebook (both the tokenizer and the model are loaded from it)

## Preparing the custom dataset and DataLoaders

In [None]:
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader
from transformers import TapasTokenizer, TapasForSequenceClassification

In [None]:
class TableDataset(Dataset):
    """ 
        Map-style dataset that turns one (table, statement, label) row into TAPAS inputs.

        Each row of `dataframe` must provide `table_name` (CSV file name, appended
        to `csv_root_path`), `statement` and `label`; the row's index value is
        returned as the sample id.

        References: 
        https://huggingface.co/transformers/model_doc/tapas.html#usage-fine-tuning
        https://pytorch.org/tutorials/beginner/data_loading_tutorial.html 
    """

    def __init__(self, csv_root_path, dataframe, tokenizer):
        self.csv_root_path = csv_root_path
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        """ Number of samples, i.e. number of rows in the dataframe """
        return len(self.dataframe)

    def _read_table(self, table_name):
        """ Loads the CSV of the given table with every cell and column label as str """
        # The root path is prepended as-is, so it has to end with a path separator
        frame = pd.read_csv(
            self.csv_root_path + table_name,
            header = None,
            index_col = None
        ).astype(str)
        # TapasTokenizer expects everything to be str, even the column headers
        frame.columns = frame.columns.astype(str)
        return frame

    def __getitem__(self, idx):
        """ Returns the ith sample (a dict), so dataset[i] can be used directly """

        row = self.dataframe.iloc[idx]

        # Tokenize the (table, statement) pair: https://huggingface.co/transformers/model_doc/tapas.html#transformers.TapasTokenizer
        encoded = self.tokenizer(
            table = self._read_table(row.table_name),
            queries = str(row.statement), # A single statement per sample
            truncation = True, # Needed for batch_size > 1: shrinks the table so that every sample fits the model's maximum length
            padding = "max_length", # Pad every sample to the maximum length with the [PAD] token
            return_tensors = "pt" # Return PyTorch tensors
        )

        # The tokenizer adds a redundant leading dimension (not the DataLoader's batch dimension); drop it
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)

        sample["label"] = row.label
        # The dataframe index value is kept as the id, for verification
        sample["id"] = int(row.name)

        return sample

In [None]:
def getDataLoader(csv_root_path, df, tokenizer, batch_size, shuffle = False):
    """ 
        Returns the DataLoader used for training/finetuning/validation 

        Args:
            csv_root_path: prefix prepended to every table file name
            df: dataframe with one row per (table, statement, label) sample
            tokenizer: tokenizer handed to the TableDataset
            batch_size: number of samples per batch
            shuffle: reshuffle the samples at every epoch; defaults to False,
                which keeps the order of `df` (the previous hard-coded behaviour)
    """

    dataset = TableDataset(
        csv_root_path = csv_root_path,
        dataframe = df, 
        tokenizer = tokenizer
    )

    dataloader = DataLoader(
        dataset, 
        shuffle = shuffle,
        batch_size = batch_size
    )

    return dataloader

In [None]:
def loadDF(path):
    """ Reads the main CSV (filenames, statements and labels) into a DataFrame indexed by "id" """
    read_options = {"index_col" : 0} # The first column holds the ids and becomes the index
    frame = pd.read_csv(path, **read_options)
    frame.index.name = "id" 
    return frame

## Loading the pre-trained TAPAS model

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU; the model and every batch are moved to this device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# See which GPU has been allotted. Querying the current CUDA device raises on a
# CPU-only runtime, so the lookup is skipped there.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("No CUDA device available; running on the CPU")

In [None]:
def countParameters(model):
    """ Returns (trainable, frozen): the number of parameter elements with and without requires_grad """
    totals = {True : 0, False : 0} # Keyed by the parameter's requires_grad flag
    for weight in model.parameters():
        totals[weight.requires_grad] += weight.numel()
    return totals[True], totals[False]

In [None]:
def loadModel(tapas_version, n_classes = 3):
    """ Loads the pre-trained TAPAS classifier and swaps its head for a fresh n_classes-way linear layer """
    model = TapasForSequenceClassification.from_pretrained(tapas_version)

    # Record the new number of labels on both the model and its config
    for holder in (model, model.config):
        holder.num_labels = n_classes

    # Replace the pre-trained classifier with a completely new, untrained one
    hidden_size = model.config.hidden_size
    model.classifier = torch.nn.Linear(hidden_size, n_classes, bias = True)

    return model

## Training/Finetuning

### Load the custom Tokenizer

In [None]:
# Instantiate the tokenizer:
tokenizer = TapasTokenizer.from_pretrained(TAPAS_VERSION) # Encodes each (table, statement) pair; the label is not tokenized, it is attached to the sample separately

### Instantiating the data:

In [None]:
# PATH_DF is the path to the merged CSV: its first column becomes the id index, and the
# table_name, statement and label columns are the ones read by the rest of the notebook
PATH_DF = f"{PATH_CSV}data_merged.csv"
df = loadDF(PATH_DF)
display(df)

In [None]:
# Print the number of samples per label to see whether the classes are imbalanced
label_counts = df["label"].value_counts()
print(f"Class distribution:\n{label_counts}")

In [None]:
from sklearn.model_selection import train_test_split as tts
# 80/20 train/validation split; the fixed random_state keeps the split reproducible across runs
df_train, df_val = tts(df, train_size = 0.8, random_state = 42, shuffle = True) # Shuffle before splitting, because the merged data has the autotrained and manual samples segregated
print(len(df_train), len(df_val))

In [None]:
PATH_CSV_ROOT = PATH_CSV
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_VAL = 256 # No weights are updated during validation, so the exact value doesn't matter as long as it fits in memory
train_dataloader = getDataLoader(PATH_CSV_ROOT, df_train, tokenizer, BATCH_SIZE_TRAIN)
val_dataloader = getDataLoader(PATH_CSV_ROOT, df_val, tokenizer, BATCH_SIZE_VAL)

### Main methods:

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

def computeMetrics(y_true, y_pred):
    """ 
        Returns (accuracy, f1, precision, recall) for labels/predictions that live on the CPU.
        f1_micro would equal the accuracy, so f1, precision and recall are computed
        class-wise (average = None) instead.
    """
    targets = {"y_true" : y_true, "y_pred" : y_pred}
    acc = accuracy_score(**targets)
    f1, precision, recall = [
        scorer(average = None, **targets)
        for scorer in (f1_score, precision_score, recall_score)
    ]
    return acc, f1, precision, recall

In [None]:
def train(model, dataloader, optimizer):
    """ 
        Trains the model for one pass over the given training set.

        Args:
            model: model whose forward pass accepts input_ids, attention_mask,
                token_type_ids and labels, and whose output exposes .loss and .logits
            dataloader: yields dict batches with the keys "input_ids",
                "attention_mask", "token_type_ids" and "label"
            optimizer: optimizer over the model's parameters

        Returns:
            (avg_epoch_loss, metrics): the mean of the per-batch losses and a dict
            with the keys "acc", "f1", "precision" and "recall"
    """

    total_epoch_loss = 0
    y_true_epoch = []
    y_pred_epoch = []
    
    model.train() # Put the model in training mode

    for batch in tqdm(dataloader, desc = "Training: "):

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        true_labels = batch["label"].to(device) # Integer-encoded labels, one per sample, as expected by the model
    
        optimizer.zero_grad() # Zero the previous gradients
        
        outputs = model(
            input_ids = input_ids, 
            attention_mask = attention_mask, 
            token_type_ids = token_type_ids, 
            labels = true_labels
        )
        
        loss = outputs.loss 
        logits = outputs.logits # One score per class for every sample

        # Argmax along the last axis picks a single class per sample; the problem is not multilabel, so no threshold is involved
        model_predictions = logits.argmax(-1)
        
        loss.backward() # Compute gradients
        optimizer.step() # Make the updates
        
        total_epoch_loss += loss.item()
        # Copy the whole batch to plain Python ints in one go, instead of keeping
        # one device tensor per sample around until the end of the epoch
        y_true_epoch.extend(true_labels.tolist())
        y_pred_epoch.extend(model_predictions.tolist())
    
    avg_epoch_loss = total_epoch_loss/len(dataloader)
    acc, f1, precision, recall = computeMetrics(y_true = y_true_epoch, y_pred = y_pred_epoch) 
    metrics = {
        "acc" : acc,
        "f1" : f1,
        "precision" : precision,
        "recall" : recall
    }
        
    return avg_epoch_loss, metrics

In [None]:
def validate(model, dataloader, optimizer = None):
    """ 
        Evaluates the model on the given validation set without updating any weights.

        Args:
            model: model whose forward pass accepts input_ids, attention_mask,
                token_type_ids and labels, and whose output exposes .loss and .logits
            dataloader: yields dict batches with the keys "input_ids",
                "attention_mask", "token_type_ids" and "label"
            optimizer: unused; accepted (and optional) only so that existing
                calls that pass it keep working

        Returns:
            (avg_epoch_loss, metrics): the mean of the per-batch losses and a dict
            with the keys "acc", "f1", "precision" and "recall"
    """

    total_epoch_loss = 0
    y_true_epoch = []
    y_pred_epoch = []
    
    model.eval() # Put the model in validation mode

    with torch.no_grad(): # No gradients are needed for evaluation
        for batch in tqdm(dataloader, desc= "Validation: "):

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            true_labels = batch["label"].to(device) # Integer-encoded labels, one per sample
            
            outputs = model(
                input_ids = input_ids, 
                attention_mask = attention_mask, 
                token_type_ids = token_type_ids, 
                labels = true_labels
            )
            
            loss = outputs.loss
            logits = outputs.logits # One score per class for every sample

            model_predictions = logits.argmax(-1) # Predicted class per sample
                    
            total_epoch_loss += loss.item()
            # Copy the whole batch to plain Python ints in one go, instead of keeping
            # one device tensor per sample around until the end of the epoch
            y_true_epoch.extend(true_labels.tolist())
            y_pred_epoch.extend(model_predictions.tolist())
    
    avg_epoch_loss = total_epoch_loss/len(dataloader)
    acc, f1, precision, recall = computeMetrics(y_true = y_true_epoch, y_pred = y_pred_epoch) 
    metrics = {
        "acc" : acc,
        "f1" : f1,
        "precision" : precision,
        "recall" : recall
    }
     
    return avg_epoch_loss, metrics

###  Training and validation:

#### Set up the TAPAS model pre-trained on binary data

In [None]:
model = loadModel(TAPAS_VERSION)
# Move the model to the selected device (the GPU when one is available, otherwise the CPU)
model = model.to(device)

# Alternatively, restore a previously saved model instead of starting from the pre-trained checkpoint.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# model = torch.load("/content/drive/MyDrive/SemTabFact/tapas-base-3-classes-epoch-2-no-meta-941val.h5")

In [None]:
# Freeze the entire model: switch off gradient tracking for every parameter
for weight in model.parameters():
    weight.requires_grad_(False)

In [None]:
# Report how many parameter elements are currently trainable vs. frozen
n_trainable, n_frozen = countParameters(model)
print(f"The model has {n_trainable:,} trainable parameters and {n_frozen:,} frozen parameters")

In [None]:
# Encoder layers to unfreeze:
enc_layers = [-1, -2, -3] # Unfreeze a few encoders from the end

# Everything that should be trainable again: the selected encoder layers,
# followed by the pooler, the dropout and the classifier
modules_to_unfreeze = [model.tapas.encoder.layer[i] for i in enc_layers]
modules_to_unfreeze += [model.tapas.pooler, model.dropout, model.classifier]

for module in modules_to_unfreeze:
    for param in module.parameters():
        param.requires_grad = True

Use `print(model)` to see the model's architecture  
Use `print(model.config)` to see the model's configuration  

In [None]:
# Re-count the trainable vs. frozen parameter elements to confirm the current requires_grad settings
n_trainable, n_frozen = countParameters(model)
print(f"The model has {n_trainable:,} trainable parameters and {n_frozen:,} frozen parameters")

#### Main loop:

In [None]:
from transformers import AdamW

LR = 5e-5 # Recommended: 5e-5, 3e-5, 2e-5
N_EPOCHS = 1 # Recommended: 2, 3, 4

# eps prevents division by 0
optimizer = AdamW(model.parameters(), lr = LR, eps = 1e-8)

# One list per metric, filled in epoch by epoch: "t_*" for training, "v_*" for validation
history = {
    f"{split}_{metric}" : []
    for split in ("t", "v")
    for metric in ("loss", "acc", "f1", "prec", "recall")
}

In [None]:
for epoch in range(N_EPOCHS):

    print(f"Epoch: {epoch+1:02}")

    # Training pass
    train_loss, train_metrics = train(model, train_dataloader, optimizer)
    t_acc, t_f1, t_prec, t_recall = [train_metrics[key] for key in ("acc", "f1", "precision", "recall")]
    print(f"Train | Loss: {train_loss:.3f} | Accuracy: {t_acc:.3f} | F1: {t_f1} | Precision: {t_prec} | Recall: {t_recall}")

    # Save the model post training, don't wait for validation (in case Colab times out in between)
    # print("Saving the model ...\n")
    # model_save_path = f"/content/drive/MyDrive/SemTabFact/tapas-base-3-classes-epoch-3-no-meta.h5"
    # torch.save(model, model_save_path)

    # Validation pass
    val_loss, val_metrics = validate(model, val_dataloader, optimizer)
    v_acc, v_f1, v_prec, v_recall = [val_metrics[key] for key in ("acc", "f1", "precision", "recall")]
    print(f"Validation |  Loss: {val_loss:.3f} | Accuracy: {v_acc:.3f} | F1: {v_f1} | Precision: {v_prec} | Recall: {v_recall}")

    print("\n")

    # Record this epoch's numbers under the matching history keys
    epoch_record = {
        "t_loss" : train_loss,
        "t_acc" : t_acc,
        "t_f1" : t_f1,
        "t_prec" : t_prec,
        "t_recall" : t_recall,
        "v_loss" : val_loss,
        "v_acc" : v_acc,
        "v_f1" : v_f1,
        "v_prec" : v_prec,
        "v_recall" : v_recall,
    }
    for name, value in epoch_record.items():
        history[name].append(value)

## Plot Metrics

In [None]:
t_acc = history["t_acc"]
t_loss = history["t_loss"]
v_acc = history["v_acc"]
v_loss = history["v_loss"]

# Take the x-axis from what was actually recorded, so the plots still work when
# the history holds a different number of epochs than N_EPOCHS (interrupted run,
# cell re-run, ...), and use the same 1-based epoch axis for both figures
epochs = range(1, len(t_loss) + 1)

# Markers keep a single-epoch history visible (a one-point line draws nothing)
plt.plot(epochs, t_acc, marker = "o", label = "Train")
plt.plot(epochs, v_acc, marker = "o", label = "Validation")
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend()

plt.figure()
plt.plot(epochs, t_loss, marker = "o", label = "Train")
plt.plot(epochs, v_loss, marker = "o", label = "Validation")
plt.title("Loss")
plt.xlabel("Epoch")
plt.legend()

plt.show()