# RATIO 2019 - Benchmarking Workshop

Reference notebook (XLNet IMDB fine-tuning on Colab, GPU): https://github.com/zihangdai/xlnet/blob/master/notebooks/colab_imdb_gpu.ipynb

### Setup

In [None]:
# Create the conda environment described in environment.yml.
! conda env create -f environment.yml
# Install git-lfs from conda-forge, enable it, and pull the LFS-tracked files.
# NOTE(review): `conda activate` here only affects this single shell line, not
# the notebook kernel -- confirm the kernel itself runs inside argmining19-ssc.
! conda activate argmining19-ssc && conda install -y -c conda-forge git-lfs && git lfs install && git lfs pull

In [None]:
! pip install sentencepiece
! pip install absl-py
# ! pip install tensorflow-auto-detect
! pip install tensorflow-gpu

In [None]:
import os

if not os.path.exists('data/xlnet_cased_L-24_H-1024_A-16'):
    ! wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip
    ! mv cased_L-24_H-1024_A-16.zip data/
    ! cd data/ && unzip cased_L-24_H-1024_A-16.zip
else:
    print('Have XLNet model already!')

In [None]:
if not os.path.exists('xlnet'):
    ! git clone https://github.com/zihangdai/xlnet.git
else:
    print('Should have repo already!')
    ! cd xlnet && git pull

### Convert data format

Reference for the TSV layout written below: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e

In [None]:
import csv

import pandas as pd

In [None]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'


def _load_split(path_template, split):
    """Read one split of a same-side-classification CSV into a DataFrame.

    ``path_template`` is one of the ``data_*_path`` templates and ``split`` the
    name substituted into it ('training' or 'test').

    Parser settings: '"' as quote character with ``csv.QUOTE_ALL``, backslash
    as escape character, no doubled quotes, UTF-8 encoding, and the 'id'
    column used as the frame's index.
    """
    return pd.read_csv(path_template.format(split),
                       index_col='id',
                       encoding='utf-8',
                       quoting=csv.QUOTE_ALL,
                       quotechar='"',
                       escapechar='\\',
                       doublequote=False)


cross_traindev_df = _load_split(data_cross_path, 'training')
cross_test_df = _load_split(data_cross_path, 'test')

within_traindev_df = _load_split(data_within_path, 'training')
within_test_df = _load_split(data_within_path, 'test')

In [None]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Set ``row['tag']`` from the row's topic and return the same row.

    The topic text is lower-cased and stripped, then searched for the focus
    keywords in order: 'abortion' first, then 'gay marriage'. The first match
    becomes the tag; when neither occurs the tag is the string 'NA'.
    The row is modified in place.
    """
    topic_text = row['topic'].lower().strip()

    tag = 'NA'
    # Order matters: a topic mentioning both keywords is tagged 'abortion'.
    for keyword in ('abortion', 'gay marriage'):
        if keyword in topic_text:
            tag = keyword
            break

    row['tag'] = tag
    return row


# Tag every row of all four frames; each name is rebound to the frame returned
# by the row-wise apply.
cross_traindev_df, cross_test_df, within_traindev_df, within_test_df = [
    frame.apply(add_tag, axis=1)
    for frame in (cross_traindev_df, cross_test_df,
                  within_traindev_df, within_test_df)
]

In [None]:
from sklearn.model_selection import train_test_split


def get_train_test_sets(df, ratio=0.30, random_state=1):
    """Shuffle ``df`` and split it into a training part and a held-out part.

    Args:
        df: frame with the columns 'argument1', 'argument2', 'argument1_id',
            'argument2_id', 'topic' and 'is_same_side'.
        ratio: value passed as ``test_size`` to ``train_test_split``.
        random_state: seed passed to ``train_test_split``.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``. The ``X`` frames hold
        the five feature columns; the ``y`` frames are one-column frames
        containing 'is_same_side'.
    """
    feature_columns = ['argument1', 'argument2', 'argument1_id',
                       'argument2_id', 'topic']
    label_columns = ['is_same_side']

    return tuple(
        train_test_split(df[feature_columns],
                         df[label_columns],
                         test_size=ratio,
                         random_state=random_state,
                         shuffle=True))

In [None]:
# Within-topic task: split the training file into train/dev parts and use the
# test file unchanged.
X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)
X_test = within_test_df
# Cross-topic alternative: uncomment the two lines below (and comment out the
# two above) to prepare the cross-topic task instead.
# X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)
# X_test = cross_test_df

In [None]:
import os
from tqdm import tqdm

# Characters that would break the one-record-per-line, tab-separated layout.
_TSV_UNSAFE = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})


def _tsv_field(value):
    """Return ``value`` as text with tabs and line breaks replaced by spaces."""
    return str(value).translate(_TSV_UNSAFE)


def _write_tsv(path, header, records, total=None):
    """Write ``header`` followed by one tab-separated line per record.

    Args:
        path: output file, overwritten and encoded as UTF-8.
        header: complete header line including its trailing newline.
        records: iterable of field tuples; every field is passed through
            ``_tsv_field`` so free text cannot introduce extra columns or rows.
        total: optional record count, only used for the progress bar.
    """
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(header)
        for fields in tqdm(records, total=total):
            fh.write('\t'.join(_tsv_field(field) for field in fields) + '\n')


def _labelled_records(df):
    """Yield (label, id1, id2, text1, text2); label is 1 if is_same_side is truthy, else 0."""
    for _, row in df.iterrows():
        yield (1 if row['is_same_side'] else 0, row['argument1_id'],
               row['argument2_id'], row['argument1'], row['argument2'])


def _indexed_records(df):
    """Yield (index, id1, id2, text1, text2) using the frame's index as first field."""
    for id_, row in df.iterrows():
        yield (id_, row['argument1_id'], row['argument2_id'],
               row['argument1'], row['argument2'])


DATA_DIR = os.path.join('data/xlnet-in', 'ssc-within')
# DATA_DIR = os.path.join('data/xlnet-in', 'ssc-cross')
# Creates missing parent folders too and is a no-op when the folder exists.
os.makedirs(DATA_DIR, exist_ok=True)

# Re-attach the labels to the feature frames (joined on the shared index).
train_df = X_train.join(y_train)
dev_df = X_dev.join(y_dev)
test_df = X_test

_write_tsv(os.path.join(DATA_DIR, 'train.tsv'),
           "label\t#1 ID\t#2 ID\t#1 String\t#2 String\n",
           _labelled_records(train_df), total=len(train_df))

_write_tsv(os.path.join(DATA_DIR, 'dev.tsv'),
           "label\t#1 ID\t#2 ID\t#1 String\t#2 String\n",
           _labelled_records(dev_df), total=len(dev_df))

# The test file carries the row index instead of a label.
_write_tsv(os.path.join(DATA_DIR, 'test.tsv'),
           "index\t#1 ID\t#2 ID\t#1 String\t#2 String\n",
           _indexed_records(test_df), total=len(test_df))

In [None]:
# Show the header line and the first record of the exported training file.
! head -n 2 data/xlnet-in/ssc-within/train.tsv

### Variables

In [None]:
# Settings for the run_classifier.py calls below; the #@param markers are
# Colab form-field annotations.
# NOTE(review): confirm that the cloned run_classifier.py actually registers a
# processor for this task name.
TASK_NAME = 'ssc-within' #@param{type:"string"}
SCRIPTS_DIR = 'xlnet' #@param {type:"string"}
# NOTE(review): repeats the folder built in the TSV-export cell -- keep both in sync.
DATA_DIR = 'data/xlnet-in/ssc-within' #@param {type:"string"}
OUTPUT_DIR = 'data/xlnet-out/ssc-within' #@param {type:"string"}
PRETRAINED_MODEL_DIR = 'data/xlnet_cased_L-24_H-1024_A-16' #@param {type:"string"}
CHECKPOINT_DIR = 'data/xlnet-chkp/ssc-within' #@param {type:"string"}

# Kept as strings because they are concatenated into the shell command line.
MAX_SEQ_LEN = '128'
BATCH_SIZE = '6'

### Run model (training & evaluation)

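**TODO:** It is not yet clear which of the **train/dev/test** files XLNet's `run_classifier.py` uses for training and for evaluation — verify this before interpreting the results.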

In [None]:
train_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py \
  --do_train=True \
  --do_eval=True \
  --eval_all_ckpt=True \
  --task_name=" + TASK_NAME + " \
  --data_dir=" + DATA_DIR + " \
  --output_dir=" + OUTPUT_DIR + " \
  --model_dir=" + CHECKPOINT_DIR + " \
  --uncased=False \
  --spiece_model_file=" + PRETRAINED_MODEL_DIR + "/spiece.model \
  --model_config_path=" + PRETRAINED_MODEL_DIR + "/xlnet_config.json \
  --init_checkpoint=" + PRETRAINED_MODEL_DIR + "/xlnet_model.ckpt \
  --max_seq_length=" + MAX_SEQ_LEN + " \
  --train_batch_size=" + BATCH_SIZE + " \
  --eval_batch_size=" + BATCH_SIZE + " \
  --num_hosts=1 \
  --num_core_per_host=1 \
  --learning_rate=2e-5 \
  --train_steps=40000 \
  --warmup_steps=500 \
  --save_steps=500 \
  --iterations=1000"

! {train_command}

In [None]:
# Print run_classifier.py's command-line help.
help_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py --help"

! {help_command}