In [None]:
#default_exp en_task.run_folds_task2

In [None]:
#export
import os

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import transformers

import Hasoc.config as config
import Hasoc.utils.utils as utils
import Hasoc.utils.engine as engine
import Hasoc.model.model as model
import Hasoc.dataset.dataset as dataset

from functools import partial
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from catalyst.data.sampler import BalanceClassSampler
from transformers import AdamW, get_linear_schedule_with_warmup

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
#export
# Fixed seed handed to the project's seeding helper before any data loading or training.
SEED = 42
utils.seed_everything(SEED)

In [None]:
#export
# Training data with fold assignments; run() splits on its `kfold_task2` column.
df = pd.read_csv(config.DATA_PATH/'fold_df.csv')

In [None]:
#hide
df.head(2)

Unnamed: 0,tweet_id,text,task1,task2,ID,kfold_task1,kfold_task2
0,1.126953e+18,"We need a word for ‘going somewhere alone,sitt...",NOT,NONE,hasoc_2020_en_1503,2,0
1,1.123482e+18,RT @RiverCityLabs: Come and work from our spac...,NOT,NONE,hasoc_2020_en_3570,2,0


In [None]:
#hide
df.shape

(3708, 7)

In [None]:
#export
# Learn the task2 label vocabulary; fit() hands back the fitted encoder itself.
le = LabelEncoder().fit(df.task2)
le.classes_

array(['HATE', 'NONE', 'OFFN', 'PRFN'], dtype=object)

In [None]:
#export
# Integer-encoded task2 labels (via the fitted `le`); run() uses this column as the training target.
df['task2_encoded'] = le.transform(df.task2.values)

In [None]:
os.listdir(config.DATA_PATH)

['en_task_a', 'raw', 'fold_df.csv', '.ipynb_checkpoints']

In [None]:
#export
# Test set: run() predicts on its `text` column and ensemble() builds the submission from it.
test_df = pd.read_csv(config.DATA_PATH/'en_task_a/english_test.csv')

In [None]:
#hide
test_df.head()

Unnamed: 0,tweet_id,text,task1,task2,ID
0,1130081762154090497,RT @delmiyaa: Samini resetting the show and mo...,NOT,NONE,hasoc_2020_en_2713
1,1130048316807491584,@Swxnsea how do you know that he’s left?,HOF,NONE,hasoc_2020_en_3874
2,1123657766143504386,Tried to get Divock Origi on a free seeing as ...,NOT,NONE,hasoc_2020_en_281
3,1126782963042013186,RT @nutclusteruwu: that....is yalls stupid whi...,HOF,PRFN,hasoc_2020_en_2026
4,1130159113529434113,&amp; IT DID. But a bitch got big girls things...,HOF,PRFN,hasoc_2020_en_4023


In [None]:
#export
def run(fold, num_epochs=6):
    """Train the task-2 model on one cross-validation fold and save test predictions.

    Rows of the module-level ``df`` whose ``kfold_task2`` equals ``fold`` form the
    validation set; all other rows form the training set. After training, the
    checkpoint written for this fold is reloaded into a fresh model, predictions
    for ``test_df`` are computed and stored as
    ``../outputs/submission_EN_B_{fold}.npy`` for later use by ``ensemble()``.

    Args:
        fold: Value of the ``kfold_task2`` column to hold out for validation.
        num_epochs: Number of training epochs; also determines the length of
            the learning-rate schedule.

    Returns:
        None. Results are written to disk.
    """
    train_df = df.query(f'kfold_task2!={fold}').reset_index(drop=True)
    valid_df = df.query(f'kfold_task2=={fold}').reset_index(drop=True)

    # The dataset is built once on its own so that its labels can be handed to the
    # balancing sampler, which upsamples the training classes.
    train_ds = utils.create_loader(train_df.text.values, train_df.task2_encoded, bs=config.TRAIN_BATCH_SIZE,
                                   ret_dataset=True)
    train_dl = utils.create_loader(train_df.text.values, train_df.task2_encoded, bs=config.TRAIN_BATCH_SIZE,
                                   sampler=BalanceClassSampler(labels=train_ds.get_labels(), mode="upsampling"))
    valid_dl = utils.create_loader(valid_df.text.values, valid_df.task2_encoded, bs=config.VALID_BATCH_SIZE)

    modeller = model.HasocModel(len(le.classes_), drop=0.6)
    model_params = list(modeller.named_parameters())

    # Parameters whose *name contains* one of these fragments are excluded from
    # weight decay. Names are dotted paths (e.g. '...LayerNorm.weight'), so the
    # match has to be by substring; an exact-equality test never matches and
    # would silently apply weight decay to every parameter.
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

    def _skip_decay(param_name):
        return any(fragment in param_name for fragment in no_decay)

    optimizer_params = [
        {'params': [p for n, p in model_params if not _skip_decay(n)],
         'weight_decay': 0.001},
        # no weight decay should be applied
        {'params': [p for n, p in model_params if _skip_decay(n)],
         'weight_decay': 0.0},
    ]

    # Hard-coded learning rate (config.LR is deliberately not used here).
    lr = 1e-4
    optimizer = AdamW(optimizer_params, lr=lr)

    # Total optimisation steps = batches actually served per epoch (the upsampled
    # loader, not the raw dataframe) times the epochs requested for this call.
    # NOTE(review): assumes the fitter steps the scheduler once per batch — confirm
    # against engine.BertFitter.
    num_train_steps = len(train_dl) * num_epochs

    # num_training_steps is the full schedule length *including* warmup, so the
    # warmup steps must not be subtracted from it.
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=min(20, num_train_steps),
                                                num_training_steps=num_train_steps)

    # Label-smoothed cross entropy as the loss, macro-averaged F1 as the metric.
    fit = engine.BertFitter(modeller, (train_dl, valid_dl), optimizer, utils.LabelSmoothingCrossEntropy(),
                            partial(f1_score, average='macro'), config.DEVICE, scheduler=scheduler,
                            log_file='en_task2_log.txt')

    model_path = config.MODEL_PATH/f'en_task2_{fold}.pth'
    fit.fit(num_epochs, model_path=os.fspath(model_path), show_graph=False)

    # Labels are placeholders: the loader is built in test mode.
    test_dl = utils.create_loader(test_df.text.values, lbls=[None]*len(test_df.text.values),
                                  bs=config.VALID_BATCH_SIZE, is_test=True)

    # Reload the checkpoint stored at model_path into a fresh model for inference.
    modeller = model.HasocModel(len(le.classes_))
    modeller.load_state_dict(torch.load(model_path))

    preds = engine.get_preds(test_dl.dataset, test_dl, modeller, config.DEVICE, ensemble_proba=True)

    # Make sure the output directory exists so a long training run is not lost
    # to a missing folder at save time.
    out_dir = os.path.join('..', 'outputs')
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, f'submission_EN_B_{fold}.npy'), preds)

In [None]:
#export
# Train the folds one after another; each run() call stores that fold's test predictions.
for fold_idx in range(5):
    run(fold_idx)

epoch,train_loss,valid_loss,metric,time
1,0.838752,0.947949,0.552764,0:5:55
2,0.433689,0.948502,0.589301,0:6:1
3,0.378118,0.956992,0.586607,0:5:49
4,0.380649,0.957173,0.58655,0:3:9
5,0.377751,0.957177,0.586549,0:3:8
6,0.377655,0.957177,0.586549,0:4:53


epoch,train_loss,valid_loss,metric,time
1,0.871337,0.950515,0.52242,0:3:35
2,0.445016,0.923976,0.55445,0:3:14
3,0.384882,0.918306,0.587731,0:4:41
4,0.382308,0.918185,0.588439,0:5:58
5,0.379654,0.918182,0.588454,0:6:1
6,0.381721,0.918182,0.588454,0:4:51


epoch,train_loss,valid_loss,metric,time
1,0.930174,0.995411,0.501469,0:5:54
2,0.474099,0.926649,0.578121,0:6:1
3,0.39088,0.946884,0.567922,0:5:11
4,0.389421,0.947314,0.567705,0:3:9
5,0.393117,0.947323,0.5677,0:3:9
6,0.387758,0.947324,0.5677,0:5:30


epoch,train_loss,valid_loss,metric,time
1,0.843593,0.86411,0.582778,0:3:14
2,0.442409,0.88897,0.59842,0:3:15
3,0.385222,0.887524,0.621946,0:3:6
4,0.380899,0.887494,0.622446,0:3:3
5,0.375378,0.887493,0.622457,0:3:4
6,0.379337,0.887493,0.622457,0:3:2


epoch,train_loss,valid_loss,metric,time
1,0.834347,0.813219,0.598235,0:3:3
2,0.4314,0.8679,0.617823,0:3:3
3,0.387252,0.860864,0.623151,0:3:3
4,0.381422,0.860714,0.623265,0:3:3
5,0.380474,0.860711,0.623267,0:3:3
6,0.381035,0.860711,0.623267,0:3:3


In [None]:
#export
def ensemble(n_folds=5):
    """Average the per-fold test predictions and write the task-2 submission CSV.

    Loads ``../outputs/submission_EN_B_{fold}.npy`` for ``fold`` in
    ``range(n_folds)``, averages the arrays element-wise, takes the argmax over
    the last axis, maps the resulting indices back to label strings with the
    module-level ``le`` and writes ``../outputs/submission_EN_B.csv`` containing
    the columns of ``test_df`` other than ``text``/``task1``, plus the predicted
    ``task2``.

    Args:
        n_folds: Number of fold prediction files to average (default 5).

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be at least 1, got {n_folds}')

    out_dir = os.path.join('..', 'outputs')

    # Accumulate fold by fold so only one extra array is held at a time.
    summed = None
    for fold in range(n_folds):
        fold_preds = np.load(os.path.join(out_dir, f'submission_EN_B_{fold}.npy'))
        summed = fold_preds if summed is None else summed + fold_preds
    mean_preds = summed / n_folds

    # Highest averaged score per row -> class index -> original label string.
    labels = le.inverse_transform(np.argmax(mean_preds, axis=-1))

    # Drop the text and the original label columns, then attach the predictions.
    submission_df = test_df.drop(columns=['text', 'task1', 'task2']).copy()
    submission_df['task2'] = labels

    submission_df.to_csv(os.path.join(out_dir, 'submission_EN_B.csv'), index=False)

In [None]:
#export
# Combine the saved per-fold predictions into ../outputs/submission_EN_B.csv.
ensemble()

In [None]:
pd.read_csv(os.path.join('..', 'outputs', f'submission_EN_B.csv'))

Unnamed: 0,tweet_id,ID,task2
0,1130081762154090497,hasoc_2020_en_2713,NONE
1,1130048316807491584,hasoc_2020_en_3874,NONE
2,1123657766143504386,hasoc_2020_en_281,NONE
3,1126782963042013186,hasoc_2020_en_2026,PRFN
4,1130159113529434113,hasoc_2020_en_4023,PRFN
...,...,...,...
809,1127061607433900032,hasoc_2020_en_1212,NONE
810,1123685826074951681,hasoc_2020_en_3435,NONE
811,1126882552587927552,hasoc_2020_en_3987,HATE
812,1130294488859996160,hasoc_2020_en_1176,NONE


In [None]:
test_df.query('tweet_id == "1126782963042013186"').text.values[0]

'RT @nutclusteruwu: that....is yalls stupid white ass reactions meeting tom holland in disneyland? are you fucking kidding me i would have d…'