# feedback-prize-effectiveness

## import libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

import pandas as pd

import torch
import transformers
import datasets

import importlib
#importlib.reload(load_data)


: 

In [None]:
# Reinstall ipykernel for the interpreter that backs this kernel.
# A bare `python -m pip ...` line is shell syntax, not Python, and raises a
# SyntaxError when the cell is executed. Invoking pip through sys.executable
# also makes sure the package is installed into the active environment.
import subprocess

subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'ipykernel', '-U', '--force-reinstall'],
    check=True,  # surface a failed install instead of continuing silently
)

: 

In [None]:
# Read the KAGGLE_KERNEL_RUN_TYPE environment variable (presumably set only
# inside Kaggle kernels); an empty string is treated as "not on Kaggle".
is_kaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if not is_kaggle:
    # Put the parent directory on the import path before importing from `src`.
    sys.path.append('../')

    from src.utils import config
    #from src.utils import logger
    from src.data import load_data
    from src.models.huggingface_transformers import BERT
    from src.training.huggingface_transformers import trainer

: 

In [None]:
#logger=logger.Logger('test_log')

: 

In [None]:
# Build the run configuration for this competition and switch debug mode on.
CFG = config.CFG(competition_name='feedback-prize-effectiveness')
CFG.debug = True

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    CFG.device = torch.device('cuda:0')
else:
    CFG.device = torch.device('cpu')

: 

In [None]:
CFG.data_path

: 

## data

In [None]:
# Outside Kaggle, call the project's download helper for the competition
# directory under CFG.data_path.
if not CFG.is_kaggle:
    load_data.download_data(
        f"{CFG.data_path}{CFG.competition_name}/",
        CFG.competition_name,
    )

: 

In [None]:
# Load the train, test and sample-submission tables from the competition
# directory; CFG.debug is forwarded to the loader.
(train_csv, test_csv, sample_submission_csv) = load_data.load_data(
    f"{CFG.data_path}{CFG.competition_name}/",
    debug=CFG.debug,
)

: 

In [None]:
train_csv

: 

In [None]:
test_csv

: 

## feature

In [None]:
model_name = 'RoBERTa'
training = True

# When training, start from the 'roberta-base' checkpoint name
# (alternative kept from the notebook: pretrained_model=CFG.output_path+'');
# otherwise load from the model's sub-directory of CFG.weights_path.
pretrained_model = 'roberta-base' if training else CFG.weights_path + model_name + '/'

model = BERT.transformers_RoBERTa(
    pretrained_path=pretrained_model,
    device=CFG.device,
)

: 

In [None]:
# Model input per row: "<discourse_type> <sep_token> <discourse_text>".
train_csv['input'] = (
    train_csv['discourse_type']
    + (" " + model.tokenizer.sep_token + " ")
    + train_csv['discourse_text']
)
test_csv['input'] = (
    test_csv['discourse_type']
    + (" " + model.tokenizer.sep_token + " ")
    + test_csv['discourse_text']
)

: 

In [None]:
# Encode the effectiveness classes as integer ids: 0 = Ineffective,
# 1 = Adequate, 2 = Effective. This matches the column order used when the
# predicted probabilities are written to the submission.
#
# Series.map is used instead of DataFrame.replace: turning strings into ints
# via replace relies on pandas' implicit object -> int downcasting, which is
# deprecated, and it also copies the entire frame just to rewrite one column.
# Any value outside the three known classes becomes NaN rather than silently
# staying a string.
train_csv['label'] = train_csv['discourse_effectiveness'].map(
    {"Ineffective": 0, "Adequate": 1, "Effective": 2}
)

: 

### train val split

In [None]:
# Split the unique essay ids (not individual rows) into train and val groups.
essay_ids = dict.fromkeys(('train', 'val'))
essay_ids['train'], essay_ids['val'] = load_data.split(train_csv['essay_id'].unique())

: 

In [None]:
# Tag every row with its split: 'train' when its essay_id is among the
# training essay ids, 'val' otherwise.
train_csv['train_val'] = (
    train_csv['essay_id']
    .isin(essay_ids['train'])
    .map({True: 'train', False: 'val'})
)

: 

In [None]:
# One frame per split; train/val are selected by the 'train_val' tag and the
# test frame is used unchanged.
df = {
    'train': train_csv[train_csv['train_val'] == 'train'],
    'val': train_csv[train_csv['train_val'] == 'val'],
    'test': test_csv,
}

: 

## training

In [None]:
# Convert each pandas frame into a datasets.Dataset, keyed by split name.
dataset = {
    split_name: datasets.Dataset.from_pandas(df[split_name])
    for split_name in ('train', 'val', 'test')
}

: 

In [None]:
df['val'].keys()

: 

In [None]:
dataset

: 

In [None]:
dataset['train'][0]

: 

In [None]:
model.tokenizer(dataset['train'][0]["input"], max_length=512, truncation=True, padding="max_length")

: 

In [None]:
def f(x):
    """Tokenize the 'input' field of a batch with truncation enabled.

    Passed to Dataset.map below with batched=True, so `x` is a batch of rows
    rather than a single row.
    """
    return model.tokenizer(x['input'], truncation=True)


# Tokenize every split with one shared call instead of three copies.
# The columns to drop are derived from each split's own column names
# (everything except 'label'), because Dataset.map raises when asked to remove
# a column that is not present. A hard-coded list is fragile here: for
# example '__index_level_0__' only exists when the source DataFrame had a
# non-trivial index, and the test split has no label columns at all.
dataset = {
    split_name: split.map(
        f,
        batched=True,
        remove_columns=[col for col in split.column_names if col != 'label'],
    )
    for split_name, split in dataset.items()
}

: 

In [None]:
dataset

: 

In [None]:
# Bundle only the train and val splits; the test split is kept separately
# for prediction.
datasetdict = datasets.DatasetDict(
    {split_name: dataset[split_name] for split_name in ("train", "val")}
)

: 

In [None]:
# `trainer` first names the project module and is rebound below to the object
# returned by transformers_get_trainer, which the following cells use.
# Importing the module again under its own alias keeps this cell re-runnable:
# without it, a second execution would look up `transformers_get_trainer` on
# the previously returned object instead of on the module.
from src.training.huggingface_transformers import trainer as trainer_module

trainer = trainer_module.transformers_get_trainer(
    CFG.output_path + model_name + '/',
    model.model,
    model.tokenizer,
    datasetdict,
)

: 

In [None]:
# Run training only when the `training` flag set earlier in the notebook is
# true; otherwise the trainer is left as constructed.
if training:
    trainer.train()
    #trainer.save_model(output_dir=CFG.output_path+model_name+'/')

: 

In [None]:
trainer.state.log_history

: 

In [None]:
# Plot the logged losses against the epoch. Log entries containing 'loss' are
# drawn as the train curve, entries containing 'eval_loss' as the val curve.
# Each subset is filtered once and reused for both axes.
history = trainer.state.log_history
train_logs = [entry for entry in history if 'loss' in entry]
eval_logs = [entry for entry in history if 'eval_loss' in entry]

plt.figure(figsize=(12, 8))

plt.plot(
    [entry['epoch'] for entry in train_logs],
    [entry['loss'] for entry in train_logs],
    label='train',
)

plt.plot(
    [entry['epoch'] for entry in eval_logs],
    [entry['eval_loss'] for entry in eval_logs],
    label='val',
)

plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
# The cell previously ended with a bare plt.plot(), which draws nothing;
# plt.show() is the call that renders the figure.
plt.show()

: 

## prediction

In [None]:
# Raw model outputs for the test split.
preds = trainer.predict(dataset['test']).predictions
# Turn them into per-row class probabilities (softmax over dim 1).
preds = torch.softmax(torch.Tensor(preds), dim=1)
# Back to a NumPy array of Python-float precision for the submission frame.
preds = preds.numpy().astype(float)
preds

: 

In [None]:
# Fill the three probability columns from the prediction matrix, in label-id
# order (0, 1, 2). DataFrame.assign returns a new frame, so the sample
# submission itself is left unmodified.
submission = sample_submission_csv.assign(
    Ineffective=preds[:, 0],
    Adequate=preds[:, 1],
    Effective=preds[:, 2],
)
submission

: 

In [None]:
submission.to_csv('submission.csv',index=False)

: 

In [None]:
#sample_submission_csv.to_csv('submission.csv',index=False)

: 