In [5]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from huggingface_hub import notebook_login
from transformers import AutoModelForSequenceClassification,AutoTokenizer, AutoConfig
from transformers import TrainingArguments,Trainer
from datasets import Dataset,DatasetDict
import numpy as np 
import pandas as pd 
import kaggle, zipfile
import evaluate

In [None]:
# Directory holding the competition CSV files.
path = 'nlp-getting-started'

# Load the training split and keep only the columns used downstream.
# A non-inplace drop keeps this cell idempotent when it is re-run.
df = pd.read_csv(path + '/train.csv').drop(columns=['keyword', 'location'])

# The model in this notebook is created with num_labels=1, which transformers
# treats as a regression head trained with an MSE loss. That loss requires
# floating-point labels, so the 0/1 target is cast to float here; integer
# labels reach the loss as a long tensor and fail with a dtype mismatch.
df['target'] = df['target'].astype(float)
ds = Dataset.from_pandas(df)

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)
def tok_func(x):
    """Tokenize the "text" entry of `x` with the module-level tokenizer `tokz`.

    In this notebook it is passed to `Dataset.map(..., batched=True)`.
    Returns whatever `tokz` returns for that input.
    """
    texts = x["text"]
    return tokz(texts)
# Tokenize the training data, then rename the 'target' column to 'labels'.
tok_ds = ds.map(tok_func, batched=True).rename_columns({'target': 'labels'})

# Seeded split: 25% of the rows go to dds['test'], the rest to dds['train'].
dds = tok_ds.train_test_split(test_size=0.25, seed=42)

# The competition test file gets the same column pruning and tokenization.
eval_df = pd.read_csv(path + '/test.csv').drop(columns=['keyword', 'location'])
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

In [None]:
# Label mappings (one entry each), name -> id and id -> name.
label2id = {"Chance of being a disaster": 0}
id2label = {"0": "Chance of being a disaster"}

# Checkpoint to re-export; may be the same model that is already on the hub.
model_ckpt = 'models/vnsrz/outputs'

# Build the config from the checkpoint with the label mappings applied,
# then load the model under that config.
config = AutoConfig.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    config=config,
)

# Write the relabelled model out.
model.save_pretrained("model/twt")

In [None]:
def compute_metrics(eval_preds, threshold=0.5):
    """Compute F1 for a single-output model evaluated against 0/1 labels.

    Args:
        eval_preds: pair ``(logits, labels)``. The model in this notebook is
            built with ``num_labels=1``, so ``logits`` is assumed to hold one
            score per example (e.g. shape ``(N, 1)``) -- TODO confirm if the
            head is ever changed to more than one output.
        threshold: scores greater than or equal to this value are predicted
            as class 1, everything else as class 0.

    Returns:
        The result of the ``f1`` metric's ``compute`` call.
    """
    logits, labels = eval_preds

    # F1 is a classification metric: it needs one integer class id per
    # example. Clipping the raw scores (the previous behaviour) left them
    # continuous and two-dimensional, so flatten and threshold instead.
    scores = np.asarray(logits).reshape(-1)
    predictions = (scores >= threshold).astype(int)

    # Labels may arrive as floats when the target is stored as float for the
    # regression loss; the metric needs integers.
    references = np.asarray(labels).reshape(-1).astype(int)

    metric = evaluate.load('f1')
    return metric.compute(predictions=predictions, references=references)

# Hyperparameters: batch size, number of epochs, learning rate.
bs = 64
epochs = 4
lr = 8e-5

# Training configuration; run artifacts are written under 'outputs'.
args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,  # evaluation batches are twice the training size
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    report_to='none',
)

# Load the pretrained checkpoint with a single-output head (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Wire the model, arguments, data splits, tokenizer and metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the trailing semicolon suppresses the cell's output display.
trainer.train();