# Fine Tuning Models - Using Custom Data
> Fine-tuning using your own data

In this notebook, we'll use https://huggingface.co/transformers/custom_datasets.html as a guide for our work.  The notebook headers mirror those of Notebook 3.  However, in this notebook, we'll use our own custom data available through our `workshop-files` subdirectory.  Some code has already been provided from Notebook 2.  Other code, we will write together.  See the solutions notebook if you fall behind!

# 0. Preliminaries
You can use the following code to mount your drive and cd into the relevant directory.  Uncomment the git clone command if you don't have the `deep-learning-intensive` repo already cloned.

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the repo files are reachable
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
%cd drive/MyDrive
#!git clone https://github.com/vanderbilt-data-science/deep-learning-intensive.git
%cd deep-learning-intensive

# 1.  Installing Required Packages
Note that this is mostly required if you're on Google Colab.

In [None]:
! pip install transformers
! pip install datasets

# 2. Importing Packages for Use

In [None]:
import glob
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import load_dataset, load_metric, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# 3. Formulate Data into Dataset
## Read in data and convert to dataframe

In [None]:
#get filenames list
filenames = glob.glob('workshop-files/*.txt')

#read file contents
file_contents = []
for file in filenames:
    with open(file, 'r') as f:
        file_contents.append(f.read())

#convert to df
#os.path.basename strips the directory with the current platform's separator.
#Splitting on '\\' only worked on Windows; on Linux/Colab it left
#'workshop-files/<id>.txt' in place and the int() conversion below raised.
tinfo_df = pd.DataFrame({'filename':[os.path.basename(fname) for fname in filenames], 'text':file_contents})
#the numeric part of '<article_id>.txt' is the key shared with the author csv
tinfo_df['article_id'] = tinfo_df['filename'].apply(lambda x: int(x.split('.')[0]))

#read author csv
author_df = pd.read_csv('workshop-files/author_data.csv')

#join author metadata to article text on article_id
full_df = pd.merge(author_df, tinfo_df, on='article_id')
full_df.head()

Unnamed: 0,last_name,first_name,age,years_of_journalism,college major,article_id,filename,text
0,west,enrique,56,12,humanities,551293,551293.txt,"The rain and wind abruptly stopped, but the sk..."
1,braun,damien,43,13,humanities,373587,373587.txt,She patiently waited for his number to be call...
2,osborn,ellie,22,2,engineering,597061,597061.txt,The chair sat in the corner where it had been ...
3,vega,cierra,67,34,science,434648,434648.txt,The computer wouldn't start. She banged on the...
4,cantrell,alden,53,23,science,532970,532970.txt,Do you really listen when you are talking with...


## Add training labels and split column
Note that our data currently doesn't have any training labels, so I'll make some up here and concatenate them to the dataframe.  I'll also add a split column.

In [None]:
#create training labels
#the label names are made up; they are shuffled reproducibly and then aligned
#to full_df by position (reset_index gives them a fresh 0..n-1 index)
label_names = ['elle']*5 + ['people']*10 + ['ebony']*5
shuffled_names = pd.Series(label_names).sample(frac=1, random_state=2345).reset_index(drop=True)

#the Trainer passes the `labels` column to the model, which needs integer class
#ids rather than strings; keep the readable name in its own column
label2id = {name: idx for idx, name in enumerate(dict.fromkeys(label_names))}
full_df['label_name'] = shuffled_names
full_df['labels'] = shuffled_names.map(label2id)

#create split labels (0 = training rows, 1 = validation rows)
splits = [0]*15 + [1]*5
full_df['split'] = pd.Series(splits).sample(frac=1, random_state=2323).reset_index(drop=True)

#view
full_df.head()

Unnamed: 0,last_name,first_name,age,years_of_journalism,college major,article_id,filename,text,labels,split
0,west,enrique,56,12,humanities,551293,551293.txt,"The rain and wind abruptly stopped, but the sk...",people,0
1,braun,damien,43,13,humanities,373587,373587.txt,She patiently waited for his number to be call...,people,0
2,osborn,ellie,22,2,engineering,597061,597061.txt,The chair sat in the corner where it had been ...,elle,0
3,vega,cierra,67,34,science,434648,434648.txt,The computer wouldn't start. She banged on the...,people,0
4,cantrell,alden,53,23,science,532970,532970.txt,Do you really listen when you are talking with...,ebony,1


## Convert into Dataset

In [None]:
# split == 0 marks training rows, split == 1 marks validation rows
train_rows = full_df.query('split==0')
valid_rows = full_df.query('split==1')

train_ds = Dataset.from_pandas(train_rows)
valid_ds = Dataset.from_pandas(valid_rows)

# 4. Load Tokenizer

In [None]:
# Load the tokenizer for the checkpoint and echo its name to confirm what was loaded
tokenizer_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
tokenizer.name_or_path

'bert-base-cased'

# 5. Tokenize Inputs

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook's ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Apply the tokenizer to both splits, one batched map call per split
tokenized_train, tokenized_valid = (
    split_ds.map(tokenize_function, batched=True)
    for split_ds in (train_ds, valid_ds)
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




# 6. Split Data
Already done above!  Whoo!

# 7. Create Model for Task

In [None]:
#size the classification head from every label in the data, not only the
#training split, so a class that appears only in validation still gets an output
num_labels = int(full_df['labels'].nunique())
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_labels)
model.name_or_path

# 8. Setup arguments for training

In [None]:
# Only the output directory is set; every other training option keeps its default
training_args = TrainingArguments(output_dir="test_trainer")
training_args

TrainingArguments(output_dir=test_trainer, overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, evaluation_strategy=IntervalStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_ratio=0.0, warmup_steps=0, logging_dir=runs\May25_18-46-17_PROVL-CX0L7Y2, logging_strategy=IntervalStrategy.STEPS, logging_first_step=False, logging_steps=500, save_strategy=IntervalStrategy.STEPS, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, fp16_full_eval=False, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_na

# 9. Train model (no output)

In [None]:
#trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)

In [None]:
#trainer.train()

# 10. Train model using evaluation metric

In [None]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the ``metric`` object loaded above.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. Returns the result of
    ``metric.compute``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Bundle the model, training options, tokenized splits and metric callback.
# NOTE(review): the TrainingArguments repr printed earlier shows
# evaluation_strategy=IntervalStrategy.NO, so compute_metrics presumably only
# runs when evaluation is requested explicitly -- confirm.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)
trainer.train()