# 60-modeling-huggingface
> Training models

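In this notebook, we train models based on the feature set of interest to predict the outcomes.  To use this notebook out-of-the-box, make sure that your data is stored as a csv file with columns named `text`, `labels`, and `split`.  The cells below treat rows with `split == 0` as the training set and rows with `split == 1` as the validation set, and they expect `labels` to hold integer class indices (`0`, `1`, ...).  You can also arrange this dataframe in the notebook prior to modeling if desired.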

#### Common helpful packages

In [None]:
#Data analysis and processing
import pandas as pd
import numpy as np

#machine learning
from sklearn.model_selection import train_test_split

#transformers
from transformers import pipeline
from datasets import load_dataset, load_metric, Dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

#plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Constants/globals
#Path of the cleaned csv file that is read into `full_df` below.
#It is left empty here on purpose: set it to your own file before running the notebook.
cleaned_data_filename = ''

# Load data

In [None]:
#read the cleaned data; the later cells use its `text`, `labels`, and `split` columns
full_df = pd.read_csv(cleaned_data_filename)
#quick sanity check of the size of the data: (rows, columns)
print(full_df.shape)
#preview the first few rows
full_df.head()

# Load tokenizer

In [None]:
#load the tokenizer of the `bert-base-cased` checkpoint (the same checkpoint the model is created from later on)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
#display which checkpoint the tokenizer came from
tokenizer.name_or_path

# Tokenize inputs and convert to PyTorch dataset

In [None]:
#create tokenized representations
#pull the raw texts of each split out of the dataframe first (split 0 -> train, split 1 -> validation)
train_texts = full_df.query('split==0')['text'].tolist()
val_texts = full_df.query('split==1')['text'].tolist()

#tokenize each split on its own; `padding='longest'` pads to the longest
#sequence within the list that is passed in
train_encodings = tokenizer(train_texts, truncation=True, padding='longest')
val_encodings = tokenizer(val_texts, truncation=True, padding='longest')

In [None]:
#helpers for class size and class names
#number of distinct labels that occur in the training split
no_classes = len(full_df.query('split==0')['labels'].unique())

#`label_dict` maps each integer class index to a readable class name and is
#also handed to the model as `id2label`.  If no mapping has been defined
#before this cell, fall back to using the index itself as the name so the
#notebook runs out-of-the-box; define `label_dict` earlier to get real names.
try:
    label_dict
except NameError:
    label_dict = {class_ind: str(class_ind) for class_ind in range(no_classes)}

#class names ordered by class index
train_classes = [label_dict[class_ind] for class_ind in range(no_classes)]

In [None]:
#Create custom Datasets Class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence of
    values, and `labels` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        #the number of examples is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        #collect the idx-th entry of every encoding field as a tensor
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        #attach the matching label under the `labels` key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

#Create datasets from encodings
#labels of each split, in the same row order as the texts that were tokenized
train_labels = full_df.query('split==0')['labels'].tolist()
val_labels = full_df.query('split==1')['labels'].tolist()

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Create model for task

In [None]:
#load the pretrained checkpoint with a sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=no_classes,  #one output per class found in the training split
    id2label=label_dict,    #class index -> class name mapping
)
#display which checkpoint the model came from
model.name_or_path

# Train model

In [None]:
#set parameters around training
training_args = TrainingArguments(
    output_dir="test_trainer",  #directory passed to the trainer for its outputs
    logging_strategy='epoch',   #log once per epoch
)

Below, we use _accuracy_ as the metric for our model to assess performance during evaluation.

In [None]:
#accuracy metric object used by `compute_metrics`
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute the metric for one evaluation pass.

    `eval_pred` is unpacked into the model outputs and the reference labels.
    The predicted class of each example is the arg-max over the last axis of
    the outputs, and the result of `metric.compute` is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Now, let's train the model.

In [None]:
#bundle the model, data, and metric function into a trainer
trainer = Trainer(
    #what to train and how
    model=model,
    args=training_args,
    #data
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    #extras: the tokenizer that produced the encodings, and the metric function
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
#run the training loop
trainer.train()

# Prediction and evaluation
## Evaluate

In [None]:
#Metrics on the training split.  These are optimistic because the model has
#seen this data; they are mainly useful for comparison with the validation
#numbers to spot under-/over-fitting.
train_metrics = trainer.evaluate(train_dataset, metric_key_prefix='train')
#Metrics on the held-out validation split, which the model was not trained on.
val_metrics = trainer.evaluate(val_dataset, metric_key_prefix='eval')
#show both sets of metrics side by side (keys are prefixed `train_` / `eval_`)
{**train_metrics, **val_metrics}

## Predict

In [None]:
#run the model over the training split and display the prediction output
trainer.predict(test_dataset=train_dataset)

# Model saving
Note that once the model has been saved to the `bert-tuned-model` directory below, it can be loaded back with the `pipeline` function and used for inference.

In [None]:
#write the fine-tuned model to the `bert-tuned-model` directory
trainer.save_model(output_dir='bert-tuned-model')