In [3]:
%%capture
!pip install transformers
!pip install -U sentence-transformers
!pip install datasets
!pip install evaluate
!pip install googletrans==3.1.0a0

from google.colab import drive
drive.mount('/content/drive')

Load the dataset and remove string noise

In [None]:
import pandas as pd

recipes = pd.read_csv("/content/drive/MyDrive/Semantics In Intelligent Information Access/GPT trial/dataset_covered_recipes.csv", 
                          delimiter=';', 
                          quotechar='"', 
                          header=0, 
                          index_col = 0)

recipes = recipes.rename(columns={"SUSTAINABILITY": "sustainability"})
recipes['sustainability'] = recipes['sustainability'].str.lower()

# remove empty rows
# recipes = recipes[recipes["sustainability"].astype(str).str.len() != 0]
recipes = recipes.dropna(subset=['sustainability'])
display(recipes.head())

valori_unici = recipes['sustainability'].unique()
print('\n',valori_unici)

Unnamed: 0_level_0,title,url,ingredients,sustainability
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39637,Sally's Coleslaw Dressing Recipe,http://cookeatshare.com/recipes/sally-s-colesl...,cabagge,high
28634,Canning Pumpkin,http://www.food.com/recipe/canning-pumpkin-395684,pumpkin,high
2427,Pumpkin Puree,http://cooking.nytimes.com/recipes/4333,pumpkin,high
19530,Grilled ORE-IDA Roasted Original Potatoes,http://www.kraftrecipes.com/recipes/grilled-or...,potato,high
7258,Baked Potato Toppings Recipe,http://cookeatshare.com/recipes/baked-potato-t...,potato,high



 ['high' 'medium' 'low']


In [None]:
import numpy as np

rec_perc = recipes.loc[:, ["title","ingredients", "sustainability"]].copy()
# rec_perc['text'] = rec_perc['title'] + ' ' + rec_perc['ingredients']
rec_perc['text'] = rec_perc['ingredients']
rec_perc.drop(['title', 'ingredients'], axis=1, inplace=True)
rec_perc.rename(columns={'sustainability': 'label'}, inplace=True)
rec_ft = rec_perc
display(rec_perc)

Unnamed: 0_level_0,label,text
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1
39637,high,cabagge
28634,high,pumpkin
2427,high,pumpkin
19530,high,potato
7258,high,potato
...,...,...
38574,low,"olive oil,sesame oil,garlic powder,plum,soy sa..."
31046,low,"dark chocolate,olive oil,garlic powder,lime ju..."
39216,low,"red pepper or cayenne spices,fresh parsley,cor..."
39695,low,"grapes,olive oil,cucumber,feta,dried basil,bee..."


# Model are able to learn (and classify)

In [None]:
from transformers import pipeline, set_seed
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

import numpy as np
from sklearn.model_selection import train_test_split

# create an array with the indices of the examples in the DataFrame
indices = np.arange(len(rec_ft))

train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

# create a boolean mask to select the examples in train and test
train_mask = np.isin(indices, train_indices)
test_mask = np.isin(indices, test_indices)

# create the train and test DataFrames using the masks
train_data = rec_ft[train_mask]
test_data = rec_ft[test_mask]

# group the train and test data by label
train_grouped = train_data.groupby('label')
test_grouped = test_data.groupby('label')

# sample the examples for each group (i.e., class) without replacement using the indices
train_examples = []
test_examples = []
for name, group in train_grouped:
    train_examples.append(group.sample(n=480, random_state=42))
for name, group in test_grouped:
    test_examples.append(group.sample(n=120, random_state=42))

# concatenate the examples into train and test DataFrames
train_data = pd.concat(train_examples)
test_data = pd.concat(test_examples)

train_dataset = Dataset.from_pandas(train_data)#, split='train')
test_dataset = Dataset.from_pandas(test_data)

dataset_1 = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# tokenize dataset
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", max_length=100, truncation=True)

tokenized_datasets = dataset_1.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

#train_data = train_data.reset_index(drop=True)
#test_data = test_data.reset_index(drop=True)
#display(train_data)
#display(test_data)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {'accuracy': acc,}
  
# train
from transformers import RobertaForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate
import psutil
import logging

# setup logging to print accuracy during training
logging.basicConfig(level=logging.INFO)

training_args = TrainingArguments(output_dir="model_weights",  num_train_epochs=5)
metric = evaluate.load("accuracy")

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Step,Training Loss
500,0.4115


Saving model checkpoint to model_weights/checkpoint-500
Configuration saved in model_weights/checkpoint-500/config.json
Model weights saved in model_weights/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=900, training_loss=0.33028558519151474, metrics={'train_runtime': 171.615, 'train_samples_per_second': 41.954, 'train_steps_per_second': 5.244, 'total_flos': 370003243680000.0, 'train_loss': 0.33028558519151474, 'epoch': 5.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 360
  Batch size = 8


{'eval_loss': 0.20104625821113586,
 'eval_accuracy': 0.95,
 'eval_runtime': 2.3731,
 'eval_samples_per_second': 151.703,
 'eval_steps_per_second': 18.963,
 'epoch': 5.0}