In [None]:
!pip install transformers datasets accelerate huggingface_hub

In [2]:
from transformers import DataCollatorWithPadding, AutoTokenizer, LlamaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import torch
from huggingface_hub import login
from sklearn.model_selection import train_test_split

In [3]:
# Show every column and never truncate cell contents when rendering frames.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [4]:
# Authenticate with the Hugging Face Hub (renders the login widget shown in
# this cell's output).
# NOTE(review): presumably needed because the model repository is gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Checkpoint shared by the tokenizer and the classification model.
MODEL = 'meta-llama/Llama-2-7b-hf'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Reuse the EOS token as the padding token so batches can be padded.
# NOTE(review): presumably the checkpoint's tokenizer defines no pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Define root_path and sampled_data_path

In [None]:
# Load sampled_data_df
# NOTE(review): no assignment to sampled_data_df is visible in this notebook;
# the loading statement must be restored here, otherwise the preview below
# raises NameError on a fresh kernel.
sampled_data_df.head()

In [None]:
# Keep only the rows whose 'Conceptual' value is exactly 0.0 or 1.0.
conceptual_values = sampled_data_df['Conceptual']
has_binary_label = (conceptual_values == 0.0) | (conceptual_values == 1.0)
filtered_df = sampled_data_df[has_binary_label]
filtered_df.shape

In [9]:
# Hold out 30% of the filtered rows as the test split; random_state=42 makes
# the split repeatable across runs.
train_df, test_df = train_test_split(filtered_df, test_size=0.3, random_state=42)

In [10]:
# Save train_df and test_df

In [None]:
def _extract_text_label(df):
    """Return a new two-column frame ('text', 'label') built from ``df``.

    Selects 'comment_text' and 'Conceptual', casts them to ``str`` and ``int``
    respectively, and renames them to 'text' and 'label'.

    Every step returns a new DataFrame, so nothing is written into a slice of
    ``df`` (no SettingWithCopyWarning) and re-running the cell is idempotent.
    """
    return (
        df[['comment_text', 'Conceptual']]
        .astype({'comment_text': str, 'Conceptual': int})
        .rename(columns={'comment_text': 'text', 'Conceptual': 'label'})
    )


# Apply the same extraction to both splits instead of repeating it by hand.
train_df_extracted = _extract_text_label(train_df)
test_df_extracted = _extract_text_label(test_df)

In [None]:
# Preview the first rows of the extracted training frame.
train_df_extracted.head()

In [13]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: Mapping with a "text" entry holding the strings to tokenize
            (the batch handed over by ``Dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per example; longer inputs
            are truncated.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.

    No padding is applied here. Padding inside a batched ``map`` would pad
    every example to the longest sequence of its map batch and store those
    pad tokens in the dataset; the ``DataCollatorWithPadding`` given to the
    Trainer pads each training batch to its own longest sequence instead.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [14]:
# Create datasets from DataFrames
# preserve_index=False keeps the shuffled pandas index (left over from
# train_test_split) from being stored as an extra dataset column; only
# 'text' and 'label' are carried over.
train_ds = Dataset.from_pandas(train_df_extracted, preserve_index=False)
test_ds = Dataset.from_pandas(test_df_extracted, preserve_index=False)

In [15]:
# Tokenize both splits in batched mode with the shared preprocessing function
# (train first, then test).
train_tokenized_ds, test_tokenized_ds = (
    split.map(preprocess_function, batched=True)
    for split in (train_ds, test_ds)
)

Map:   0%|          | 0/699 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [16]:
# Collator that pads the examples of each batch with the tokenizer's pad
# token, using the "longest" padding strategy.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [None]:
# Load the model
# Llama checkpoint with a 2-label sequence-classification head; weights are
# requested in bfloat16 and device placement is delegated to device_map='auto'.
model = LlamaForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,
    torch_dtype=torch.bfloat16,
    device_map='auto'
)
# Use the EOS id as the pad id in the model config, mirroring the tokenizer
# setup (tokenizer.pad_token = tokenizer.eos_token).
model.config.pad_token_id = model.config.eos_token_id

In [18]:
# Define training arguments
# Warmup is given as a fraction of the run rather than a fixed step count.
# The tokenization output reports 699 training examples, so batches of 16 give
# 44 optimizer steps per epoch and 440 over 10 epochs (single process, no
# gradient accumulation). A fixed warmup_steps=500 would exceed that total and
# the learning rate would still be ramping up when training ends.
training_args = TrainingArguments(
    output_dir='output_dir',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_ratio=0.1,  # warm up over the first 10% of the training steps
    weight_decay=0.01,
    logging_dir='logging_dir',
    logging_steps=10,
    evaluation_strategy="epoch"  # Evaluate at the end of each epoch
)

In [None]:
# Define Trainer
# Wires together the model, the training arguments, the tokenized train/test
# splits, the tokenizer and the padding collator.
# NOTE(review): no compute_metrics is passed, so per-epoch evaluation
# presumably reports only the loss — confirm whether accuracy/F1 is wanted.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_ds,
    eval_dataset=test_tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Train the model
# Runs fine-tuning with the settings held by the trainer defined earlier.
trainer.train()

In [None]:
# After training, make predictions
# Run the trained model over the tokenized test split and keep the raw
# prediction output; later cells index it as logits[:, 1], i.e. as a 2-D array.
pred_output = trainer.predict(test_tokenized_ds)
logits = pred_output.predictions

In [None]:
# Sigmoid function to calculate probabilities
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of ``x``, element-wise.

    Args:
        x: Scalar or NumPy array of real values.

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``.

    Computed as exp(-log(1 + exp(-x))) through ``np.logaddexp`` so that large
    negative inputs do not overflow inside ``np.exp`` (which would emit a
    RuntimeWarning); the result is the same up to floating-point rounding.
    """
    return np.exp(-np.logaddexp(0, -x))

In [None]:
# The classifier emits two logits per example and is trained as a 2-class
# softmax problem (num_labels=2 with integer labels), so
# P(label == 1) = softmax(logits)[:, 1] = sigmoid(logit_1 - logit_0).
# Applying the sigmoid to logit_1 alone ignores logit_0 and does not yield
# the model's class-1 probability.
probs = sigmoid(logits[:, 1] - logits[:, 0])

In [None]:
# Attach the predicted probabilities as a new 'generated' column. assign()
# returns a new frame instead of writing into test_df, which came out of
# train_test_split and may still be flagged as a slice of filtered_df
# (SettingWithCopyWarning on direct column assignment).
test_df = test_df.assign(generated=probs)
# Save test_df