## Sentiment Analysis with Camembert

### Load dataset from tblard/allocine

In [1]:
from datasets import load_dataset

ds = load_dataset("tblard/allocine")

In [30]:
ds = ds[:1000]

KeyError: "Invalid key: slice(None, 1000, None). Please first select a split. For example: `my_dataset_dictionary['train'][slice(None, 1000, None)]`. Available splits: ['test', 'train', 'validation']"

In [14]:
ds["train"]

Dataset({
    features: ['review', 'label'],
    num_rows: 160000
})

In [2]:
import pandas as pd

data_train = ds["train"].select(range(8000))
data_validation = ds["validation"].select(range(2000))
data_test = ds["test"].select(range(2000))

In [3]:
data_train


Dataset({
    features: ['review', 'label'],
    num_rows: 8000
})

### Model selection and experimentaion

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

classes = [0, 1]
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = AutoModelForSequenceClassification.from_pretrained("almanach/camembert-base", num_labels = len(classes))


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test model

In [25]:
data_train["review"][4]

'Premier film de la saga Kozure Okami, "Le Sabre de la vengeance" est un très bon film qui mêle drame et action, et qui, en 40 ans, n\'a pas pris une ride.'

In [26]:
input_string = data_train["review"][4]
inputs = tokenizer(
    input_string,
    return_tensors="pt",  # PyTorch tensors
    truncation=True,      # Truncate if the text is too long
    padding=True,         # Add padding if needed
    max_length=256        # Ensure input length fits the model
)

model.eval()  # Set model to evaluation mode
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).item() 

print(predictions)

0


Data preprocessing

In [27]:
print(type(data_train))

<class 'datasets.arrow_dataset.Dataset'>


In [5]:
def tokenize(batch):
    return tokenizer(
        batch["review"], 
        padding="max_length", 
        truncation=True, 
        max_length=256  # Adjust max_length to suit your model
    )

model.train()
tokenized_train = data_train.map(tokenize, batched=True)
tokenized_validation = data_validation.map(tokenize, batched=True)
tokenized_test = data_test.map(tokenize, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
import wandb
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

wandb.login()
wandb.init(project="Frech-sentiment-analysis", name="fine_tune_camembert-base")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.tensor(labels)
    predictions = torch.argmax(logits, dim=-1)
    accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    precision = precision_score(
        labels.cpu().numpy(), 
        predictions.cpu().numpy(), 
        average="weighted",  # Weighted precision
        zero_division=0      # Handle cases where there are no true instances of a class
    )
    return {"accuracy": accuracy, "precision": precision}

training_arguments = TrainingArguments(
    output_dir='/.results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="wandb",  # Enable wandb tracking
    logging_dir="./logs",  # Log directory
    logging_steps=10,      # Adjust logging frequency
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")

wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvisalkao21[0m ([33mvisalkao21-imt-mines-al-s[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,Precision
1,0.2243,0.166868,0.954,0.955621
2,0.224,0.174095,0.9595,0.9595
3,0.1454,0.194145,0.9615,0.961873


0,1
eval/accuracy,▁▆█
eval/loss,▁▃█
eval/precision,▁▅█
eval/runtime,▆█▁
eval/samples_per_second,▃▁█
eval/steps_per_second,▂▁█
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▁▁▁▁▂▁▂▂█▁▁▁▁▂▁▁▂▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,█████▇▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁

0,1
eval/accuracy,0.9615
eval/loss,0.19415
eval/precision,0.96187
eval/runtime,194.3707
eval/samples_per_second,10.29
eval/steps_per_second,1.286
total_flos,3157332664320000.0
train/epoch,3.0
train/global_step,3000.0
train/grad_norm,0.04632


In [7]:
from huggingface_hub import login
login()  # This will ask for your Hugging Face token. You can get it from https://huggingface.co/settings/tokens


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from huggingface_hub import create_repo

# Create a new repository on Hugging Face
create_repo("visalkao/sentiment-analysis-french", exist_ok=True)  # Set exist_ok=True if the repo already exists


RepoUrl('https://huggingface.co/visalkao/sentiment-analysis-french', endpoint='https://huggingface.co', repo_type='model', repo_id='visalkao/sentiment-analysis-french')

In [None]:
from huggingface_hub import HfApi, Repository

# Define model and tokenizer paths
model_path = "sentiment-analysis-french-folder"
repo_name = "visalkao/sentiment-analysis-french"  # Your Hugging Face model repository name

# Initialize the repository
repo = Repository(local_dir=model_path, clone_from=f"https://huggingface.co//{repo_name}", use_auth_token="")




Cloning https://huggingface.co/visalkao/sentiment-analysis-french into local empty directory.


In [18]:
repo.push_to_hub(commit_message="Upload Models")

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file model.safetensors:   0%|          | 1.00/422M [00:00<?, ?B/s]

To https://huggingface.co/visalkao/sentiment-analysis-french
   40afd2c..c58bd05  main -> main



'https://huggingface.co/visalkao/sentiment-analysis-french/commit/c58bd05fc31c852ded1b066e6e5366a418d07d24'