In [11]:
#!pip install evaluate
#!pip install pytorch
import pandas as pd
import evaluate
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

csv = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

csv['sentiment'] = csv['sentiment'].map({'positive': 1, 'negative': 0})
csv = csv.rename(columns={'sentiment': 'label'})
#csv = csv[:50]

dataset = Dataset.from_dict(csv)

# 70% for training
train_test_split_ratio = 0.7
train_dataset, temp_dataset = dataset.train_test_split(test_size=1-train_test_split_ratio, seed=42).values()

# 15% for validation and testing each
val_test_split_ratio = 0.5
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=val_test_split_ratio, seed=42).values()


model_name = "distilbert/distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    return tokenizer(examples["review"], truncation=True, padding="max_length")

tokenized_dataset = dataset.map(preprocess, batched=True)

# 70% for training
train_test_split_ratio = 0.7
train_dataset, temp_dataset = tokenized_dataset.train_test_split(test_size=1-train_test_split_ratio, seed=42).values()

# 15% for validation and testing each
val_test_split_ratio = 0.5
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=val_test_split_ratio, seed=42).values()


accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels, average="binary")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="binary")["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="binary")["f1"]

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=1,  # Log more frequently
    report_to="none",  # Avoid sending logs to external services
    logging_first_step=True  # Log the first step
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

model.save_pretrained("/kaggle/working/distilbert")
tokenizer.save_pretrained("/kaggle/working/distilbert")


/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2449,0.196159,0.929333,0.907517,0.953324,0.929857
2,0.1364,0.208775,0.936267,0.9284,0.943012,0.935649


('/kaggle/working/distilbert/tokenizer_config.json',
 '/kaggle/working/distilbert/special_tokens_map.json',
 '/kaggle/working/distilbert/vocab.txt',
 '/kaggle/working/distilbert/added_tokens.json',
 '/kaggle/working/distilbert/tokenizer.json')

In [15]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model.push_to_hub("sentiment-model")
tokenizer.push_to_hub("sentiment-model")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/wetsq/sentiment-model/commit/98cf479295a9a32f64b686c79977b6fc43779510', commit_message='Upload tokenizer', commit_description='', oid='98cf479295a9a32f64b686c79977b6fc43779510', pr_url=None, repo_url=RepoUrl('https://huggingface.co/wetsq/sentiment-model', endpoint='https://huggingface.co', repo_type='model', repo_id='wetsq/sentiment-model'), pr_revision=None, pr_num=None)

https://huggingface.co/wetsq/sentiment-model

https://github.com/wetsq/sentiment-analysis

https://www.youtube.com/watch?v=CQhhxNUcHrg