In [1]:
!pip install transformers datasets boto3 sagemaker scikit-learn --upgrade

Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp310-cp310-win_amd64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp310-cp310-win_amd64.whl (11.0 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.0
    Uninstalling scikit-learn-1.3.0:
      Successfully uninstalled scikit-learn-1.3.0
Successfully installed scikit-learn-1.5.2


In [1]:
import os
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [13]:
# Load dataset
df = pd.read_csv( "C:/Users/hp/Desktop/Project/minor_project/DATA/training.1600000.processed.noemoticon.csv",
                 encoding='ISO-8859-1', header=None)
df = df[[0, 5]]  # Keep only label and text columns
df.columns = ['label', 'text']

# Map labels to 0 (negative), 1 (neutral), and 2 (positive)
label_mapping = {0: 0, 2: 1, 4: 2}
df['label'] = df['label'].map(label_mapping)

# Sample a subset for faster training
df_subset = df.sample(n=100000, random_state=42)
 

In [14]:
df

Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,2,Just woke up. Having no school is the best fee...
1599996,2,TheWDB.com - Very cool to hear old Walt interv...
1599997,2,Are you ready for your MoJo Makeover? Ask me f...
1599998,2,Happy 38th Birthday to my boo of alll time!!! ...


In [15]:
train_df, test_df = train_test_split(df_subset, test_size=0.2, random_state=42)


In [16]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


In [17]:
train_dataset

Dataset({
    features: ['label', 'text', '__index_level_0__'],
    num_rows: 80000
})

In [18]:
test_dataset

Dataset({
    features: ['label', 'text', '__index_level_0__'],
    num_rows: 20000
})

In [19]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

# Apply tokenization
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [20]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
 training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    save_total_limit=1,
    save_strategy="epoch"
)


In [20]:
!pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.1.1


In [24]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


In [None]:
os.environ["WANDB_DISABLED"] = "true"
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
model.save_pretrained("distilbert_sentiment_model")
tokenizer.save_pretrained("distilbert_sentiment_model")
