# **Load and Parse data**

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bittlingmayer/amazonreviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'amazonreviews' dataset.
Path to dataset files: /kaggle/input/amazonreviews


In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
file_path = os.listdir(path)

In [4]:
file_path

['test.ft.txt.bz2', 'train.ft.txt.bz2']

In [5]:
import bz2
import pandas as pd

def load_amazon_reviews(bz2_path, n=50000):
    texts, labels = [], []
    with bz2.open(bz2_path, "rt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            parts = line.strip().split(" ", 1)
            if len(parts) == 2:
                label = 1 if parts[0] == "__label__2" else 0
                text = parts[1]
                labels.append(label)
                texts.append(text)
            if i >= n:  # limit for faster training
                break
    return pd.DataFrame({"text": texts, "label": labels})

df = load_amazon_reviews(os.path.join(path, "train.ft.txt.bz2"), n=50000)


In [6]:
df.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [7]:
import torch
from torch.utils.data import DataLoader, random_split, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
from datasets import load_dataset


# **EDA**

In [8]:
df.shape

(50001, 2)

In [9]:
df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [10]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,25507
0,24494


# **Train Test split**

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42
)


In [13]:
#df['text']
#df['text'].to_list()

# **Tokenize using Transformer tokenizer - DistilBERT**

In [14]:
from transformers import DistilBertTokenizerFast

In [15]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
train_encodings = tokenizer(train_texts, truncation = True, padding = True, max_length = 128)
test_encodings = tokenizer(test_texts, truncation = True, padding = True, max_length = 128)

# **Create custom Dataset with pytorch**

In [17]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



In [18]:
train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


# **Load and fine tune the model**

In [19]:
from transformers import DistilBertForSequenceClassification

In [20]:

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Move model to device**

In [21]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
# model

# **Set up optimizer**

In [22]:
from torch.optim import AdamW

In [23]:
optimizer = AdamW(model.parameters(), lr=5e-5)

# **Define training loop and evaluation**

In [24]:
# Install required libraries
!pip install transformers datasets tqdm

import torch
from torch.utils.data import DataLoader, random_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
from datasets import load_dataset

# 1. Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2. Load dataset (example)

dataset = load_dataset("amazon_polarity")  # binary sentiment dataset
dataset = dataset['train'].shuffle(seed=42).select(range(5000))  # small subset for demo

# 3. Tokenizer

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch['content'], padding=True, truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)

# Hugging Face models require `labels` key for computing loss

dataset = dataset.map(lambda x: {'labels': x['label']}, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# 4. Train/Validation split

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# 5. Load model

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model.to(device)

# 6. Optimizer

optimizer = AdamW(model.parameters(), lr=5e-5)

# 7. Training and evaluation functions

def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        # Move all tensors to device
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Average training loss: {avg_loss:.4f}")
    return avg_loss

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=-1)
            labels = batch['labels']
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    avg_loss = total_loss / len(dataloader)
    print(f"Validation loss: {avg_loss:.4f}, Validation accuracy: {accuracy:.4f}")
    return accuracy, avg_loss

# -------------------------
# 8. Full training loop
# -------------------------
epochs = 3  # adjust as needed
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer, device)
    evaluate(model, val_loader, device)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 250/250 [00:41<00:00,  5.99it/s]


Average training loss: 0.3470


Evaluating: 100%|██████████| 63/63 [00:03<00:00, 18.59it/s]


Validation loss: 0.2939, Validation accuracy: 0.8730

Epoch 2/3


Training: 100%|██████████| 250/250 [00:42<00:00,  5.85it/s]


Average training loss: 0.1610


Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.54it/s]


Validation loss: 0.4715, Validation accuracy: 0.8650

Epoch 3/3


Training: 100%|██████████| 250/250 [00:43<00:00,  5.70it/s]


Average training loss: 0.0883


Evaluating: 100%|██████████| 63/63 [00:03<00:00, 17.22it/s]

Validation loss: 0.4544, Validation accuracy: 0.8970





In [25]:
# from tqdm import tqdm


# **Save the fine-tuned model and tokenizer**\\

In [26]:
import os


folder_path = '/content/sentiment_model'
os.makedirs(folder_path, exist_ok=True)

# Save your model
model.save_pretrained(folder_path)
tokenizer.save_pretrained(folder_path)


('/content/sentiment_model/tokenizer_config.json',
 '/content/sentiment_model/special_tokens_map.json',
 '/content/sentiment_model/vocab.txt',
 '/content/sentiment_model/added_tokens.json',
 '/content/sentiment_model/tokenizer.json')

In [35]:
!ls /content/sentiment_model  # This should list all the files


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


In [27]:
import shutil

# Create a zip file of the model folder
shutil.make_archive('/content/sentiment_model', 'zip', '/content/sentiment_model')


'/content/sentiment_model.zip'

In [34]:
from google.colab import files

# Download the zip file to your local system
files.download('/content/sentiment_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
'''
import os

folder_path = "D:\\sentiment_analysis\\sentiment_analysis_amazon_reviews\\sentiment_model\\"

os.makedirs(folder_path, exist_ok=True)

model.save_pretrained(folder_path)
tokenizer.save_pretrained(folder_path)

print(f"Model and tokenizer saved successfully to {folder_path}")

'''

'\nimport os\n\nfolder_path = "D:\\sentiment_analysis\\sentiment_analysis_amazon_reviews\\sentiment_model\\"\n\nos.makedirs(folder_path, exist_ok=True)\n\nmodel.save_pretrained(folder_path)\ntokenizer.save_pretrained(folder_path)\n\nprint(f"Model and tokenizer saved successfully to {folder_path}")\n\n'

# **Test prediction in Jupyter**  

In [30]:
'''
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch

folder_path = "D:\\sentiment_analysis\\sentiment_analysis_amazon_reviews\\sentiment_model"


# Load saved model & tokenizer
model = DistilBertForSequenceClassification.from_pretrained(folder_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(folder_path)

model.eval()  # set to eval mode

# Example text
text = "This product is awesome!"

# Tokenize
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits).item()

print("Predicted class:", predicted_class)
print(folder_path)

'''

'\nfrom transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast\nimport torch\n\nfolder_path = "D:\\sentiment_analysis\\sentiment_analysis_amazon_reviews\\sentiment_model"\n\n\n# Load saved model & tokenizer\nmodel = DistilBertForSequenceClassification.from_pretrained(folder_path)\ntokenizer = DistilBertTokenizerFast.from_pretrained(folder_path)\n\nmodel.eval()  # set to eval mode\n\n# Example text\ntext = "This product is awesome!"\n\n# Tokenize\ninputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)\n\n# Predict\nwith torch.no_grad():\n    outputs = model(**inputs)\n    logits = outputs.logits\n    predicted_class = torch.argmax(logits).item()\n\nprint("Predicted class:", predicted_class)\nprint(folder_path)\n\n'

# **Serve with FastAPI**.. to be written on cmd/bash

In [31]:
'''

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch

app = FastAPI()

# Load model & tokenizer once at startup
model = DistilBertForSequenceClassification.from_pretrained("./sentiment_model")
tokenizer = DistilBertTokenizerFast.from_pretrained("./sentiment_model")
model.eval()

class TextIn(BaseModel):
    text: str

@app.post("/predict")
def predict_sentiment(data: TextIn):
    inputs = tokenizer(data.text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits).item()
    return {"prediction": pred}
'''

'\n\nfrom fastapi import FastAPI\nfrom pydantic import BaseModel\nfrom transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast\nimport torch\n\napp = FastAPI()\n\n# Load model & tokenizer once at startup\nmodel = DistilBertForSequenceClassification.from_pretrained("./sentiment_model")\ntokenizer = DistilBertTokenizerFast.from_pretrained("./sentiment_model")\nmodel.eval()\n\nclass TextIn(BaseModel):\n    text: str\n\n@app.post("/predict")\ndef predict_sentiment(data: TextIn):\n    inputs = tokenizer(data.text, return_tensors="pt", truncation=True, padding=True)\n    with torch.no_grad():\n        outputs = model(**inputs)\n        logits = outputs.logits\n        pred = torch.argmax(logits).item()\n    return {"prediction": pred}\n'

# **Run FastAPI server**

In [32]:
# uvicorn app:app --reload

# **Test the API**

In [33]:
# curl -X POST "http://127.0.0.1:8000/predict" -H "Content-Type: application/json" -d '{"text": "I love this product"}'