In [37]:
from typing import Any

import numpy as np
import pandas as pd
import  torch
import re

from pytorch_lightning.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
from torch.special import logit
from torch.utils.data import Dataset,DataLoader
import pytorch_lightning as pl
from torch import optim
from torch.xpu import device
from torchmetrics.classification import Accuracy
import torch.nn as nn
from torchmetrics.functional import accuracy


In [38]:
device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')

In [39]:
import warnings
warnings.filterwarnings("ignore")

In [40]:
train_path =r"E:\capstone_project_1\data\train (2).csv"
test_path = r"E:\capstone_project_1\data\test (2).csv"

In [41]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [42]:
def normalize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]','',text)
    text = " ".join(text.split())
    return text


In [43]:
text = test_df.iloc[4]['review']
normalize(text)

'certainly not the best sushi in new york however it is always fresh and the place is very clean sterile'

In [44]:
def tokenize(text):
    tokens = text.split()
    return tokens


In [45]:
tokeni_tex = tokenize(text)

In [46]:
token_2_id = {
        '<PAD>':0,
        '<UNK>':1
    }
corpus = train_df["review"].tolist()
idx = 2
for text in corpus:
    text = normalize(text)
    token = tokenize(text)
    for token in token:
        if token not in token_2_id:
            token_2_id[token] = idx
            idx += 1

In [47]:
input = [token_2_id.get(token,token_2_id['<UNK>']) for token in tokeni_tex]
input

[1,
 52,
 3,
 151,
 249,
 72,
 1,
 1,
 1,
 962,
 1,
 47,
 30,
 393,
 644,
 1,
 63,
 3,
 223,
 30,
 35,
 744,
 1,
 1,
 1]

In [48]:
def build_vocab(text):
    token_2_id = {
        '<PAD>':0,
        '<UNK>':1
    }
    corpus = train_df["review"].tolist()
    idx = 2
    for text in corpus:
        text = normalize(text)
        token = tokenize(text)
        for token in token:
            if token not in token_2_id:
                token_2_id[token] = idx
                idx += 1
    return token_2_id

In [49]:
train_df.columns

Index(['review', 'aspect', 'sentiment'], dtype='str')

In [50]:
train_df.columns

Index(['review', 'aspect', 'sentiment'], dtype='str')

In [51]:
label_map = {
    "negative":0,
    "positive":1,
    "neutral":2,


}

In [52]:
class ABCDataset(Dataset):
    def __init__(self,df,token_2_id,label_map):
        self.df = df
        self.token_2_id = token_2_id
        self.label_mal = label_map
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        example = self.df.iloc[idx]
        text = example["review"]
        aspect = example["aspect"]
        sentiment =example["sentiment"]
        text_aspact_pair = text +  " " + aspect
        normalize_text = normalize(text_aspact_pair)
        tokens = tokenize(normalize_text)
        input_ids = [self.token_2_id.get(token,self.token_2_id["<UNK>"]) for token in tokens]
        label_id = self.label_mal[sentiment]
        return {
            "input_ids":input_ids,
            "label_ids":label_id
        }


In [62]:
class ABSADataModule(pl.LightningDataModule):
    def __init__(self,train_path,test_path,batch_size):
        super().__init__()
        self.train_path = train_path
        self.test_path = test_path
        self.batch_size = batch_size
    def setup(self,stage=None):
        train_df = pd.read_csv(self.train_path)

        test_df = pd.read_csv(self.test_path)
        self.token_2_id = build_vocab(train_df['review'])
        self.label_map = label_map
        self.train_set = ABCDataset(train_df,self.token_2_id,self.label_map)
        self.test_set = ABCDataset(test_df,self.token_2_id, self.label_map)
    def train_dataloader(self):
        return DataLoader(
            self.train_set,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn)
    def test_dataloader(self):
        return DataLoader(
            self.test_set,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self.collate_fn)

    def collate_fn(self,batch):
        batch_input_ids = [item['input_ids']for item in batch]
        batch_labels = [item['label_ids']for item in batch]
        max_len = max(len(input_ids) for input_ids in batch_input_ids)
        pad_token_id = self.token_2_id["<PAD>"]

        batch_padded_input_ids = [
            input_ids + [pad_token_id] * (max_len-len(input_ids))for input_ids in batch_input_ids
        ]
        return {
            "batch_input_ids":torch.tensor(batch_padded_input_ids,dtype=torch.long),
            "batch_label":torch.tensor(batch_labels,dtype=torch.long)
        }

In [63]:
data_module = ABSADataModule(
    train_path = train_path,
    test_path = test_path,

    batch_size=32
)
data_module.setup()

In [64]:
class ABSA(nn.Module):
    def __init__(self,vocab_size,num_labels = 3):
        super(ABSA,self).__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size,embedding_dim=256)
        self.lstm_layer = nn.LSTM(input_size=256,hidden_size=512,batch_first=True)
        self.fc_layer = nn.Linear(in_features=512,out_features=num_labels)
    def forward(self,x):
        embeddings = self.embedding_layer(x)
        lstm_out,_ = self.lstm_layer(embeddings)
        logits = self.fc_layer(lstm_out[:,-1,:])
        return logits

In [76]:
class ABSAModel(pl.LightningModule):
    def __init__(self,vocab_size,num_labels=3):
        super().__init__()
        self.model = ABSA(vocab_size,num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        self.num_labels = num_labels
        self.save_hyperparameters()

    def forward(self,x):
        return self.model(x)
    def training_step(self,batch,batch_idx):
        input_ids = batch["batch_input_ids"].to(self.device)
        labels = batch['batch_label'].to(self.device)
        logits = self.forward(input_ids)
        loss = self.loss_fn(logits,labels)
        self.log("train_loss:",loss,prog_bar=True)
        acc = self.compute_metrics(logits,labels)
        self.log("train_acc:",acc,prog_bar=True)
        return loss

    def test_step(self,batch,batch_idx):
        input_ids = batch["batch_input_ids"].to(self.device)
        labels = batch['batch_label'].to(self.device)
        logits = self.forward(input_ids)
        loss = self.loss_fn(logits,labels)
        self.log("test_loss:",loss,prog_bar=True)
        acc = self.compute_metrics(logits,labels)
        self.log("test_acc:",acc,prog_bar=True)
        return loss
    def configure_optimizers(self):
        return optim.Adam(self.parameters(),lr = 3e-4)
    def compute_metrics(self,logits,labels):
        preds = torch.argmax(logits,dim=1)
        accuracy = Accuracy(task="multiclass",num_classes=self.num_labels)
        return accuracy(preds.cpu(),labels.cpu())



In [77]:
model = ABSAModel(
    vocab_size=len(data_module.token_2_id),
    num_labels=3
).to(device)

In [83]:
trainer = pl.Trainer(max_epochs=3)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
ðŸ’¡ Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.


In [84]:
trainer.fit(model,data_module)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.

  | Name    | Type             | Params | Mode  | FLOPs
-------------------------------------------------------------
0 | model   | ABSA             | 2.6 M  | train | 0    
1 | loss_fn | CrossEntropyLoss | 0      | train | 0    
-------------------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.205    Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mode
0         Total Flops


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 113/113 [00:07<00:00, 14.68it/s, v_num=12, train_loss:=0.858, train_acc:=0.708]

`Trainer.fit` stopped: `max_epochs=3` reached.


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 113/113 [00:07<00:00, 14.50it/s, v_num=12, train_loss:=0.858, train_acc:=0.708]


In [87]:
trainer.test(
    model,
    data_module
)


Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 35/35 [00:00<00:00, 59.13it/s]
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
       Test metric             DataLoader 0
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        test_acc:            0.644325315952301
       test_loss:           0.8298655152320862
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

[{'test_loss:': 0.8298655152320862, 'test_acc:': 0.644325315952301}]

In [88]:
torch.save(model.model.state_dict(),"model_weights.pth")

In [89]:
score = trainer.test(
    model,
    data_module
)

Testing DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 35/35 [00:00<00:00, 59.12it/s]
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
       Test metric             DataLoader 0
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
        test_acc:            0.644325315952301
       test_loss:           0.8298655152320862
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

In [95]:
score[0]["test_acc:"]

0.644325315952301

In [97]:
import mlflow as ml

In [99]:
ml.set_experiment(experiment_name="project1")

<Experiment: artifact_location='file:E:/capstone_project_1/mlruns/1', creation_time=1770827969584, experiment_id='1', last_update_time=1770827969584, lifecycle_stage='active', name='project1', tags={}>

In [103]:
import mlflow
import pandas as pd
import os

# 1. Setup Tracking URI and Experiment
# Using an absolute path ensures the UI always finds your data
tracking_uri = f"file:///{os.path.abspath('mlruns')}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("ABSA_Sentiment_Analysis")

# 2. Setup Lightning Logger
mlf_logger = pl.loggers.MLFlowLogger(
    experiment_name="ABSA_Sentiment_Analysis",
    tracking_uri=tracking_uri
)

# 3. Execution Block
with mlflow.start_run() as run:
    # Log manual parameters
    mlflow.log_params({
        "learning_rate": 3e-4,
        "batch_size": 32,
        "epochs": 3,
        "vocab_size": len(data_module.token_2_id)
    })

    # Initialize Trainer with the logger
    trainer = pl.Trainer(
        max_epochs=3,
        logger=mlf_logger,
        log_every_n_steps=10
    )

    # Train
    trainer.fit(model, data_module)

    # Test
    score = trainer.test(model, data_module)

    # Log Metrics from test results
    if score:
        mlflow.log_metric("final_test_acc", score[0].get("test_acc", 0))
        mlflow.log_metric("final_test_loss", score[0].get("test_loss", 0))

    # 4. Handle Sample Batch (Fixed dictionary conversion)
    test_loader = data_module.test_dataloader()
    sample_batch = next(iter(test_loader))

    # Move tensors to CPU and convert to Numpy
    sample_numpy = {
        k: v.cpu().numpy() if isinstance(v, torch.Tensor) else v
        for k, v in sample_batch.items()
    }

    # Log the Model itself to MLflow
    mlflow.pytorch.log_model(model.model, "absa_lstm_model")

    print("-" * 30)
    print(f"Run ID: {run.info.run_id}")
    print(f"Tracking URI: {mlflow.get_tracking_uri()}")
    print("-" * 30)
    print("To open the UI, run the following command in your terminal:")
    print(f"mlflow ui --backend-store-uri {mlflow.get_tracking_uri()}")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
ðŸ’¡ Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.
ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.

  | Name    | Type             | Params | Mode  | FLOPs
-------------------------------------------------------------
0 | model   | ABSA             | 2.6 M  | train | 0    
1 | loss_fn | CrossEntropyLoss | 0      | train | 0    
-------------------------------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.205    Total estimated model params size (MB)
5         Modules in train mode
0         Modules in eval mod

Epoch 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 113/113 [00:06<00:00, 16.22it/s, v_num=6e81, train_loss:=0.246, train_acc:=0.958]

FileNotFoundError: [WinError 161] The specified path is invalid: '///E:/'