<a href="https://colab.research.google.com/github/yh250/TrasnToast/blob/main/TransToast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
# Custom dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Load the combined dataset
df = pd.read_csv('combined_sorted_1000.csv')

# Encode labels (assuming you want to use the second label column)
label_encoder = LabelEncoder()
df['sub'] = label_encoder.fit_transform(df['sub'])

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Text'], df['sub'], test_size=0.2, random_state=42)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create datasets
train_dataset = SentimentDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len=128)
val_dataset = SentimentDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_len=128)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, bert_hidden_size, lstm_hidden_size, num_labels):
        super(SentimentAnalysisModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(bert_hidden_size, lstm_hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(lstm_hidden_size * 2, num_labels)  # * 2 for bidirectional

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  # Freeze BERT parameters
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_out, _ = self.lstm(bert_output.last_hidden_state)
        logits = self.fc(lstm_out[:, -1, :])  # Use the output of the last time step
        return logits

# Initialize the model
model = SentimentAnalysisModel(bert_hidden_size=768, lstm_hidden_size=128, num_labels=len(label_encoder.classes_))
model = model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')


Epoch 1/3, Loss: 0.6640
Epoch 2/3, Loss: 0.6034
Epoch 3/3, Loss: 0.5227


In [7]:
#Evaluating model performance
def evaluate(model, data_loader):
    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    accuracy = accuracy_score(actuals, predictions)
    return accuracy

# Evaluate on the validation set
val_accuracy = evaluate(model, val_loader)
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.8850


In [14]:
#Running the model finally
def predict_sentiment(text, model, tokenizer, max_len=128):
    model.eval()

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, prediction = torch.max(outputs, dim=1)

    return label_encoder.inverse_transform(prediction.cpu().numpy())[0]

# Example usage
#sample_text = "This is a sample comment for sentiment analysis."
user_input = input('Enter your text here :')
#predicted_label = predict_sentiment(sample_text, model, tokenizer)
predicted_label = predict_sentiment(user_input, model, tokenizer)
print(f'Predicted Label: {predicted_label}')


Enter your text here :you look like a puppy 
Predicted Label: RoastMe


In [16]:
import pickle

In [18]:
pickle.dump(model,open('/content/model_1000','wb'))