In [21]:
import pandas as pd
import numpy as np
import re
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# STEP 1: Load Dataset from Local File
file_path = "C:\\Users\\Admin\\Downloads\\YoutubeCommentsDataSet.csv"  # Path to your downloaded file
df = pd.read_csv(file_path)

# STEP 2: Inspect the columns to find the correct column for comments
print(df.columns)

# STEP 3: Clean the Comments
def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^A-Za-z\s]", "", text)  # Remove non-alphabetical characters
    return text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces

# Apply the clean_text function to the 'Comment' column
df['cleaned_comment'] = df['Comment'].apply(clean_text)

# STEP 4: Run Sentiment Analysis using BERT
sentiment_pipeline = pipeline("sentiment-analysis")

# Mapping of BERT output to three labels (Positive, Negative, Neutral)
sentiments = []
for comment in df['cleaned_comment'].dropna().tolist()[:100]:  # Limit to 100 for performance
    result = sentiment_pipeline(comment)[0]
    label = result['label']
    score = result['score']
    
    if label == 'LABEL_0':  # Negative sentiment
        sentiment = 0  # Negative
    elif label == 'LABEL_1':  # Neutral sentiment
        sentiment = 1  # Neutral
    else:  # Positive sentiment
        sentiment = 2  # Positive
    
    sentiments.append(sentiment)

sentiment_df = pd.DataFrame({
    'comment': df['cleaned_comment'].dropna().tolist()[:100],
    'sentiment_score': sentiments
})

# STEP 5: Simulate QoS Features
np.random.seed(42)
n = len(sentiment_df)
qos_data = pd.DataFrame({
    'bitrate': np.random.randint(1000, 4000, size=n),
    'buffering': np.random.uniform(0.2, 3.0, size=n),
    'resolution': np.random.choice([480, 720, 1080], size=n),
    'viewers': np.random.randint(100, 5000, size=n),
})

# Combine with sentiment scores
final_data = pd.concat([qos_data.reset_index(drop=True), sentiment_df['sentiment_score']], axis=1)

# STEP 6: Prepare Data for Training (Handling three classes)
labels = sentiment_df['sentiment_score']  # This is now a 3-class label

# STEP 7: Prepare Data for Neural Network
X = final_data
y = labels

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Change to long for multi-class classification
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)  # Change to long for multi-class classification

# STEP 8: Define Deep Neural Network Model for Multi-Class Classification
class SatisfactionModel(nn.Module):
    def __init__(self, input_dim):
        super(SatisfactionModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 3),  # Output 3 classes (for Positive, Negative, Neutral)
            nn.Softmax(dim=1)  # Softmax for multi-class classification
        )

    def forward(self, x):
        return self.net(x)

model = SatisfactionModel(input_dim=X.shape[1])
criterion = nn.CrossEntropyLoss()  # Using CrossEntropyLoss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# STEP 9: Train the Model
for epoch in range(50):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch} - Loss: {loss.item():.4f}")

# STEP 10: Evaluate
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor).argmax(dim=1)  # Use argmax to get the predicted class
    accuracy = (predictions == y_test_tensor).float().mean()
    print(f"\n✅ Test Accuracy: {accuracy.item():.2f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Index(['Comment', 'Sentiment'], dtype='object')


Device set to use cpu


Epoch 0 - Loss: 1.1805
Epoch 10 - Loss: 1.1656
Epoch 20 - Loss: 1.1459
Epoch 30 - Loss: 1.1247
Epoch 40 - Loss: 1.0975

✅ Test Accuracy: 0.70


In [22]:
torch.save(model.state_dict(), 'sentiment_model.pth')  # Save model weights
print("Model saved successfully!")

Model saved successfully!


In [23]:
# STEP 12: Load the Saved Model
# Reinitialize the model
model = SatisfactionModel(input_dim=X.shape[1])

# Load the saved weights
model.load_state_dict(torch.load('sentiment_model.pth'))
model.eval()  # Set the model to evaluation mode

print("Model loaded successfully!")

Model loaded successfully!


In [25]:
def predict_sentiment_with_model(comment):
    # Clean the new comment
    cleaned_comment = clean_text(comment)
    
    # Use the BERT model for sentiment analysis (for comment sentiment score)
    sentiment_pipeline = pipeline("sentiment-analysis")
    result = sentiment_pipeline(cleaned_comment)[0]
    label = result['label']
    score = result['score']
    
    # Map BERT output to sentiment label
    if label == "POSITIVE":
        sentiment_score = 2  # Positive
    elif label == "NEGATIVE":
        sentiment_score = 0  # Negative
    else:
        sentiment_score = 1  # Neutral
    
    # Simulate QoS data for the new comment (or replace with real QoS data)
    qos_test_data = np.random.rand(1, 4)  # Assuming 4 features like bitrate, buffering, resolution, viewers
    
    # Add the sentiment score as a new feature
    qos_test_data_with_sentiment = np.hstack([qos_test_data, np.array([[sentiment_score]])])  # Adding sentiment score
    
    # Use the same scaler as during training
    qos_test_data_with_sentiment = scaler.transform(qos_test_data_with_sentiment)  # Now we have 5 features
    
    # Convert to torch tensor
    input_tensor = torch.tensor(qos_test_data_with_sentiment, dtype=torch.float32)
    
    # Get prediction from the trained model
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        predicted_class = output.argmax(dim=1).item()  # Get the class with the highest probability
    
    # Map predicted class to the corresponding sentiment label
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    predicted_sentiment = sentiment_labels[predicted_class]
    
    return predicted_sentiment, score

# Example usage for prediction:
new_comment = "I didn't like this movie"
predicted_sentiment, score = predict_sentiment_with_model(new_comment)
print(f"Predicted Sentiment: {predicted_sentiment} with confidence score: {score:.2f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Predicted Sentiment: Negative with confidence score: 0.96


