In [1]:
!pip install transformers datasets evaluate optuna shap xgboost lightgbm pandas-profiling tqdm

Collecting visions==0.7.4 (from visions[type_image_path]==0.7.4->pandas-profiling)
  Using cached visions-0.7.4-py3-none-any.whl.metadata (5.9 kB)
Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.8.1
    Uninstalling visions-0.8.1:
      Successfully uninstalled visions-0.8.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.14.0 requires visions[type_image_path]<0.8.2,>=0.7.5, but you have visions 0.7.4 which is incompatible.[0m[31m
[0mSuccessfully installed visions-0.7.4


In [2]:
!pip install scikit-learn==1.3.2
!pip install evaluate



In [3]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import optuna
# Replace this deprecated import
# from datasets import load_metric
# With the current approach from evaluate library
import evaluate
import shap
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Set device and seeds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42)
np.random.seed(42)

In [5]:
# 1. Data Loading and Preparation
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [6]:
# Load dataset (example with mock data)
df = pd.read_csv('/workspaces/Social-Media-Sentiment-Analysis-Trend-Forecasting/data/Tweets.csv')  # Replace with your dataset
df = df.drop_duplicates().dropna()

# Advanced text preprocessing
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

df['clean_text'] = df['text'].apply(clean_text)


In [7]:
!pip install textblob
from textblob import TextBlob
# Feature engineering
df['text_length'] = df['text'].apply(len)
df['num_hashtags'] = df['text'].apply(lambda x: len(re.findall(r'#\w+', x)))
df['sentiment_score'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)




In [8]:
!pip install ydata-profiling
# Import the ProfileReport class
from ydata_profiling import ProfileReport  # newer package name
# or use: from pandas_profiling import ProfileReport  # older package name

# 2. Advanced EDA
profile = ProfileReport(df, title="Preliminary Data Analysis")
profile.to_file("eda_report.html")

plt.figure(figsize=(12,6))
sns.histplot(df['sentiment_score'], kde=True)
plt.title('Sentiment Score Distribution')
plt.show()

Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Using cached visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Using cached visions-0.8.1-py3-none-any.whl (105 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.4
    Uninstalling visions-0.7.4:
      Successfully uninstalled visions-0.7.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires visions[type_image_path]==0.7.4, but you have visions 0.8.1 which is incompatible.[0m[31m
[0mSuccessfully installed visions-0.8.1


Summarize dataset: 100%|██████████| 28/28 [00:01<00:00, 15.49it/s, Completed]                                    
Generate report structure:   0%|          | 0/1 [00:13<?, ?it/s]


KeyboardInterrupt: 

In [None]:
df.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone', 'clean_text', 'text_length',
       'num_hashtags', 'sentiment_score'],
      dtype='object')

In [None]:
# First, define the TweetDataset class
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 3. Model Configuration
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

# Use the correct column names from your DataFrame
sentiment_column = 'airline_sentiment'  # This is the actual sentiment column in your DataFrame
text_column = 'clean_text'  # This column exists in your DataFrame

# Convert sentiment labels to numerical values
# First, check unique values in sentiment column
print("Unique sentiment values:", df[sentiment_column].unique())

# Create a mapping dictionary for sentiment labels
sentiment_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Apply mapping to create a numerical label column
df['sentiment_label'] = df[sentiment_column].map(sentiment_mapping)

# Now split the data
X_train, X_val, y_train, y_val = train_test_split(
    df[text_column], 
    df['sentiment_label'], 
    test_size=0.2, 
    stratify=df['sentiment_label'],
    random_state=42
)

train_dataset = TweetDataset(X_train.tolist(), y_train.tolist(), tokenizer, 128)
val_dataset = TweetDataset(X_val.tolist(), y_val.tolist(), tokenizer, 128)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unique sentiment values: ['negative']


In [None]:
# 4. Hyperparameter Optimization with Optuna

# First, create a compute_metrics function that includes F1 score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Load metrics from evaluate
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    # Compute F1 score with macro averaging for multi-class
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

def objective(trial):
    args = TrainingArguments(
        output_dir='temp/',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=trial.suggest_float("learning_rate", 1e-5, 5e-5),
        per_device_train_batch_size=trial.suggest_categorical("batch_size", [16, 32]),
        num_train_epochs=trial.suggest_int("epochs", 3, 6),
        weight_decay=trial.suggest_float("weight_decay", 0.0, 0.3),
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        push_to_hub=False,
        logging_dir='./logs',
        logging_steps=10
    )
    # Create a new model instance for each trial to avoid carrying over state
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,  # Add compute_metrics function
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    
    trainer.train()
    metrics = trainer.evaluate()
    
    # Return the F1 score (which will now be available thanks to compute_metrics)
    return metrics['eval_f1']


In [None]:
!pip install 'accelerate>=0.26.0' transformers[torch]



In [None]:
# First, define the compute_metrics function that includes F1 score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Load metrics from evaluate
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    # Compute F1 score with macro averaging for multi-class
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='macro')
    
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

# After the installation, let's use a fixed set of hyperparameters
# rather than using Optuna (which would require a kernel restart)
learning_rate = 3e-5
batch_size = 16
epochs = 3
weight_decay = 0.1

# Set a random seed for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# Train a model with the manually set hyperparameters
training_args = TrainingArguments(
    output_dir='best_model/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10
)

# Create a new model instance
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

# Create a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
print("Starting model training...")
trainer.train()

# Evaluate the model
print("Evaluating model...")
metrics = trainer.evaluate()
print(f"Model metrics: {metrics}")

# Save the model
trainer.save_model("final_sentiment_model")
print("Model saved to 'final_sentiment_model' directory")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting model training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.973809,0.0,0.0
2,No log,0.911056,0.0,0.0
3,No log,0.87776,1.0,1.0


Evaluating model...


Model metrics: {'eval_loss': 0.8777599334716797, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.2378, 'eval_samples_per_second': 0.808, 'eval_steps_per_second': 0.808, 'epoch': 3.0}
Model saved to 'final_sentiment_model' directory


In [11]:
# Apply mapping to create a numerical label column
df['sentiment_label'] = df[sentiment_column].map(sentiment_mapping)

# Split the data
from sklearn.model_selection import train_test_split

# Now split the data
X_train, X_val, y_train, y_val = train_test_split(
    df[text_column], 
    df['sentiment_label'], 
    test_size=0.2, 
    stratify=df['sentiment_label'],
    random_state=42
)

print("Data split complete:")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Class distribution in training: {y_train.value_counts().to_dict()}")
# 5. Alternative Approach: Memory-Efficient Models
print("Using memory-efficient models instead of BERT...")

# 5.1 TF-IDF + Linear Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Create TF-IDF + Logistic Regression pipeline
print("Training TF-IDF + Logistic Regression model...")
tfidf_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(C=1, max_iter=1000, random_state=42, n_jobs=-1))
])

# Train the model
tfidf_model.fit(X_train, y_train)

# Make predictions
tfidf_preds = tfidf_model.predict(X_val)
tfidf_probs = tfidf_model.predict_proba(X_val)

# Calculate metrics
tfidf_accuracy = accuracy_score(y_val, tfidf_preds)
tfidf_f1 = f1_score(y_val, tfidf_preds, average='macro')

print(f"TF-IDF Model Accuracy: {tfidf_accuracy:.4f}")
print(f"TF-IDF Model Macro F1: {tfidf_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, tfidf_preds, target_names=['Negative', 'Neutral', 'Positive']))

# 5.2 Gradient Boosting Model
print("\nTraining LightGBM model...")
import lightgbm as lgb

# First, create TF-IDF features
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

# Convert to LightGBM dataset format
lgb_train = lgb.Dataset(X_train_tfidf, y_train)
lgb_val = lgb.Dataset(X_val_tfidf, y_val, reference=lgb_train)

# LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Train LightGBM model
print("Training LightGBM model...")
gbm_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_val],
    early_stopping_rounds=20,
    verbose_eval=100
)

# Make predictions
gbm_preds = gbm_model.predict(X_val_tfidf)
gbm_preds_labels = gbm_preds.argmax(axis=1)

# Calculate metrics
gbm_accuracy = accuracy_score(y_val, gbm_preds_labels)
gbm_f1 = f1_score(y_val, gbm_preds_labels, average='macro')

print(f"LightGBM Model Accuracy: {gbm_accuracy:.4f}")
print(f"LightGBM Model Macro F1: {gbm_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, gbm_preds_labels, target_names=['Negative', 'Neutral', 'Positive']))

# 5.3 Feature Importance Analysis
print("\nAnalyzing feature importance...")

# Get top features from LightGBM
feature_importance = gbm_model.feature_importance()
feature_names = tfidf.get_feature_names_out()

# Sort features by importance
sorted_idx = feature_importance.argsort()[::-1]
top_n = 20

# Display top N important features
print(f"\nTop {top_n} important features:")
for i in range(min(top_n, len(sorted_idx))):
    print(f"{feature_names[sorted_idx[i]]}: {feature_importance[sorted_idx[i]]}")

# Plot feature importance
plt.figure(figsize=(12, 8))
plt.barh(range(top_n), feature_importance[sorted_idx][:top_n])
plt.yticks(range(top_n), [feature_names[i] for i in sorted_idx[:top_n]])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top Features for Sentiment Classification')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

# 5.4 Save the models
import pickle

print("\nSaving models...")
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(tfidf_model, f)
    
gbm_model.save_model('lightgbm_model.txt')

print("Models saved successfully!")

# 5.5 Metrics comparison
models = ['TF-IDF+LogReg', 'LightGBM']
accuracy_scores = [tfidf_accuracy, gbm_accuracy]
f1_scores = [tfidf_f1, gbm_f1]

plt.figure(figsize=(10, 6))
x = range(len(models))
width = 0.35
plt.bar(x, accuracy_scores, width, label='Accuracy')
plt.bar([i + width for i in x], f1_scores, width, label='F1 Score')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks([i + width/2 for i in x], models)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()

print("Model Performance Summary:")
print(f"TF-IDF+LogReg - Accuracy: {tfidf_accuracy:.4f}, F1: {tfidf_f1:.4f}")
print(f"LightGBM - Accuracy: {gbm_accuracy:.4f}, F1: {gbm_f1:.4f}")

NameError: name 'sentiment_column' is not defined

In [None]:
# 5. Final Model Training

# Check if 'study' exists, otherwise use default parameters
try:
    best_params = study.best_params
    print("Using optimized hyperparameters:", best_params)
except NameError:
    print("Hyperparameter optimization not found. Using default parameters.")
    best_params = {
        "learning_rate": 3e-5,
        "batch_size": 16, 
        "epochs": 3,
        "weight_decay": 0.1
    }
    print("Default hyperparameters:", best_params)

final_args = TrainingArguments(
    output_dir='best_model/',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["epochs"],
    weight_decay=best_params["weight_decay"],
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False,
)

final_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to(device)

final_trainer = Trainer(
    model=final_model,
    args=final_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("Training final model...")
final_trainer.train()
final_metrics = final_trainer.evaluate()
print(f"Final model metrics: {final_metrics}")

Hyperparameter optimization not found. Using default parameters.
Default hyperparameters: {'learning_rate': 3e-05, 'batch_size': 16, 'epochs': 3, 'weight_decay': 0.1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training final model...


: 

In [None]:
# 6. Model Interpretability with SHAP
print("Generating SHAP values for model interpretability...")

# Create a function to get model predictions
def model_predict(texts):
    # Tokenize the input texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    # Get model predictions
    with torch.no_grad():
        outputs = final_model(**inputs)
    # Return logits
    return outputs.logits.cpu().numpy()

# Select a subset of validation data for SHAP analysis (for efficiency)
shap_examples = X_val.sample(100).tolist()

# Initialize SHAP explainer
explainer = shap.Explainer(model_predict, tokenizer)

# Calculate SHAP values
shap_values = explainer(shap_examples)

# Visualize SHAP values for a few examples
plt.figure(figsize=(20, 6))
shap.plots.text(shap_values[:10], display=False)
plt.tight_layout()
plt.savefig('shap_values_text.png')
plt.show()

In [None]:
# 8. Advanced Evaluation
print("\n8. Advanced Evaluation")

# Generate predictions for the validation set
val_preds = final_trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_preds.predictions, axis=-1)
val_true_labels = y_val.tolist()

# Confusion matrix
cm = confusion_matrix(val_true_labels, val_pred_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

In [None]:
# 9. Error Analysis
print("\n9. Error Analysis")

# Find misclassified examples
misclassified_indices = np.where(val_pred_labels != val_true_labels)[0]
misclassified_texts = X_val.iloc[misclassified_indices].tolist()
misclassified_true = [val_true_labels[i] for i in misclassified_indices]
misclassified_pred = [val_pred_labels[i] for i in misclassified_indices]

# Show some misclassified examples
print("\nMisclassified Examples:")
for i in range(min(5, len(misclassified_texts))):
    print(f"Text: {misclassified_texts[i]}")
    print(f"True: {['Negative', 'Neutral', 'Positive'][misclassified_true[i]]}")
    print(f"Predicted: {['Negative', 'Neutral', 'Positive'][misclassified_pred[i]]}")
    print("---")

# Analyze common patterns in misclassifications
print("\nAnalyzing common patterns in misclassifications...")
# Length analysis
misclassified_lengths = [len(text.split()) for text in misclassified_texts]
correctly_classified_indices = np.where(val_pred_labels == val_true_labels)[0]
correctly_classified_texts = X_val.iloc[correctly_classified_indices].tolist()
correctly_classified_lengths = [len(text.split()) for text in correctly_classified_texts]

plt.figure(figsize=(12, 6))
plt.hist([misclassified_lengths, correctly_classified_lengths], bins=20, 
         alpha=0.7, label=['Misclassified', 'Correctly Classified'])
plt.xlabel('Text Length (words)')
plt.ylabel('Count')
plt.title('Distribution of Text Length for Misclassified vs. Correctly Classified Examples')
plt.legend()
plt.savefig('error_analysis_length.png')
plt.show()

In [None]:
# 10. Model Comparison
print("\n10. Model Comparison")

# Train a simpler baseline model for comparison
# Using a simple TF-IDF + Logistic Regression approach
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

print("Training baseline model (TF-IDF + Logistic Regression)...")
baseline_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

baseline_model.fit(X_train, y_train)
baseline_preds = baseline_model.predict(X_val)

# Compare performance
baseline_accuracy = accuracy_score(y_val, baseline_preds)
bert_accuracy = final_metrics['eval_accuracy']

print(f"Baseline model accuracy: {baseline_accuracy:.4f}")
print(f"BERT model accuracy: {bert_accuracy:.4f}")
print(f"Improvement: {(bert_accuracy - baseline_accuracy) * 100:.2f}%")


In [None]:
# 11. Time Series Analysis
print("\n11. Time Series Analysis")
# Check if the dataframe has a timestamp column
if 'tweet_created' in df.columns:
    # Convert to datetime if needed
    df['tweet_created'] = pd.to_datetime(df['tweet_created'])
    
    # Aggregate sentiments by date
    df['date'] = df['tweet_created'].dt.date
    sentiment_by_date = df.groupby('date')['sentiment_label'].value_counts().unstack().fillna(0)
    
    # Plot sentiment trends over time
    plt.figure(figsize=(14, 7))
    sentiment_by_date.plot(kind='line', ax=plt.gca())
    plt.title('Sentiment Trends Over Time')
    plt.xlabel('Date')
    plt.ylabel('Number of Tweets')
    plt.legend(['Negative', 'Neutral', 'Positive'])
    plt.tight_layout()
    plt.savefig('sentiment_time_series.png')
    plt.show()
else:
    print("No timestamp column available for time series analysis.")


In [None]:
# 12. Save models
print("\n12. Saving Models")
# Save the final BERT model
final_trainer.save_model("final_sentiment_model")

# Save the baseline model
import pickle
with open("baseline_model.pkl", "wb") as f:
    pickle.dump(baseline_model, f)

print("Models saved successfully.")

In [None]:
# 13. Advanced Monitoring Setup
print("\n13. Advanced Monitoring Setup")
print("Setting up inference monitoring to track model performance...")

# Create a function to simulate monitoring the model in production
def monitor_inference(model, text, true_label=None):
    """Simulate model monitoring in production"""
    # Record the inference time
    start_time = time.time()
    
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get prediction
    prediction = torch.argmax(outputs.logits, dim=1).item()
    
    # Record the time taken
    inference_time = time.time() - start_time
    
    # If true label is provided, record accuracy
    accuracy = None
    if true_label is not None:
        accuracy = 1 if prediction == true_label else 0
    
    return {
        "prediction": prediction,
        "inference_time": inference_time,
        "accuracy": accuracy
    }

# Demonstrate monitoring on a few examples
monitoring_examples = X_val.iloc[:5].tolist()
monitoring_labels = y_val.iloc[:5].tolist()

for i, (text, label) in enumerate(zip(monitoring_examples, monitoring_labels)):
    result = monitor_inference(final_model, text, label)
    print(f"Example {i+1}:")
    print(f"  Prediction: {['Negative', 'Neutral', 'Positive'][result['prediction']]}")
    print(f"  True label: {['Negative', 'Neutral', 'Positive'][label]}")
    print(f"  Inference time: {result['inference_time']*1000:.2f} ms")
    print(f"  Correct: {'Yes' if result['accuracy'] == 1 else 'No'}")
    print()

In [None]:
# 14. Cost Analysis
print("\n14. Cost Analysis")
print("Performing cost analysis for model deployment...")

# Simulate cost analysis for model deployment
def estimate_cost(model_size_mb, requests_per_day, cost_per_inference=0.0001, cost_per_gb_hour=0.5, hours_per_day=24):
    """Estimate cost for model deployment"""
    # Calculate inference cost per day
    daily_inference_cost = requests_per_day * cost_per_inference
    
    # Calculate storage and compute cost
    storage_cost = (model_size_mb / 1000) * cost_per_gb_hour * hours_per_day
    
    # Total cost per day
    total_daily_cost = daily_inference_cost + storage_cost
    
    # Monthly cost
    monthly_cost = total_daily_cost * 30
    
    return {
        "daily_inference_cost": daily_inference_cost,
        "daily_storage_cost": storage_cost,
        "total_daily_cost": total_daily_cost,
        "total_monthly_cost": monthly_cost
    }

# Estimate model size
model_size_mb = 500  # Approximate size of BERT base model in MB
requests_per_day = 10000  # Hypothetical number of requests per day

cost_estimate = estimate_cost(model_size_mb, requests_per_day)

print(f"Daily inference cost: ${cost_estimate['daily_inference_cost']:.2f}")
print(f"Daily storage cost: ${cost_estimate['daily_storage_cost']:.2f}")
print(f"Total daily cost: ${cost_estimate['total_daily_cost']:.2f}")
print(f"Total monthly cost: ${cost_estimate['total_monthly_cost']:.2f}")

# 15. Documentation and Reporting
print("\n15. Documentation and Reporting")
print("Generating project documentation and report...")


In [None]:
# Generate a simple Markdown report
report = """
# Sentiment Analysis Project Report

## Project Overview
This project implements advanced sentiment analysis using BERT and traditional ML models.

## Data Analysis
- Dataset size: {data_size} tweets
- Class distribution: {class_distribution}

## Model Performance
- BERT model accuracy: {bert_accuracy:.4f}
- Baseline model accuracy: {baseline_accuracy:.4f}
- Performance improvement: {improvement:.2f}%

## Key Findings
- The BERT model significantly outperforms the baseline model
- Misclassifications tend to happen more with {misclass_trend}
- Sentiment trends show {sentiment_trend}

## Deployment Considerations
- Estimated monthly cost: ${monthly_cost:.2f}
- Average inference time: {avg_inference_time:.2f} ms

## Recommendations
1. Consider fine-tuning the model further with additional data
2. Implement a monitoring system to detect performance degradation
3. Regular retraining to adapt to changing language patterns
""".format(
    data_size=len(df),
    class_distribution=df['sentiment_label'].value_counts().to_dict(),
    bert_accuracy=bert_accuracy,
    baseline_accuracy=baseline_accuracy,
    improvement=(bert_accuracy - baseline_accuracy) * 100,
    misclass_trend="shorter texts" if np.mean(misclassified_lengths) < np.mean(correctly_classified_lengths) else "longer texts",
    sentiment_trend="variations over time" if 'tweet_created' in df.columns else "not analyzed (no time data)",
    monthly_cost=cost_estimate['total_monthly_cost'],
    avg_inference_time=np.mean([monitor_inference(final_model, text)['inference_time'] * 1000 for text in X_val.iloc[:10].tolist()])
)

# Save the report to a file
with open("sentiment_analysis_report.md", "w") as f:
    f.write(report)

print("Report saved to sentiment_analysis_report.md")
print("\nProject complete!")