In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/again1-processed-data/Processed.csv


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
import torch
import torch.nn as nn
import numpy as np
import mlflow
import mlflow.pytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [3]:
import os
import warnings
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
import mlflow.pytorch
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn and imblearn imports
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier



# NLTK, Gensim and Hugging Face imports for text processing & LDA / BERT
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

# Suppress warnings
warnings.filterwarnings('ignore')

###############################
# MLflow Setup Functions
###############################
def create_mlflow_directory():
    """Create MLflow directory in current working directory"""
    try:
        current_dir = os.getcwd()
        mlruns_dir = os.path.join(current_dir, 'mlruns')
        os.makedirs(mlruns_dir, exist_ok=True)
        return mlruns_dir
    except Exception as e:
        print(f"Error creating MLflow directory: {str(e)}")
        return None

def setup_mlflow(mlruns_dir, experiment_name):
    """Setup MLflow tracking URI and experiment"""
    try:
        mlflow.set_tracking_uri(f"file:{mlruns_dir}")
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)
        print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")
        print(f"Experiment: {experiment_name}")
        return True
    except Exception as e:
        print(f"Error setting up MLflow: {str(e)}")
        return False

###############################
# Data Loading and Preprocessing
###############################
def load_data(file_path):
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Data file not found at: {file_path}")
        df = pd.read_csv(file_path)
        print("Dataset loaded. Shape:", df.shape)
        # Adjust based on your dataset; drop or rename columns as needed.
        # For example, if there is an ID column, drop it:
        if "ID" in df.columns:
            df.drop(columns=["ID"], inplace=True)
        return df
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None

In [4]:
###############################
# Model 1: LDA-based Feature + RF with SMOTE
###############################
def run_model_1(df, target_col):
    # Preprocess text for LDA
    stop_words = set(stopwords.words('english'))
    def preprocess_text(text):
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        return tokens

    df['tokens'] = df['Cleaned_Feedback'].apply(preprocess_text)
    dictionary = corpora.Dictionary(df['tokens'])
    corpus = [dictionary.doc2bow(text) for text in df['tokens']]
    num_topics = 3
    lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)
    def get_dominant_topic(doc_bow):
        topics = lda_model.get_document_topics(doc_bow)
        return max(topics, key=lambda x: x[1])[0]
    df['Feedback_topic'] = [get_dominant_topic(bow) for bow in corpus]
    print("LDA topics assigned. Distribution:")
    print(df['Feedback_topic'].value_counts())
    
    # Prepare structured data features (exclude text and tokens)
    exclude_cols = ["Cleaned_Feedback", "tokens", target_col]
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    for col in feature_cols:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
    X = pd.get_dummies(df[feature_cols], drop_first=True)
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Apply SMOTE
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
    print("After SMOTE, counts:")
    print(pd.Series(y_train_res).value_counts())
    
    # Train RandomForest model
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    with mlflow.start_run(run_name="Model_1_RF_LDA_SMOTE"):
        mlflow.log_param("model", "RandomForest with LDA_Features")
        rf_clf.fit(X_train_res, y_train_res)
        y_pred = rf_clf.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(rf_clf, "rf_model")
        report = classification_report(y_test, y_pred)
        mlflow.log_text(report, "classification_report.txt")
        print("Model 1 Accuracy:", acc)
        print(report)

In [7]:
###############################
# Model 2: bert-base-uncased for Text Embeddings + Structured Data
###############################
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

def balance_dataset(df, target):
    print("Before Balancing:", Counter(df[target]))

    # If target is categorical, apply oversampling
    if df[target].dtype == 'object':
        # Convert categorical labels to numerical
        df[target] = df[target].astype('category').cat.codes
    
    # Separate structured features and target
    structured_cols = df.drop(columns=['Cleaned_Feedback', target]).columns
    X_structured = df[structured_cols]
    y = df[target]

    # Apply SMOTE for structured numerical data
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_structured, y)

    # Convert back to dataframe
    df_resampled = pd.DataFrame(X_resampled, columns=structured_cols)
    df_resampled[target] = y_resampled

    # Re-add the text column by randomly duplicating text from existing samples
    text_upsampled = resample(df['Cleaned_Feedback'], replace=True, n_samples=len(df_resampled), random_state=42)
    df_resampled['Cleaned_Feedback'] = text_upsampled.reset_index(drop=True)

    print("After Balancing:", Counter(df_resampled[target]))
    return df_resampled

def run_model_4(df, target):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Balance the dataset before splitting
    df_balanced = balance_dataset(df, target)

    df_train, df_test = train_test_split(df_balanced, test_size=0.2, random_state=42)
    train_dataset = CombinedDataset(df_train.reset_index(drop=True), tokenizer, target)
    test_dataset = CombinedDataset(df_test.reset_index(drop=True), tokenizer, target)

    BATCH_SIZE = 16
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    structured_dim = df_balanced.drop(columns=['Cleaned_Feedback', target]).shape[1]
    n_classes = df_balanced[target].nunique()
    
    model = CombinedModel(structured_dim=structured_dim, n_classes=n_classes)
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    epochs = 3
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            structured = batch["structured"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, structured=structured)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        
        print(f"Model 4 - Epoch {epoch+1}/{epochs} Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            structured = batch["structured"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, structured=structured)
            _, predicted = torch.max(outputs, dim=1)
            preds.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    acc = np.mean(np.array(preds) == np.array(true_labels))

    with mlflow.start_run(run_name="Model_4_BERT_Structured"):
        mlflow.log_param("model", "Combined BERT + Structured")
        mlflow.log_metric("accuracy", acc)
        mlflow.pytorch.log_model(model, "combined_model")
        report = classification_report(true_labels, preds)
        mlflow.log_text(report, "classification_report.txt")
        print("Model 4 Accuracy:", acc)
        print(report)


In [8]:
###############################
# Model 3: Hospital Reviews Model for Text Embeddings + Structured Data
###############################
# We'll use brettclaus/Hospital_Reviews to obtain text embeddings

class CombinedDataset(Dataset):
    def __init__(self, df, tokenizer, target, max_len=64):
        # Drop the text column and target from structured features
        self.structured = df.drop(columns=['Cleaned_Feedback', target]).values.astype(np.float32)
        self.texts = df['Cleaned_Feedback'].tolist()
        self.labels = df[target].values
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            "structured": torch.tensor(self.structured[idx]),
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[idx])
        }

class CombinedModel(nn.Module):
    def __init__(self, structured_dim, n_classes, dropout_prob=0.3):
        super(CombinedModel, self).__init__()
        # Use the brettclaus/Hospital_Reviews model instead of bert-base-uncased
        self.hospital_model = AutoModelForSequenceClassification.from_pretrained("brettclaus/Hospital_Reviews")
        # For embeddings, we use the underlying model from the loaded model.
        # Note: The output dimension is typically available from config.hidden_size.
        self.text_model = self.hospital_model.base_model  # This gives the underlying transformer model
        text_output_dim = self.hospital_model.config.hidden_size
        
        # Structured branch
        self.fc_structured = nn.Linear(structured_dim, 32)
        self.dropout = nn.Dropout(dropout_prob)
        # Final classification head
        self.fc = nn.Linear(text_output_dim + 32, n_classes)
        
    def forward(self, input_ids, attention_mask, structured):
        # Get text embeddings from the hospital reviews model
        # For many Hugging Face models, pooler_output is used as the [CLS] token representation.
        outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        
        structured_out = torch.relu(self.fc_structured(structured))
        combined = torch.cat((cls_output, structured_out), dim=1)
        combined = self.dropout(combined)
        return self.fc(combined)

def run_model_5(df, target):
    # Use the tokenizer associated with brettclaus/Hospital_Reviews
    tokenizer = AutoTokenizer.from_pretrained("brettclaus/Hospital_Reviews")
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = CombinedDataset(df_train.reset_index(drop=True), tokenizer, target)
    test_dataset = CombinedDataset(df_test.reset_index(drop=True), tokenizer, target)
    
    BATCH_SIZE = 16
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    structured_dim = df.drop(columns=['Cleaned_Feedback', target]).shape[1]
    n_classes = df[target].nunique()
    
    model = CombinedModel(structured_dim=structured_dim, n_classes=n_classes)
    model = model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    epochs = 3
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            structured = batch["structured"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, structured=structured)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        print(f"Model 4 - Epoch {epoch+1}/{epochs} Loss: {total_loss/len(train_loader):.4f}")
    
    # Evaluation
    model.eval()
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            structured = batch["structured"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, structured=structured)
            _, predicted = torch.max(outputs, dim=1)
            preds.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    acc = np.mean(np.array(preds) == np.array(true_labels))
    
    with mlflow.start_run(run_name="Model_4_HospitalReviews_Structured"):
        mlflow.log_param("model", "Combined brettclaus/Hospital_Reviews + Structured")
        mlflow.log_metric("accuracy", acc)
        # Log the PyTorch model with MLflow
        mlflow.pytorch.log_model(model, "combined_model")
        report = classification_report(true_labels, preds)
        mlflow.log_text(report, "classification_report.txt")
        print("Model 4 Accuracy:", acc)
        print(report)


In [9]:
###############################
# Main Execution with MLflow Integration
###############################
def main():
    mlruns_dir = create_mlflow_directory()
    if not mlruns_dir:
        return
    experiment_name = "Multi_Model_Integration"
    if not setup_mlflow(mlruns_dir, experiment_name):
        return
    # Adjust the data path as needed
    data_path = "/kaggle/input/again1-processed-data/Processed.csv"
    df = load_data(data_path)
    if df is None:
        return
    
    target_col = "Sentiment"  # Adjust as needed
    
    # Run each model and track with MLflow
    print("\nRunning Model 1:")
    run_model_1(df.copy(), target_col)
    
    #print("\nRunning Model 2:")
    #run_model_2(df.copy(), target_col)
    
    #print("\nRunning Model 3:")
    #run_model_3(df.copy(), target_col)
    
    print("\nRunning Model 4:")
    run_model_4(df.copy(), target_col)

    print("\nRunning Model 5:")
    run_model_5(df.copy(), target_col)
    
    print("\nAll models executed. To view MLflow UI, run:")
    print(f"mlflow ui --backend-store-uri file:{mlruns_dir}")

if __name__ == "__main__":
    main()

MLflow Tracking URI: file:/kaggle/working/mlruns
Experiment: Multi_Model_Integration
Dataset loaded. Shape: (1000, 11)

Running Model 1:
LDA topics assigned. Distribution:
Feedback_topic
2    426
0    338
1    236
Name: count, dtype: int64
After SMOTE, counts:
Sentiment
1    383
2    383
0    383
Name: count, dtype: int64




Model 1 Accuracy: 0.555
              precision    recall  f1-score   support

           0       0.21      0.15      0.17        41
           1       0.63      0.63      0.63        63
           2       0.60      0.68      0.63        96

    accuracy                           0.56       200
   macro avg       0.48      0.49      0.48       200
weighted avg       0.53      0.56      0.54       200


Running Model 2:
Model 2 - SMOTE distribution:
Sentiment
1    383
2    383
0    383
Name: count, dtype: int64




Model 2 Ensemble Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        63
           2       1.00      1.00      1.00        96

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Running Model 3:




Model 3 RF Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        63
           2       1.00      1.00      1.00        96

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Running Model 4:


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Before Balancing: Counter({2: 479, 1: 318, 0: 203})
After Balancing: Counter({1: 479, 0: 479, 2: 479})


config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model 4 - Epoch 1/3 Loss: 5.3917
Model 4 - Epoch 2/3 Loss: 4.7310
Model 4 - Epoch 3/3 Loss: 4.7243




Model 4 Accuracy: 0.3229166666666667
              precision    recall  f1-score   support

           0       0.32      0.35      0.34       109
           1       0.31      0.54      0.39        92
           2       0.56      0.06      0.10        87

    accuracy                           0.32       288
   macro avg       0.40      0.32      0.28       288
weighted avg       0.39      0.32      0.28       288


Running Model 5:


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Model 4 - Epoch 1/3 Loss: 4.8483
Model 4 - Epoch 2/3 Loss: 2.7159
Model 4 - Epoch 3/3 Loss: 2.1016




Model 4 Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.61      1.00      0.75        40
           1       1.00      1.00      1.00        75
           2       1.00      0.69      0.82        85

    accuracy                           0.87       200
   macro avg       0.87      0.90      0.86       200
weighted avg       0.92      0.87      0.87       200


All models executed. To view MLflow UI, run:
mlflow ui --backend-store-uri file:/kaggle/working/mlruns
