# Prerequisite Code

In [1]:
# Installing dependencies 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score


In [2]:
# Fetching the preprocessed data as 'posts' and 'labels' to be used also

models = []
posts = []  
labels = [] 

folders = {
    "depression": {
        "path": "data/preprocessed/preprocessed_depression_posts",
        "label": 1  # Label for depression-related posts
    },
    "breastcancer": {
        "path": "data/preprocessed/preprocessed_breastcancer_posts",
        "label": 0  # Label for breast cancer posts
    }
}

for category, data in folders.items():
    folder_path = data["path"]
    label = data["label"]
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()  # Read the file content
                posts.append(content)  # Add to postst list
                labels.append(label)  # Add corresponding label


In [3]:
# Loading the feature extraction data
empath_file = "data/feature_extracted_data/empath_features_with_labels.csv"
lda_file = "data/feature_extracted_data/lda_topic_distributions_with_labels.csv"
unigram_file = "data/feature_extracted_data/unigram_features_with_labels.csv"
bigram_file = "data/feature_extracted_data/bigram_features_with_labels.csv"

In [4]:
# Creating a ModelTrainer class

class ModelTrainer:
    def __init__(self, csv_files, model, model_name, model_params=None, random_state=42):
        """
        Initialize the ModelTrainer class.

        Parameters:
        csv_files (list of str): List of file paths for the feature datasets (CSV files).
        model (class): Machine learning model class (e.g., LogisticRegression, SVC).
        model_name (str): Name of the model for identification.
        model_params (dict): Parameters for the model.
        random_state (int): Random seed for reproducibility.
        """
        self.csv_files = csv_files
        self.model_name = model_name
        self.model_class = model
        self.model_params = model_params if model_params else {}
        self.data = None
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.random_state = random_state
        self.metrics = {}

    def load_and_combine_data(self):
        """
        Load and combine data from multiple CSV files into a single dataset.
        Assumes each CSV has the same 'label' column.
        """
        print(f"Loading data for {self.model_name}...")
        data_frames = [pd.read_csv(file) for file in self.csv_files]

        # Ensure labels are consistent and take from the first dataset
        labels = data_frames[0]['label']
        for df in data_frames[1:]:
            if 'label' in df.columns:
                df.drop(columns=['label'], inplace=True)

        combined_data = pd.concat(data_frames, axis=1)

        # Check alignment between features and labels
        if len(labels) != len(combined_data):
            raise ValueError(
                f"Mismatch between features and labels: "
                f"{len(combined_data)} rows in features, {len(labels)} in labels."
            )

        # Add the label column
        self.data = combined_data
        self.data['label'] = labels
        print(f"Loaded data shape: {self.data.shape}")

    def preprocess_data(self, test_size=0.2):
        """
        Split the data into training and testing sets.

        Parameters:
        test_size (float): Proportion of data to use for testing.
        """
        print("Splitting data into train and test sets...")
        X = self.data.iloc[:, :-1]  # All columns except the label column
        y = self.data['label']  # Label column

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )
        print(f"Training set size: {self.X_train.shape}, Test set size: {self.X_test.shape}")

    def train_model(self):
        """
        Train the specified model on the training data.
        """
        print(f"Training {self.model_name}...")
        self.model = self.model_class(**self.model_params)
        self.model.fit(self.X_train, self.y_train)
        print(f"Model {self.model_name} trained successfully.")

    def evaluate_model(self):
        """
        Evaluate the trained model on the test data and print metrics.
        """
        if self.model is None:
            raise ValueError(f"Model {self.model_name} has not been trained yet.")

        print(f"Evaluating {self.model_name}...")
        y_pred = self.model.predict(self.X_test)
        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")

        # Calculate metrics
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred, average='weighted')
        precision = precision_score(self.y_test, y_pred, average='weighted')
        recall = recall_score(self.y_test, y_pred, average='weighted')

        # Store metrics
        self.metrics = {
            "Model": self.model_name,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Precision": precision,
            "Recall": recall
        }

    def run_pipeline(self):
        """
        Complete pipeline: load data, preprocess, train, and evaluate.
        """
        self.load_and_combine_data()
        self.preprocess_data()
        self.train_model()
        self.evaluate_model()

In [5]:
# Creating a function for compiling metrics of the models

svm_models = []
mlp_models = []
lr_models = []
rf_models = []
ada_models = []
models = []

def compile_metrics(models):
    metrics_data = [model.metrics for model in models]
    metrics_df = pd.DataFrame(metrics_data)
    return metrics_df

# Adjustable hyperparameters and inputs

In [8]:
# Model configurations
models_config = [
    {
        "name": "AdaBoost",
        "model_class": AdaBoostClassifier,
        "params": {'n_estimators': 50, 'random_state': 42}
    },
    {
        "name": "SVM",
        "model_class": SVC,
        "params": {'C': 1.0, 'kernel': 'linear', 'random_state': 42}
    },
    {
        "name": "Random Forest",
        "model_class": RandomForestClassifier,
        "params": {'n_estimators': 100, 'random_state': 42}
    },
    {
        "name": "Logistic Regression",
        "model_class": LogisticRegression,
        "params": {'max_iter': 500, 'random_state': 42}
    },
    {
        "name": "MLP",
        "model_class": MLPClassifier,
        "params": {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'random_state': 42}
    }
]

In [9]:
# Define feature combinations
feature_combinations = [
    {"files": [empath_file], "name_suffix": "(Empath)"},
    {"files": [lda_file], "name_suffix": "(LDA)"},
    {"files": [unigram_file], "name_suffix": "(unigram)"},
    {"files": [bigram_file], "name_suffix": "(bigram)"},
    {"files": [empath_file, lda_file, unigram_file], "name_suffix": "(EM + LDA + unigram)"},
    {"files": [empath_file, lda_file, bigram_file], "name_suffix": "(EM + LDA + bigram)"}
]

# Training and Outputs

In [10]:
# Train and evaluate models
all_models = []
model_specific_results = {}


for model_config in models_config:
    model_name = model_config["name"]
    model_class = model_config["model_class"]
    model_params = model_config["params"]
    
    # For each feature combination
    specific_models = []
    for feature_combo in feature_combinations:
        feature_files = feature_combo["files"]
        name_suffix = feature_combo["name_suffix"]
        full_model_name = f"{model_name} {name_suffix}"
        
        print(f"\nTraining {full_model_name}...")
        trainer = ModelTrainer(feature_files, model_class, full_model_name, model_params)
        trainer.run_pipeline()
        
        # Store the model and its results
        all_models.append(trainer)
        specific_models.append(trainer)
    
    # Compile metrics for the specific model type
    model_specific_results[model_name] = compile_metrics(specific_models)

# Compile overall metrics for all models
overall_metrics_table = compile_metrics(all_models)
print("\nOverall Metrics Table:")
print(overall_metrics_table)


Training AdaBoost (Empath)...
Loading data for AdaBoost (Empath)...
Loaded data shape: (293, 48)
Splitting data into train and test sets...
Training set size: (234, 47), Test set size: (59, 47)
Training AdaBoost (Empath)...
Model AdaBoost (Empath) trained successfully.
Evaluating AdaBoost (Empath)...
Accuracy: 0.8983050847457628

Training AdaBoost (LDA)...
Loading data for AdaBoost (LDA)...
Loaded data shape: (293, 31)
Splitting data into train and test sets...
Training set size: (234, 30), Test set size: (59, 30)
Training AdaBoost (LDA)...
Model AdaBoost (LDA) trained successfully.
Evaluating AdaBoost (LDA)...
Accuracy: 0.864406779661017

Training AdaBoost (unigram)...
Loading data for AdaBoost (unigram)...
Loaded data shape: (293, 4296)
Splitting data into train and test sets...
Training set size: (234, 4295), Test set size: (59, 4295)
Training AdaBoost (unigram)...
Model AdaBoost (unigram) trained successfully.
Evaluating AdaBoost (unigram)...
Accuracy: 1.0

Training AdaBoost (bigr

  _warn_prf(average, modifier, msg_start, len(result))


Loaded data shape: (293, 4373)
Splitting data into train and test sets...
Training set size: (234, 4372), Test set size: (59, 4372)
Training Logistic Regression (EM + LDA + unigram)...
Model Logistic Regression (EM + LDA + unigram) trained successfully.
Evaluating Logistic Regression (EM + LDA + unigram)...
Accuracy: 1.0

Training Logistic Regression (EM + LDA + bigram)...
Loading data for Logistic Regression (EM + LDA + bigram)...
Loaded data shape: (293, 29786)
Splitting data into train and test sets...
Training set size: (234, 29785), Test set size: (59, 29785)
Training Logistic Regression (EM + LDA + bigram)...
Model Logistic Regression (EM + LDA + bigram) trained successfully.
Evaluating Logistic Regression (EM + LDA + bigram)...
Accuracy: 1.0

Training MLP (Empath)...
Loading data for MLP (Empath)...
Loaded data shape: (293, 48)
Splitting data into train and test sets...
Training set size: (234, 47), Test set size: (59, 47)
Training MLP (Empath)...




Model MLP (Empath) trained successfully.
Evaluating MLP (Empath)...
Accuracy: 0.864406779661017

Training MLP (LDA)...
Loading data for MLP (LDA)...
Loaded data shape: (293, 31)
Splitting data into train and test sets...
Training set size: (234, 30), Test set size: (59, 30)
Training MLP (LDA)...




Model MLP (LDA) trained successfully.
Evaluating MLP (LDA)...
Accuracy: 0.8305084745762712

Training MLP (unigram)...
Loading data for MLP (unigram)...
Loaded data shape: (293, 4296)
Splitting data into train and test sets...
Training set size: (234, 4295), Test set size: (59, 4295)
Training MLP (unigram)...
Model MLP (unigram) trained successfully.
Evaluating MLP (unigram)...
Accuracy: 1.0

Training MLP (bigram)...
Loading data for MLP (bigram)...
Loaded data shape: (293, 29709)
Splitting data into train and test sets...
Training set size: (234, 29708), Test set size: (59, 29708)
Training MLP (bigram)...
Model MLP (bigram) trained successfully.
Evaluating MLP (bigram)...
Accuracy: 0.864406779661017

Training MLP (EM + LDA + unigram)...
Loading data for MLP (EM + LDA + unigram)...
Loaded data shape: (293, 4373)
Splitting data into train and test sets...
Training set size: (234, 4372), Test set size: (59, 4372)
Training MLP (EM + LDA + unigram)...
Model MLP (EM + LDA + unigram) trained 