# Prerequisite Code

In [1]:
# Importing dependencies
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score


In [2]:
# Read the preprocessed posts from disk into two parallel lists:
# posts[i] holds the text of one file and labels[i] the label of its folder.

models = []  # not populated in this cell
posts = []
labels = []

folders = {
    "depression": {
        "path": "data/preprocessed_posts/depression",
        "label": 1  # Label for depression-related posts
    },
    "breastcancer": {
        "path": "data/preprocessed_posts/breastcancer",
        "label": 0  # Label for breast cancer posts
    }
}

for category, data in folders.items():
    folder_path = data["path"]
    label = data["label"]
    # os.listdir() returns entries in arbitrary order; sorting the names makes
    # the order of `posts`/`labels` reproducible across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip sub-directories and anything else that is not a regular file.
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()  # Read the file content
            posts.append(content)  # Add to posts list
            labels.append(label)  # Add corresponding label


In [3]:
# Locations of the feature-extraction outputs (one CSV per feature family).
# Nothing is read here; these paths are only referenced by later cells.
empath_file, lda_file = (
    "data/feature_extracted_data/empath_features_with_labels.csv",
    "data/feature_extracted_data/lda_topic_distributions_with_labels.csv",
)
unigram_file, bigram_file = (
    "data/feature_extracted_data/unigram_features_with_labels.csv",
    "data/feature_extracted_data/bigram_features_with_labels.csv",
)

In [4]:
# Creating a ModelTrainer class

class ModelTrainer:
    """
    Load feature CSVs, split them, fit one classifier and collect its metrics.

    Typical use is ``ModelTrainer(files, ModelClass, "name", params).run_pipeline()``,
    after which the results are available in ``self.metrics``.
    """

    # Name of the target column every feature CSV is expected to contain.
    LABEL_COLUMN = 'label'

    def __init__(self, csv_files, model, model_name, model_params=None, random_state=42):
        """
        Initialize the ModelTrainer class.

        Parameters:
        csv_files (list of str): List of file paths for the feature datasets (CSV files).
        model (class): Machine learning model class (e.g., LogisticRegression, SVC).
        model_name (str): Name of the model for identification.
        model_params (dict): Keyword arguments passed to ``model`` when it is instantiated.
        random_state (int): Random seed used for the train/test split.
        """
        self.csv_files = csv_files
        self.model_name = model_name
        self.model_class = model
        # Copy so that several trainers built from the same config dict never share state.
        self.model_params = dict(model_params) if model_params else {}
        self.data = None
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.random_state = random_state
        self.metrics = {}

    def load_and_combine_data(self):
        """
        Load every CSV in ``csv_files`` and join them column-wise into ``self.data``.

        The label column is taken from the first file, removed from every file's
        features, and appended once as the last column of the combined frame.

        Raises:
        ValueError: If ``csv_files`` is empty, if a file has a different number of
            rows than the first one, or if a file's label column differs from the
            first file's labels.
        """
        if not self.csv_files:
            raise ValueError("csv_files must contain at least one CSV path.")

        label_col = self.LABEL_COLUMN
        data_frames = [pd.read_csv(file) for file in self.csv_files]

        # Labels of the first dataset are the reference for all the others.
        labels = data_frames[0][label_col]

        feature_frames = []
        for file, df in zip(self.csv_files, data_frames):
            # pd.concat(axis=1) would silently pad a shorter frame with NaN rows,
            # so row counts are verified per file before combining.
            if len(df) != len(labels):
                raise ValueError(
                    f"Mismatch between features and labels: "
                    f"{len(df)} rows in features ({file}), {len(labels)} in labels."
                )
            if label_col in df.columns:
                # Different labels mean the rows of the files are not aligned.
                if not (df[label_col].to_numpy() == labels.to_numpy()).all():
                    raise ValueError(
                        f"Labels in {file} do not match the labels in {self.csv_files[0]}."
                    )
                # Drop the label from *every* frame (including the first) so it can
                # never end up among the feature columns.
                df = df.drop(columns=[label_col])
            feature_frames.append(df)

        # Features first, label as the single last column.
        self.data = pd.concat(feature_frames + [labels], axis=1)

    def preprocess_data(self, test_size=0.2):
        """
        Split the data into training and testing sets.

        Parameters:
        test_size (float): Proportion of data to use for testing.

        Raises:
        ValueError: If no data has been loaded yet.
        """
        if self.data is None:
            raise ValueError("No data loaded; call load_and_combine_data() first.")

        # Select features by name rather than by position so the label column
        # cannot leak into X regardless of where it sits in the frame.
        X = self.data.drop(columns=[self.LABEL_COLUMN])
        y = self.data[self.LABEL_COLUMN]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )

    def train_model(self):
        """
        Instantiate the model class with ``model_params`` and fit it on the training data.

        Raises:
        ValueError: If the data has not been split yet.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("No training data; call preprocess_data() first.")

        self.model = self.model_class(**self.model_params)
        self.model.fit(self.X_train, self.y_train)
        print(f"Model {self.model_name} trained successfully.")

    def evaluate_model(self, cv=10):
        """
        Evaluate the trained model and store the results in ``self.metrics``.

        Computes accuracy and weighted F1/precision/recall on the test set, plus
        the mean and standard deviation of cross-validated accuracy on the
        training set. Nothing is printed; read ``self.metrics`` afterwards.

        Parameters:
        cv (int): Number of cross-validation folds (default 10).

        Raises:
        ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError(f"Model {self.model_name} has not been trained yet.")

        # Predictions and metrics for the test set
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred, average='weighted')
        precision = precision_score(self.y_test, y_pred, average='weighted')
        recall = recall_score(self.y_test, y_pred, average='weighted')

        # Cross-validation on the training split only. cross_val_score fits
        # clones of the estimator, so the already-fitted self.model is untouched.
        cv_scores = cross_val_score(self.model, self.X_train, self.y_train, cv=cv, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        # Store metrics
        self.metrics = {
            "Model": self.model_name,
            "Test Accuracy": accuracy,
            "F1 Score": f1,
            "Precision": precision,
            "Recall": recall,
            "CV Mean Accuracy": cv_mean,
            "CV Std Dev": cv_std
        }

    def run_pipeline(self):
        """
        Complete pipeline: load data, preprocess, train, and evaluate.
        """
        self.load_and_combine_data()
        self.preprocess_data()
        self.train_model()
        self.evaluate_model()

In [5]:
# Creating a function for compiling metrics of the models

# Empty per-model-family containers, plus a general `models` list.
svm_models, mlp_models, lr_models = [], [], []
rf_models, ada_models = [], []
models = []

def compile_metrics(models):
    """
    Collect the ``metrics`` dict of each given model into one DataFrame.

    Parameters:
    models (iterable): Objects exposing a ``metrics`` attribute (a dict).

    Returns:
    pandas.DataFrame: One row per model, one column per metric key.
    """
    rows = []
    for trained in models:
        rows.append(trained.metrics)
    return pd.DataFrame(rows)

# Adjustable hyperparameters and inputs

In [8]:
# Model configurations: one entry per classifier, giving its display name,
# its class, and the keyword arguments used to instantiate it.
models_config = [
    {"name": name, "model_class": model_class, "params": params}
    for name, model_class, params in (
        ("AdaBoost", AdaBoostClassifier, {'n_estimators': 50, 'random_state': 42}),
        ("SVM", SVC, {'C': 1.0, 'kernel': 'linear', 'random_state': 42}),
        ("Random Forest", RandomForestClassifier, {'n_estimators': 100, 'random_state': 42}),
        ("Logistic Regression", LogisticRegression, {'max_iter': 500, 'random_state': 42}),
        (
            "MLP",
            MLPClassifier,
            {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'random_state': 42},
        ),
    )
]

In [9]:
# Feature combinations: which CSV files are joined for a run, and the suffix
# appended to the model name to identify that run.
feature_combinations = [
    {"files": files, "name_suffix": name_suffix}
    for files, name_suffix in (
        ([empath_file], "(Empath)"),
        ([lda_file], "(LDA)"),
        ([unigram_file], "(unigram)"),
        ([bigram_file], "(bigram)"),
        ([empath_file, lda_file, unigram_file], "(EM + LDA + unigram)"),
        ([empath_file, lda_file, bigram_file], "(EM + LDA + bigram)"),
    )
]

# Training and Outputs

In [10]:
# Fit and score every (model, feature combination) pairing.
all_models = []              # every trainer, across all model types
model_specific_results = {}  # model name -> metrics table for that model type

for model_config in models_config:
    model_name, model_class, model_params = (
        model_config["name"],
        model_config["model_class"],
        model_config["params"],
    )

    # Trainers for this model type only, one per feature combination
    specific_models = []
    for feature_combo in feature_combinations:
        feature_files, name_suffix = feature_combo["files"], feature_combo["name_suffix"]
        full_model_name = f"{model_name} {name_suffix}"

        print(f"\nTraining {full_model_name}...")
        trainer = ModelTrainer(feature_files, model_class, full_model_name, model_params)
        trainer.run_pipeline()

        # Keep the trainer (and its metrics) in both collections
        specific_models.append(trainer)
        all_models.append(trainer)

    # Metrics table for this model type
    model_specific_results[model_name] = compile_metrics(specific_models)

# Metrics table covering every trained model
overall_metrics_table = compile_metrics(all_models)
print("\nOverall Metrics Table:", overall_metrics_table, sep="\n")


Training AdaBoost (Empath)...
Model AdaBoost (Empath) trained successfully.

Training AdaBoost (LDA)...
Model AdaBoost (LDA) trained successfully.

Training AdaBoost (unigram)...
Model AdaBoost (unigram) trained successfully.

Training AdaBoost (bigram)...
Model AdaBoost (bigram) trained successfully.

Training AdaBoost (EM + LDA + unigram)...
Model AdaBoost (EM + LDA + unigram) trained successfully.

Training AdaBoost (EM + LDA + bigram)...
Model AdaBoost (EM + LDA + bigram) trained successfully.

Training SVM (Empath)...
Model SVM (Empath) trained successfully.

Training SVM (LDA)...
Model SVM (LDA) trained successfully.

Training SVM (unigram)...
Model SVM (unigram) trained successfully.

Training SVM (bigram)...
Model SVM (bigram) trained successfully.

Training SVM (EM + LDA + unigram)...
Model SVM (EM + LDA + unigram) trained successfully.

Training SVM (EM + LDA + bigram)...
Model SVM (EM + LDA + bigram) trained successfully.

Training Random Forest (Empath)...
Model Random For

KeyboardInterrupt: 