# Prerequisite Code

In [2]:
# Installing dependencies 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score


  from pandas.core import (


In [3]:
posts = []  
labels = [] 

folders = {
    "depression": {
        "path": "data/preprocessed_posts/depression",
        "label": 1  # Label for depression-related posts
    },
    "standard": {
        "path": "data/preprocessed_posts/standard",
        "label": 0  # Label for standard posts
    },
    "breastcancer": {
        "path": "data/preprocessed_posts/breastcancer",
        "label": 2  # Label for breast cancer posts
    }
}

for category, data in folders.items():
    folder_path = data["path"]
    label = data["label"]
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()  # Read the file content
                posts.append(content)  # Add to posts list
                labels.append(label)  # Add corresponding label


In [4]:
# Filter data for each task
bc_vs_dep_posts = [post for i, post in enumerate(posts) if labels[i] in [1, 2]]
bc_vs_dep_labels = [label for label in labels if label in [1, 2]]

std_vs_dep_posts = [post for i, post in enumerate(posts) if labels[i] in [0, 1]]
std_vs_dep_labels = [label for label in labels if label in [0, 1]]


# Loading the feature extraction data
empath_file = "data/feature_extracted_data/empath_features_with_labels.csv"
lda_file = "data/feature_extracted_data/lda_topic_distributions_with_labels.csv"
unigram_file = "data/feature_extracted_data/unigram_features_with_labels.csv"
bigram_file = "data/feature_extracted_data/bigram_features_with_labels.csv"


In [5]:
# Creating a ModelTrainer class
class ModelTrainer:
    def __init__(self, csv_files, model, model_name, model_params=None, random_state=42):
        self.csv_files = csv_files
        self.model_name = model_name
        self.model_class = model
        self.model_params = model_params if model_params else {}
        self.data = None
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.random_state = random_state
        self.metrics = {}

    def load_and_combine_data(self):
        data_frames = [pd.read_csv(file) for file in self.csv_files]

        # Ensure labels are consistent and take from the first dataset
        labels = data_frames[0]['label']
        for df in data_frames[1:]:
            if 'label' in df.columns:
                df.drop(columns=['label'], inplace=True)

        combined_data = pd.concat(data_frames, axis=1)

        # Check alignment between features and labels
        if len(labels) != len(combined_data):
            raise ValueError(
                f"Mismatch between features and labels: "
                f"{len(combined_data)} rows in features, {len(labels)} in labels."
            )

        # Add the label column
        self.data = combined_data
        self.data['label'] = labels

    def preprocess_data(self, test_size=0.2):
        X = self.data.iloc[:, :-1]  # All columns except the label column
        y = self.data['label']  # Label column

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state
        )

    def train_model(self):
        self.model = self.model_class(**self.model_params)
        self.model.fit(self.X_train, self.y_train)
        print(f"Model {self.model_name} trained successfully.")

    def evaluate_model(self):
        if self.model is None:
            raise ValueError(f"Model {self.model_name} has not been trained yet.")

        # Predictions and metrics for the test set
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred, average='weighted')
        precision = precision_score(self.y_test, y_pred, average='weighted')
        recall = recall_score(self.y_test, y_pred, average='weighted')

        # Perform 10-fold cross-validation
        cv_scores = cross_val_score(self.model, self.X_train, self.y_train, cv=10, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        # Store metrics
        self.metrics = {
            "Model": self.model_name,
            "Test Accuracy": accuracy,
            "F1 Score": f1,
            "Precision": precision,
            "Recall": recall,
            "CV Mean Accuracy": cv_mean,
            "CV Std Dev": cv_std
        }

    def run_pipeline(self):
        self.load_and_combine_data()
        self.preprocess_data()
        self.train_model()
        self.evaluate_model()

In [7]:
# Define feature combinations
feature_combinations = [
    {"files": [empath_file], "name_suffix": "(Empath)"},
    {"files": [lda_file], "name_suffix": "(LDA)"},
    {"files": [unigram_file], "name_suffix": "(Unigram)"},
    {"files": [bigram_file], "name_suffix": "(Bigram)"},
    {"files": [empath_file, lda_file, unigram_file], "name_suffix": "(Empath + LDA + Unigram)"},
    {"files": [empath_file, lda_file, bigram_file], "name_suffix": "(Empath + LDA + Bigram)"},
]


# Adjustable hyperparameters and inputs

In [8]:
# Model configurations
models_config = [
    {
        "name": "AdaBoost",
        "model_class": AdaBoostClassifier,
        "params": {'n_estimators': 50, 'random_state': 42}
    },
    {
        "name": "SVM",
        "model_class": SVC,
        "params": {'C': 1.0, 'kernel': 'linear', 'random_state': 42}
    },
    {
        "name": "Random Forest",
        "model_class": RandomForestClassifier,
        "params": {'n_estimators': 100, 'random_state': 42}
    },
    {
        "name": "Logistic Regression",
        "model_class": LogisticRegression,
        "params": {'max_iter': 500, 'random_state': 42}
    },
    {
        "name": "MLP",
        "model_class": MLPClassifier,
        "params": {'hidden_layer_sizes': (100,), 'activation': 'relu', 'solver': 'adam', 'random_state': 42}
    }
]

# Training and Outputs

In [13]:
# Task 1: Breast Cancer vs Depression
print("\nTraining models for Breast Cancer vs Depression...")
bc_vs_dep_models = []
for model_config in models_config:
    model_name = model_config["name"]
    model_class = model_config["model_class"]
    model_params = model_config["params"]

    for feature_combo in feature_combinations:
        feature_files = feature_combo["files"]
        name_suffix = feature_combo["name_suffix"]
        full_model_name = f"{model_name} {name_suffix} (Breast Cancer vs Depression)"
        
        print(f"\nTraining {full_model_name}...")
        trainer = ModelTrainer(feature_files, model_class, full_model_name, model_params)
        trainer.run_pipeline()
        bc_vs_dep_models.append(trainer)


Training models for Breast Cancer vs Depression...

Training AdaBoost (Empath) (Breast Cancer vs Depression)...
Model AdaBoost (Empath) (Breast Cancer vs Depression) trained successfully.

Training AdaBoost (LDA) (Breast Cancer vs Depression)...
Model AdaBoost (LDA) (Breast Cancer vs Depression) trained successfully.

Training AdaBoost (Unigram) (Breast Cancer vs Depression)...
Model AdaBoost (Unigram) (Breast Cancer vs Depression) trained successfully.

Training AdaBoost (Bigram) (Breast Cancer vs Depression)...
Model AdaBoost (Bigram) (Breast Cancer vs Depression) trained successfully.

Training AdaBoost (Empath + LDA + Unigram) (Breast Cancer vs Depression)...
Model AdaBoost (Empath + LDA + Unigram) (Breast Cancer vs Depression) trained successfully.

Training AdaBoost (Empath + LDA + Bigram) (Breast Cancer vs Depression)...
Model AdaBoost (Empath + LDA + Bigram) (Breast Cancer vs Depression) trained successfully.

Training SVM (Empath) (Breast Cancer vs Depression)...
Model SVM (E



Model MLP (Empath) (Breast Cancer vs Depression) trained successfully.





Training MLP (LDA) (Breast Cancer vs Depression)...




Model MLP (LDA) (Breast Cancer vs Depression) trained successfully.





Training MLP (Unigram) (Breast Cancer vs Depression)...
Model MLP (Unigram) (Breast Cancer vs Depression) trained successfully.

Training MLP (Bigram) (Breast Cancer vs Depression)...
Model MLP (Bigram) (Breast Cancer vs Depression) trained successfully.

Training MLP (Empath + LDA + Unigram) (Breast Cancer vs Depression)...
Model MLP (Empath + LDA + Unigram) (Breast Cancer vs Depression) trained successfully.

Training MLP (Empath + LDA + Bigram) (Breast Cancer vs Depression)...
Model MLP (Empath + LDA + Bigram) (Breast Cancer vs Depression) trained successfully.




In [15]:
# Task 2: Standard vs Depression
print("\nTraining models for Standard vs Depression...")
std_vs_dep_models = []
for model_config in models_config:
    model_name = model_config["name"]
    model_class = model_config["model_class"]
    model_params = model_config["params"]

    for feature_combo in feature_combinations:
        feature_files = feature_combo["files"]
        name_suffix = feature_combo["name_suffix"]
        full_model_name = f"{model_name} {name_suffix} (Standard vs Depression)"
        
        print(f"\nTraining {full_model_name}...")
        trainer = ModelTrainer(feature_files, model_class, full_model_name, model_params)
        trainer.run_pipeline()
        std_vs_dep_models.append(trainer)


Training models for Standard vs Depression...

Training AdaBoost (Empath) (Standard vs Depression)...
Model AdaBoost (Empath) (Standard vs Depression) trained successfully.

Training AdaBoost (LDA) (Standard vs Depression)...
Model AdaBoost (LDA) (Standard vs Depression) trained successfully.

Training AdaBoost (Unigram) (Standard vs Depression)...
Model AdaBoost (Unigram) (Standard vs Depression) trained successfully.

Training AdaBoost (Bigram) (Standard vs Depression)...
Model AdaBoost (Bigram) (Standard vs Depression) trained successfully.

Training AdaBoost (Empath + LDA + Unigram) (Standard vs Depression)...
Model AdaBoost (Empath + LDA + Unigram) (Standard vs Depression) trained successfully.

Training AdaBoost (Empath + LDA + Bigram) (Standard vs Depression)...
Model AdaBoost (Empath + LDA + Bigram) (Standard vs Depression) trained successfully.

Training SVM (Empath) (Standard vs Depression)...
Model SVM (Empath) (Standard vs Depression) trained successfully.

Training SVM (L

MemoryError: Unable to allocate 79.3 MiB for an array with shape (68385, 152) and data type float64

In [11]:
def compile_metrics(models):
    metrics_data = [model.metrics for model in models]
    metrics_df = pd.DataFrame(metrics_data)
    return metrics_df

# Compile overall metrics
overall_metrics_bc_dep = compile_metrics(bc_vs_dep_models)
overall_metrics_std_dep = compile_metrics(std_vs_dep_models)

print("\nMetrics for Breast Cancer vs Depression:")
print(overall_metrics_bc_dep)

print("\nMetrics for Standard vs Depression:")
print(overall_metrics_std_dep)

NameError: name 'std_vs_dep_models' is not defined