In [22]:
# Importing dependencies
import os

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [12]:
# Collect every preprocessed post into `posts`, with its class label at the
# same position in `labels`.

# One entry per corpus: where its files live and which label they receive
folders = {
    "depression": {
        "path": "data/preprocessed/preprocessed_depression_posts",
        "label": 1  # Label for depression-related posts
    },
    "breastcancer": {
        "path": "data/preprocessed/preprocessed_breastcancer_posts",
        "label": 0  # Label for breast cancer posts
    }
}

posts = []
labels = []

for corpus in folders.values():
    corpus_dir = corpus["path"]
    for entry_name in os.listdir(corpus_dir):
        entry_path = os.path.join(corpus_dir, entry_name)
        # Skip sub-directories and anything else that is not a regular file
        if not os.path.isfile(entry_path):
            continue
        with open(entry_path, 'r', encoding='utf-8') as handle:
            posts.append(handle.read())  # Whole file content is one post
        labels.append(corpus["label"])  # Keep labels aligned with posts



## TODO: SVM (x6)

### SVM on unigram data

In [None]:
# SVM on unigram data

# Read the unigram feature table from disk
unigram_data_file = "data/feature_extracted_data/unigram_features_with_labels.csv"  # Update path if needed
unigram_df = pd.read_csv(unigram_data_file)

# Feature matrix = every column but the last; target vector = the last column
X = unigram_df.iloc[:, :-1].values
y = unigram_df.iloc[:, -1].values

# Hold out 30% of the rows for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a linear-kernel SVM (linear kernel chosen for interpretability)
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = svm_model.predict(X_test)

# Report accuracy and the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

In [None]:
# Visualisation of the unigram feature distribution using t-SNE

# Project the held-out feature rows down to two dimensions
tsne = TSNE(n_components=2, random_state=42)
X_test_tsne = tsne.fit_transform(X_test)

# Scatter the 2-D embedding, coloured by the true label
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    X_test_tsne[:, 0],
    X_test_tsne[:, 1],
    c=y_test,
    cmap='viridis',
    s=15,
    alpha=0.8,
)
ax.set_title("t-SNE Visualization of Unigram Features (Test Data)")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")

# One legend handle per distinct colour value in the scatter
class_handles, _ = scatter.legend_elements()
ax.legend(handles=class_handles, labels=["Non-Depressed", "Depressed"], title="Labels")
plt.show()

In [None]:
# Seeing which words are most important for the SVM model

# One weight per feature from the fitted linear SVM, plus the matching column names
coefficients = svm_model.coef_.flatten()
unigram_features = unigram_df.columns[:-1]  # every column except the last (label) column

# Pair each unigram with its weight
coef_df = pd.DataFrame({"Unigram": unigram_features, "Coefficient": coefficients})

# Order rows by weight magnitude, largest first
importance_order = coef_df["Coefficient"].abs().sort_values(ascending=False).index
coef_df_sorted = coef_df.reindex(importance_order)

# Positive weights push towards label 1 (depression), negative towards label 0
is_positive = coef_df_sorted["Coefficient"] > 0
is_negative = coef_df_sorted["Coefficient"] < 0

print("Top Unigrams Associated with Depression:")
print(coef_df_sorted[is_positive].head(10))

print("\nTop Unigrams Associated with Non-Depression:")
print(coef_df_sorted[is_negative].head(10))

### SVM on bigrams

In [None]:
# SVM on bigram data

# Read the bigram feature table from disk
bigram_data_file = "data/feature_extracted_data/bigram_features_with_labels.csv"  # Adjust path if necessary
bigram_df = pd.read_csv(bigram_data_file)

# Feature matrix = every column but the last; target = the 'label' column
X = bigram_df.iloc[:, :-1].values
y = bigram_df['label'].values

# Hold out 20% of the rows for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear-kernel SVM on the training rows
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = svm_model.predict(X_test)

# Report accuracy and the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

In [None]:
# Visualisation of the bigram feature distribution using t-SNE

# Project the held-out feature rows down to two dimensions
tsne = TSNE(n_components=2, random_state=42)
X_test_tsne = tsne.fit_transform(X_test)

# Scatter the 2-D embedding, coloured by the true label
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    X_test_tsne[:, 0],
    X_test_tsne[:, 1],
    c=y_test,
    cmap='viridis',
    s=15,
    alpha=0.8,
)
ax.set_title("t-SNE Visualization of Bigram Features (Test Data)")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")

# One legend handle per distinct colour value in the scatter
class_handles, _ = scatter.legend_elements()
ax.legend(handles=class_handles, labels=["Non-Depressed", "Depressed"], title="Labels")
plt.show()

In [None]:
# Seeing which bigrams are most important for the SVM model

# One weight per feature from the fitted linear SVM, plus the matching column names
coefficients = svm_model.coef_.flatten()  # Coefficients for each feature
bigram_features = bigram_df.columns[:-1]  # Feature names (all except the last column)

# Pair each bigram with its coefficient. The column is named "Bigram" so the
# printed tables are not mislabelled as unigrams.
coef_df = pd.DataFrame({
    "Bigram": bigram_features,
    "Coefficient": coefficients
})

# Sort by coefficient magnitude (absolute values indicate importance)
coef_df_sorted = coef_df.reindex(coef_df["Coefficient"].abs().sort_values(ascending=False).index)

# Print top bigrams for depression (positive coefficients) and non-depression (negative coefficients)
print("Top Bigrams Associated with Depression:")
print(coef_df_sorted[coef_df_sorted["Coefficient"] > 0].head(10))

print("\nTop Bigrams Associated with Non-Depression:")
print(coef_df_sorted[coef_df_sorted["Coefficient"] < 0].head(10))

## TODO: MLP (x6) 

## Logistic Regression

In [23]:
# Logistic regression model class (the class name is kept so existing cells keep working)
class LinearRegressionModel:
    """
    Train and evaluate a logistic-regression classifier on one or more feature CSVs.

    Despite the class name, the estimator fitted is
    ``sklearn.linear_model.LogisticRegression``.

    Typical use is ``LinearRegressionModel(files, name).run_pipeline()``; the
    individual steps can also be called one at a time in the order
    load -> preprocess -> train -> evaluate.
    """

    def __init__(self, csv_files, model_name, random_state=42):
        """
        Initialize the LinearRegressionModel class.

        Parameters:
        csv_files (list of str): List of file paths for the feature datasets (CSV files).
        model_name (str): Name of the model for identification.
        random_state (int): Seed used for the train/test split.
        """
        self.csv_files = csv_files
        self.model_name = model_name
        self.data = None
        self.model = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.random_state = random_state

    def load_and_combine_data(self):
        """
        Load and combine data from multiple CSV files into a single dataset.

        The labels are taken from the 'label' column of the first CSV. Every
        other CSV that carries a 'label' column must agree with it row by row.
        The combined frame stored in ``self.data`` holds all feature columns
        followed by a single 'label' column in the last position.

        Raises:
        ValueError: If no CSV files were given, if the files differ in row
            count, or if their 'label' columns disagree.
        """
        print(f"Loading data for {self.model_name}...")
        if not self.csv_files:
            raise ValueError("csv_files must contain at least one CSV path.")
        data_frames = [pd.read_csv(file) for file in self.csv_files]

        # Labels come from the first dataset
        labels = data_frames[0]['label'].reset_index(drop=True)

        feature_frames = []
        for file, df in zip(self.csv_files, data_frames):
            # Rows are paired by position, so every file must have the same length
            if len(df) != len(labels):
                raise ValueError(
                    f"Mismatch between features and labels: "
                    f"{len(df)} rows in features, {len(labels)} in labels."
                )
            if 'label' in df.columns:
                # Ensure labels are consistent across the datasets
                if not (df['label'].to_numpy() == labels.to_numpy()).all():
                    raise ValueError(
                        f"Labels in {file} do not match the labels in {self.csv_files[0]}."
                    )
                # Drop 'label' from every frame (including the first) so it
                # cannot end up among the feature columns after concatenation.
                df = df.drop(columns=['label'])
            feature_frames.append(df.reset_index(drop=True))

        # Features first, then the label as the final column
        self.data = pd.concat(feature_frames + [labels.rename('label')], axis=1)
        print(f"Loaded data shape: {self.data.shape}")

    def preprocess_data(self, test_size=0.2, random_state=None):
        """
        Split the data into training and testing sets.

        Parameters:
        test_size (float): Proportion of data to use for testing.
        random_state (int or None): Seed for reproducibility. When None
            (the default) the seed given to the constructor is used.

        Raises:
        ValueError: If the data has not been loaded yet.
        """
        if self.data is None:
            raise ValueError(f"Data for {self.model_name} has not been loaded yet.")
        if random_state is None:
            random_state = self.random_state

        print("Splitting data into train and test sets...")
        # Select by name rather than by position so the label can never leak into X
        X = self.data.drop(columns=['label'])
        y = self.data['label']

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        print(f"Training set size: {self.X_train.shape}, Test set size: {self.X_test.shape}")

    def train_model(self):
        """
        Train a Logistic Regression model on the training data.

        Raises:
        ValueError: If the data has not been split yet.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError(f"Data for {self.model_name} has not been split yet.")

        print(f"Training {self.model_name}...")
        self.model = LogisticRegression(max_iter=500)
        self.model.fit(self.X_train, self.y_train)
        print(f"Model {self.model_name} trained successfully.")

    def evaluate_model(self):
        """
        Evaluate the trained model on the test data and print metrics.

        Raises:
        ValueError: If the model has not been trained yet.
        """
        if self.model is None:
            raise ValueError(f"Model {self.model_name} has not been trained yet.")

        print(f"Evaluating {self.model_name}...")
        y_pred = self.model.predict(self.X_test)
        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred))
        print(f"Accuracy: {accuracy_score(self.y_test, y_pred)}")

    def run_pipeline(self):
        """
        Complete pipeline: load data, preprocess, train, and evaluate.
        """
        self.load_and_combine_data()
        self.preprocess_data()
        self.train_model()
        self.evaluate_model()

In [24]:
# Paths to the feature-extraction CSVs used by the model cells below
# (nothing is read from disk in this cell)
empath_file = "data/feature_extracted_data/empath_features_with_labels.csv"
lda_file = "data/feature_extracted_data/lda_topic_distributions_with_labels.csv"
unigram_file = "data/feature_extracted_data/unigram_features_with_labels.csv"
bigram_file = "data/feature_extracted_data/bigram_features_with_labels.csv"

In [None]:
# LR (Empath): logistic regression on the Empath features only
lr_em = LinearRegressionModel(csv_files=[empath_file], model_name="LR (Em)")
lr_em.run_pipeline()

In [None]:
# LR (LDA): logistic regression on the LDA topic distributions only
lr_lda = LinearRegressionModel(csv_files=[lda_file], model_name="LR (LDA)")
lr_lda.run_pipeline()


In [None]:
# LR (unigram): logistic regression on the unigram features only
lr_unigram = LinearRegressionModel(csv_files=[unigram_file], model_name="LR (Unigram)")
lr_unigram.run_pipeline()


In [None]:
# LR (bigram): logistic regression on the bigram features only
lr_bigram = LinearRegressionModel(csv_files=[bigram_file], model_name="LR (Bigram)")
lr_bigram.run_pipeline()

In [None]:
# LR (EM + LDA + unigram): logistic regression on the three feature sets combined
lr_em_lda_unigram = LinearRegressionModel(
    csv_files=[empath_file, lda_file, unigram_file],
    model_name="LR (EM + LDA + Unigram)",
)
lr_em_lda_unigram.run_pipeline()

In [None]:
# LR (EM + LDA + bigram): logistic regression on the three feature sets combined
lr_em_lda_bigram = LinearRegressionModel(
    csv_files=[empath_file, lda_file, bigram_file],
    model_name="LR (EM + LDA + Bigram)",
)
lr_em_lda_bigram.run_pipeline()

## TODO: RF (x6)

In [None]:
# Random Forest on unigram data

# Build the unigram split inside this cell. The generic X_train / X_test names
# are reassigned by the bigram SVM cell above, so reusing them here would
# silently train and evaluate on bigram features.
X_unigram = unigram_df.iloc[:, :-1].values  # All columns except the last one are features
y_unigram = unigram_df.iloc[:, -1].values  # The last column is the label
X_train_unigram, X_test_unigram, y_train_unigram, y_test_unigram = train_test_split(
    X_unigram, y_unigram, test_size=0.3, random_state=42  # Same settings as the unigram SVM split
)

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_unigram, y_train_unigram)

# Make predictions using the RF model
y_pred_rf = rf_model.predict(X_test_unigram)

# Evaluate the RF model
print("Random Forest Accuracy (Unigram):", accuracy_score(y_test_unigram, y_pred_rf))
print("\nRandom Forest Classification Report (Unigram):\n", classification_report(y_test_unigram, y_pred_rf))


In [None]:
# Random Forest on bigram data

# Build the bigram split inside this cell: the *_bigram names used below are
# not defined anywhere earlier in the notebook.
X_bigram = bigram_df.iloc[:, :-1].values  # All columns except the last one (features)
y_bigram = bigram_df['label'].values  # Label column
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(
    X_bigram, y_bigram, test_size=0.2, random_state=42  # Same settings as the bigram SVM split
)

# Train a Random Forest model on bigram data
rf_model_bigram = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model_bigram.fit(X_train_bigram, y_train_bigram)

# Make predictions using the RF model on bigram data
y_pred_rf_bigram = rf_model_bigram.predict(X_test_bigram)

# Evaluate the RF model on bigram data
print("Random Forest Accuracy (Bigram):", accuracy_score(y_test_bigram, y_pred_rf_bigram))
print("\nRandom Forest Classification Report (Bigram):\n", classification_report(y_test_bigram, y_pred_rf_bigram))


## TODO: ADA (x6)

In [None]:
# AdaBoost on unigram data

# Build the unigram split inside this cell. The generic X_train / X_test names
# are reassigned by the bigram SVM cell above, so reusing them here would
# silently train and evaluate on bigram features.
X_unigram = unigram_df.iloc[:, :-1].values  # All columns except the last one are features
y_unigram = unigram_df.iloc[:, -1].values  # The last column is the label
X_train_unigram, X_test_unigram, y_train_unigram, y_test_unigram = train_test_split(
    X_unigram, y_unigram, test_size=0.3, random_state=42  # Same settings as the unigram SVM split
)

# Train an AdaBoost model
ada_model = AdaBoostClassifier(random_state=42, n_estimators=50)
ada_model.fit(X_train_unigram, y_train_unigram)

# Make predictions using the ADA model
y_pred_ada = ada_model.predict(X_test_unigram)

# Evaluate the ADA model
print("AdaBoost Accuracy (Unigram):", accuracy_score(y_test_unigram, y_pred_ada))
print("\nAdaBoost Classification Report (Unigram):\n", classification_report(y_test_unigram, y_pred_ada))


In [None]:
# AdaBoost on bigram data

# Build the bigram split inside this cell: the *_bigram names used below are
# not defined anywhere earlier in the notebook.
X_bigram = bigram_df.iloc[:, :-1].values  # All columns except the last one (features)
y_bigram = bigram_df['label'].values  # Label column
X_train_bigram, X_test_bigram, y_train_bigram, y_test_bigram = train_test_split(
    X_bigram, y_bigram, test_size=0.2, random_state=42  # Same settings as the bigram SVM split
)

# Train an AdaBoost model on bigram data
ada_model_bigram = AdaBoostClassifier(random_state=42, n_estimators=50)
ada_model_bigram.fit(X_train_bigram, y_train_bigram)

# Make predictions using the ADA model on bigram data
y_pred_ada_bigram = ada_model_bigram.predict(X_test_bigram)

# Evaluate the ADA model on bigram data
print("AdaBoost Accuracy (Bigram):", accuracy_score(y_test_bigram, y_pred_ada_bigram))
print("\nAdaBoost Classification Report (Bigram):\n", classification_report(y_test_bigram, y_pred_ada_bigram))
