In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import time

# import warnings
# warnings.filterwarnings('ignore')

# Load the original corpus and its cleaned counterpart. Later cells read the
# 'text' and 'label' columns of both frames.
original_corpus = pd.read_csv('data/mental_health.csv')
cleaned_corpus = pd.read_csv('data/cleaned_mhc.csv')

# Print each frame's shape as a quick check that both files were read.
print("Shape of Original Corpus", original_corpus.shape)
print("Shape of Cleaned Corpus", cleaned_corpus.shape)

Shape of Original Corpus (27977, 2)
Shape of Cleaned Corpus (23240, 2)


In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time
import pandas as pd

# Hyperparameter grid swept by run_logistic_regression.
# C is scikit-learn's inverse regularization strength (see LogisticRegression docs).
C_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Solver used for each penalty type.
solvers = {'l1': 'liblinear', 'l2': 'lbfgs'}

# Penalties to try, in sweep order: exactly the ones that have a solver assigned.
penalties = list(solvers)

def analyze_coefficients(model, vectorizer, dataset_name, top_n=20):
    """Print the most and least influential terms of a fitted linear model.

    Terms are ranked by the absolute value of their coefficient, so large
    negative coefficients rank as high as large positive ones. The printed
    value keeps its sign.

    Parameters
    ----------
    model : object exposing ``coef_``
        Only the first row, ``coef_[0]``, is read (the binary-classification
        layout).
    vectorizer : object exposing ``get_feature_names_out()``
        Supplies the term that corresponds to each coefficient position.
    dataset_name : str
        Label inserted into the report heading.
    top_n : int, default 20
        How many terms to list in each of the two tables. If fewer than
        ``top_n`` terms exist, every term is listed in both tables.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    feature_names = vectorizer.get_feature_names_out()
    coefficients = model.coef_[0]  # For binary classification

    # Rank (term, coefficient) pairs from largest to smallest magnitude.
    ranked = sorted(
        zip(feature_names, coefficients),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )

    # Use an explicit start index for the tail: ranked[-0:] would be the
    # whole list rather than an empty one.
    tail_start = max(len(ranked) - top_n, 0)
    sections = (
        ("Most", ranked[:top_n]),
        ("Least", ranked[tail_start:]),
    )

    header = "{:<20} {:<15}".format("Term", "Coefficient")
    rule = "-" * 35

    print(f"\nCoefficient Analysis for {dataset_name} Corpus:")
    for label, pairs in sections:
        print(f"\nTop {top_n} {label} Influential Terms:")
        print(header)
        print(rule)
        for term, coef in pairs:
            print("{:<20} {:<15.4f}".format(term, coef))

def run_logistic_regression(corpus, target, dataset_name):
    """Grid-search logistic regression on TF-IDF features and report the best run.

    The data is split 80/20 once (``random_state=42``). A ``TfidfVectorizer``
    capped at 3500 features is fitted on the training texts only and applied
    to the test texts. Every combination of the module-level ``penalties``
    and ``C_values`` is then trained, using the solver that ``solvers``
    assigns to the penalty, and scored with ``f1_score`` (default arguments)
    on the test split.

    Parameters
    ----------
    corpus : sequence of str
        Documents to classify (passed straight to ``train_test_split`` and
        the vectorizer).
    target : sequence
        Label for each document.
        NOTE(review): ``f1_score`` is called with its defaults, which
        presumes a binary target; confirm against the data.
    dataset_name : str
        Label inserted into the printed summary heading.

    Returns
    -------
    tuple
        ``(best_model, best_vectorizer)``: the fitted ``LogisticRegression``
        with the highest test F1 and the fitted vectorizer shared by all runs.

    Notes
    -----
    The winning configuration is selected by its F1 on the same held-out
    split that is reported, so the printed score is an optimistic estimate.
    Ties keep the configuration encountered first.
    """
    # Start below any attainable score. Starting from 0 with a strict ">"
    # would record nothing when every configuration scores exactly 0, and the
    # summary below would then fail on best_params being None.
    best_f1 = float("-inf")
    best_model = None
    best_vectorizer = None
    best_conf_matrix = None
    best_params = None
    total_fit_time = 0
    total_test_time = 0
    num_iterations = 0

    # Split data once so every configuration sees the same train/test rows
    X_train, X_test, y_train, y_test = train_test_split(
        corpus, target, test_size=0.2, random_state=42
    )

    # Vectorize data: vocabulary and IDF weights come from the training texts only
    vectorizer = TfidfVectorizer(max_features=3500)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    for penalty in penalties:
        for C in C_values:
            num_iterations += 1

            # Initialize model with the solver assigned to this penalty
            lr = LogisticRegression(
                C=C,
                penalty=penalty,
                solver=solvers[penalty],
                max_iter=1000,
                random_state=42
            )

            # Fit model and time it
            start_fit = time.perf_counter()
            lr.fit(X_train_tfidf, y_train)
            fit_time = time.perf_counter() - start_fit
            total_fit_time += fit_time

            # Test model and time it
            start_test = time.perf_counter()
            y_test_pred = lr.predict(X_test_tfidf)
            test_time = time.perf_counter() - start_test
            total_test_time += test_time

            # Calculate F1 score
            f1 = f1_score(y_test, y_test_pred)

            # Update best model info (strict ">" keeps the earliest of tied runs)
            if f1 > best_f1:
                best_f1 = f1
                best_model = lr
                best_vectorizer = vectorizer
                best_conf_matrix = confusion_matrix(y_test, y_test_pred)
                best_params = {"C": C, "penalty": penalty}

    # Calculate average times over all configurations tried
    avg_fit_time = total_fit_time / num_iterations
    avg_test_time = total_test_time / num_iterations

    print(f"\nResults for {dataset_name} Corpus:")
    print(f"Best F1 Score (C={best_params['C']}, penalty={best_params['penalty']}): {best_f1:.4f}")
    print(f"Average Fit Time: {avg_fit_time:.4f} s")
    print(f"Average Test Time: {avg_test_time:.4f} s")
    print("Confusion Matrix for Best Model:\n", best_conf_matrix)

    return best_model, best_vectorizer

# Grid-search each corpus variant, keeping the winning model together with
# the vectorizer it was trained with.
best_model_original, best_vectorizer_original = run_logistic_regression(
    corpus=original_corpus['text'],
    target=original_corpus['label'],
    dataset_name="Original",
)
best_model_cleaned, best_vectorizer_cleaned = run_logistic_regression(
    corpus=cleaned_corpus['text'],
    target=cleaned_corpus['label'],
    dataset_name="Cleaned",
)

# Print the coefficient report for each winning model.
analyze_coefficients(
    model=best_model_original,
    vectorizer=best_vectorizer_original,
    dataset_name="Original",
)
analyze_coefficients(
    model=best_model_cleaned,
    vectorizer=best_vectorizer_cleaned,
    dataset_name="Cleaned",
)


Results for Original Corpus:
Best F1 Score (C=1.0, penalty=l2): 0.9206
Average Fit Time: 0.0918 s
Average Test Time: 0.0005 s
Confusion Matrix for Best Model:
 [[2598  204]
 [ 237 2557]]

Results for Cleaned Corpus:
Best F1 Score (C=1.0, penalty=l2): 0.9295
Average Fit Time: 0.1501 s
Average Test Time: 0.0004 s
Confusion Matrix for Best Model:
 [[1921  178]
 [ 181 2368]]

Coefficient Analysis for Original Corpus:

Top 20 Most Influential Terms:
Term                 Coefficient    
-----------------------------------
redflag              7.9595         
kill                 7.4589         
suicidal             5.5458         
die                  5.5143         
br                   -5.2353        
life                 5.0054         
film                 -4.8867        
cannot               4.8758         
movie                -4.7338        
killing              4.2968         
depression           4.2608         
myself               4.1067         
end                  3.9631      