In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import time

# --- Data ---------------------------------------------------------------
# Read the cleaned corpus; only its 'text' and 'label' columns are used here.
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'], corpus['label'], random_state=42, test_size=0.2
)

# --- Features -----------------------------------------------------------
# The vectorizer (capped at 3500 features) is fitted on the training split
# only; the test split is transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# --- Model --------------------------------------------------------------
# L2-penalised logistic regression, C=5, liblinear solver.
logreg = LogisticRegression(
    C=5,
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

# Wall-clock seconds spent fitting.
start_fit = time.time()
logreg.fit(X_train_tfidf, y_train)
fit_time = time.time() - start_fit

# Wall-clock seconds spent predicting. Note that this interval covers the
# predictions for BOTH the training and the test split.
start_test = time.time()
y_train_pred = logreg.predict(X_train_tfidf)
y_test_pred = logreg.predict(X_test_tfidf)
test_time = time.time() - start_test

# --- Metrics ------------------------------------------------------------
# Every value is a one-element list so the dict can be turned into a
# single-row DataFrame. F1 / precision / recall / confusion matrix are all
# computed on the test split.
metrics = {'Model': ['TF-IDF + Logistic Regression (L2, C=5)']}
metrics['Train Accuracy'] = [accuracy_score(y_train, y_train_pred)]
metrics['Test Accuracy'] = [accuracy_score(y_test, y_test_pred)]
metrics['F1 Score'] = [f1_score(y_test, y_test_pred)]
metrics['Precision'] = [precision_score(y_test, y_test_pred)]
metrics['Recall'] = [recall_score(y_test, y_test_pred)]
metrics['Fit Time (s)'] = [fit_time]
metrics['Test Time (s)'] = [test_time]
metrics['Confusion Matrix'] = [confusion_matrix(y_test, y_test_pred)]

# Predicted probability for the second class in `logreg.classes_` (label 1,
# assuming 0/1 labels), taken from the TF-IDF model fitted in this cell.
#
# FIX: this used to call `logreg_lsa.predict_proba(X_train_lsa / X_test_lsa)`.
# Those names belong to the LSA model and are only created in a later cell, so
# on a fresh kernel this cell raised NameError, and with leftover kernel state
# it silently stored the LSA model's scores next to the TF-IDF features.
y_train_score = logreg.predict_proba(X_train_tfidf)[:, 1]
y_test_score = logreg.predict_proba(X_test_tfidf)[:, 1]

# Create the regression dataset with token names as feature names
token_features = tfidf.get_feature_names_out()

# For training set: one dense column per TF-IDF token plus the model score.
# `.toarray()` materialises the full dense matrix (rows x number of tokens).
X_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=token_features)
X_train_df['Problematic_Rate'] = y_train_score

# For test set: same layout as the training frame.
X_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=token_features)
X_test_df['Problematic_Rate'] = y_test_score

# Combine train and test data (training rows first, then test rows)
regression_df = pd.concat([X_train_df, X_test_df], axis=0)

# Save to CSV; the row index is not written.
regression_df.to_csv('tfidf_regression_dataset_logreg.csv', index=False)

# Show the single-row metrics table built from the `metrics` dict.
results_df = pd.DataFrame(metrics)
print(results_df)

# Then print the confusion matrix on its own, under a header line, so it is
# shown as a 2-D array rather than squeezed into a table cell.
print(
    "\nConfusion Matrices:",
    f"TF-IDF + Logistic Regression Confusion Matrix:\n{metrics['Confusion Matrix'][0]}",
    sep="\n",
)

                                    Model  Train Accuracy  Test Accuracy  \
0  TF-IDF + Logistic Regression (L2, C=5)         0.95611        0.92642   

   F1 Score  Precision    Recall  Fit Time (s)  Test Time (s)  \
0  0.932757   0.934963  0.930561      0.099616       0.001579   

             Confusion Matrix  
0  [[1934, 165], [177, 2372]]  

Confusion Matrices:
TF-IDF + Logistic Regression Confusion Matrix:
[[1934  165]
 [ 177 2372]]


In [17]:
from sklearn.decomposition import TruncatedSVD

# --- Data ---------------------------------------------------------------
# Read the cleaned corpus; only its 'text' and 'label' columns are used here.
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'], corpus['label'], random_state=42, test_size=0.2
)

# --- Features -----------------------------------------------------------
# The vectorizer (capped at 3500 features) is fitted on the training split
# only; the test split is transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# LSA: reduce the TF-IDF matrix to `n_components` dimensions with
# TruncatedSVD. The decomposition is fitted on the training split and then
# applied unchanged to the test split.
n_components = 100
lsa = TruncatedSVD(random_state=42, n_components=n_components)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

# --- Model --------------------------------------------------------------
# L2-penalised logistic regression, C=5, liblinear solver.
logreg_lsa = LogisticRegression(
    C=5,
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)

# Wall-clock seconds spent fitting (the classifier only; TF-IDF and SVD
# fitting happen before the timer starts).
start_fit = time.time()
logreg_lsa.fit(X_train_lsa, y_train)
fit_time = time.time() - start_fit

# Wall-clock seconds spent predicting. Note that this interval covers the
# predictions for BOTH the training and the test split.
start_test = time.time()
y_train_pred = logreg_lsa.predict(X_train_lsa)
y_test_pred = logreg_lsa.predict(X_test_lsa)
test_time = time.time() - start_test

# --- Metrics ------------------------------------------------------------
# Every value is a one-element list so the dict can be turned into a
# single-row DataFrame. F1 / precision / recall / confusion matrix are all
# computed on the test split.
metrics = {'Model': ['LSA + Logistic Regression (L2, C=5)']}
metrics['Train Accuracy'] = [accuracy_score(y_train, y_train_pred)]
metrics['Test Accuracy'] = [accuracy_score(y_test, y_test_pred)]
metrics['F1 Score'] = [f1_score(y_test, y_test_pred)]
metrics['Precision'] = [precision_score(y_test, y_test_pred)]
metrics['Recall'] = [recall_score(y_test, y_test_pred)]
metrics['Fit Time (s)'] = [fit_time]
metrics['Test Time (s)'] = [test_time]
metrics['Confusion Matrix'] = [confusion_matrix(y_test, y_test_pred)]

# Predicted probability for the second class in `logreg_lsa.classes_`
# (label 1, assuming 0/1 labels) on each split.
y_train_score = logreg_lsa.predict_proba(X_train_lsa)[:, 1]
y_test_score = logreg_lsa.predict_proba(X_test_lsa)[:, 1]

# Column names for the LSA components: Topic001, Topic002, ..., zero-padded
# to three digits.
topic_features = [f'Topic{i:03d}' for i in range(1, n_components + 1)]

# One frame per split: the LSA components followed by the model score.
X_train_df = pd.DataFrame(X_train_lsa, columns=topic_features).assign(
    Problematic_Rate=y_train_score
)
X_test_df = pd.DataFrame(X_test_lsa, columns=topic_features).assign(
    Problematic_Rate=y_test_score
)

# Stack the two frames row-wise (training rows first, then test rows) and
# write the result to disk without the row index.
regression_df = pd.concat([X_train_df, X_test_df])
regression_df.to_csv('lsa_regression_dataset_logreg.csv', index=False)
print("Regression dataset for LSA with Logistic Regression created")

# Show the single-row metrics table built from the `metrics` dict.
results_df = pd.DataFrame(metrics)
print(results_df)

# Then print the confusion matrix on its own, under a header line, so it is
# shown as a 2-D array rather than squeezed into a table cell.
print(
    "\nConfusion Matrices:",
    f"LSA + Logistic Regression Confusion Matrix:\n{metrics['Confusion Matrix'][0]}",
    sep="\n",
)

Regression dataset for LSA with Logistic Regression created
                                 Model  Train Accuracy  Test Accuracy  \
0  LSA + Logistic Regression (L2, C=5)        0.912005       0.909423   

   F1 Score  Precision    Recall  Fit Time (s)  Test Time (s)  \
0   0.91711   0.920553  0.913692      0.183622       0.002001   

             Confusion Matrix  
0  [[1898, 201], [220, 2329]]  

Confusion Matrices:
LSA + Logistic Regression Confusion Matrix:
[[1898  201]
 [ 220 2329]]
