In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier


In [None]:
# Location of the combined, labelled dataset (relative to the working directory).
data_path = 'data/processed/combined_labeled_standardized.csv'

# Read the CSV and discard every column that contains at least one missing
# value (dropna defaults to how='any'), so only fully populated columns remain.
df = pd.read_csv(data_path).dropna(axis='columns')

In [None]:
# Feature matrix: every column except the patient identifier and the label.
X = df.drop(['patient_id', 'healthy'], axis=1)

# Target vector: the 'healthy' label column.
y = df['healthy']

# No split is performed in this cell. The train/validation/test partition is
# created once, in the modelling cell below, which is the only place that
# defines X_train_full / X_val / X_test. Splitting here as well produced
# values that were immediately overwritten and were never used.




In [None]:
# Step 1: Split into train (60%), validation (20%) and test (20%).
# First hold out 20% of all rows as the test set ...
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# ... then take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42
)

# Sanity check: the three partitions must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split sizes do not add up"

# A bare `X_train` expression in the middle of a cell is never displayed, so
# report the partition shapes explicitly instead.
print(f"train: {X_train.shape}, validation: {X_val.shape}, test: {X_test.shape}")

components = 20    # number of principal components kept by every pipeline
RANDOM_STATE = 42  # seed for PCA and all stochastic classifiers, so that a
                   # fresh "Restart & Run All" reproduces the same scores


def build_pipeline(classifier, n_components=components):
    """Return a StandardScaler -> PCA -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier placed in the final 'classifier' step.
    n_components : int, optional
        Number of principal components kept by the 'pca' step. Defaults to the
        value of `components` at the time this function is defined.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps named 'scaler', 'pca' and 'classifier'.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components, random_state=RANDOM_STATE)),
        ('classifier', classifier),
    ])


# One pipeline per candidate classifier; all share identical preprocessing so
# that differences in score come from the classifier alone.
models = {
    'Decision Tree': build_pipeline(DecisionTreeClassifier(random_state=RANDOM_STATE)),
    'Random Forest': build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE)),
    'SVM': build_pipeline(SVC(probability=True, random_state=RANDOM_STATE)),
    # KNeighborsClassifier has no random_state parameter.
    'KNN': build_pipeline(KNeighborsClassifier()),
    'Gradient Boosting': build_pipeline(GradientBoostingClassifier(random_state=RANDOM_STATE)),
    'Neural Network': build_pipeline(MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)),
}

# Containers filled by the evaluation loop, in the same order as `model_names`.
results = {}                  # per-model metrics keyed by model name
model_names = list(models.keys())
accuracies = []               # mean cross-validation accuracy
std_devs = []                 # standard deviation of the cross-validation folds
validation_accuracies = []    # accuracy on the validation split
test_accuracies = []          # accuracy on the test split

# Evaluate every pipeline: 5-fold CV on the training split, then a single fit
# on the training split scored against the validation and test splits.
for name, pipeline in models.items():
    # Header so each model's block of output can be told apart.
    print(f"\n=== {name} ===")

    # Step 3: Cross-validation on training set
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy = cv_scores.mean()
    std_accuracy = cv_scores.std()
    print("Cross-validation scores:", cv_scores)
    print("Mean CV accuracy:", mean_accuracy)

    # Step 4: Fit on training data and evaluate on validation
    pipeline.fit(X_train, y_train)
    val_preds = pipeline.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_preds)
    print("Validation Accuracy:", val_accuracy)
    print("Validation Classification Report:\n", classification_report(y_val, val_preds))

    # Step 5: Final test evaluation (optional, only after tuning).
    # These numbers are reported for reference; model selection should rely on
    # the cross-validation / validation scores, not on the test split.
    test_preds = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_preds)
    print("Test Accuracy:", test_accuracy)
    print("Test Classification Report:\n", classification_report(y_test, test_preds))

    # Per-model summary keyed by name.
    results[name] = {
        'mean_accuracy': mean_accuracy,
        'std_accuracy': std_accuracy,
        'validation_accuracy': val_accuracy,
        'test_accuracy': test_accuracy,
    }

    # Parallel lists (same order as model_names) used by the plots below.
    accuracies.append(mean_accuracy)
    std_devs.append(std_accuracy)
    validation_accuracies.append(val_accuracy)
    test_accuracies.append(test_accuracy)
    
 # Create a comparison plot
# Figure 1: mean cross-validation accuracy as bars (error bars show the fold
# standard deviation) with the validation accuracy overlaid as red markers.
fig, ax = plt.subplots(figsize=(12, 8))

bars = ax.bar(
    model_names,
    accuracies,
    yerr=std_devs,
    capsize=10,
    alpha=0.7,
    color='skyblue',
    label='Cross-validation Accuracy',
)
ax.plot(model_names, validation_accuracies, 'ro', markersize=8, label='Validation Accuracy')
# (Test-set accuracy is not drawn in this figure.)

# Cross-validation value printed in white at mid-height inside each bar.
for pos, cv_acc in enumerate(accuracies):
    ax.text(pos, cv_acc / 2, f'{cv_acc:.2f}', ha='center', va='center', color='white', fontweight='bold')

# Validation value printed in red just above each marker.
for pos, val_acc in enumerate(validation_accuracies):
    ax.text(pos, val_acc + 0.03, f'{val_acc:.2f}', ha='center', fontsize=10, color='red')

ax.set_title(f'Model Comparison with PCA ({components} components)', fontsize=14, fontweight='bold')
ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_ylim(0, 1.1)  # headroom above 1.0 so the text labels fit
plt.xticks(rotation=45, ha='right')  # acts on `ax`, which is the current axes
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(loc='upper right')
fig.tight_layout()


# Figure 2: cross-validation vs validation accuracy as side-by-side bars.

# Tabulate the two accuracy metrics per model.
results_df = pd.DataFrame({
    'Model': model_names,
    'Cross-validation Accuracy': accuracies,
    'Validation Accuracy': validation_accuracies
})

# Bar geometry: two bars per model, centred on the model's integer position.
bar_width = 0.35
index = np.arange(len(model_names))
half_width = bar_width / 2

# The y-axis is zoomed to start at 0.8 so small differences are visible, but a
# fixed floor would hide any bar (and its label) that scores below 0.8. Lower
# the floor only when that would happen.
lowest_score = results_df[['Cross-validation Accuracy', 'Validation Accuracy']].min().min()
y_floor = 0.8 if lowest_score >= 0.8 else max(0.0, lowest_score - 0.05)

fig, ax = plt.subplots(figsize=(14, 8))

# Create the bars
ax.bar(index - half_width, results_df['Cross-validation Accuracy'],
       width=bar_width, label='CV Accuracy', color='skyblue', alpha=0.8)
ax.bar(index + half_width, results_df['Validation Accuracy'],
       width=bar_width, label='Validation Accuracy', color='lightcoral', alpha=0.8)

# Add value labels just above each bar
for i, v in enumerate(results_df['Cross-validation Accuracy']):
    ax.text(i - half_width, v + 0.02, f'{v:.2f}', ha='center', fontsize=10)

for i, v in enumerate(results_df['Validation Accuracy']):
    ax.text(i + half_width, v + 0.02, f'{v:.2f}', ha='center', fontsize=10)

# Add labels, title and legend
ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Accuracy Score', fontsize=12)
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(index)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.set_ylim(y_floor, 1.1)
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import f1_score

# Validation-set F1 (binary average) for every pipeline in `models`, collected
# in the same order as `model_names`. The pipelines must already be fitted.
f1_scores = []
for label, fitted_pipeline in models.items():
    score = f1_score(y_val, fitted_pipeline.predict(X_val), average='binary')
    f1_scores.append(score)
    print(f"{label} - F1 Score: {score:.4f}")

# Grouped bar chart: mean CV accuracy next to validation F1 for each model.
fig, ax = plt.subplots(figsize=(14, 8))

bar_width = 0.25
index = np.arange(len(model_names))
half_width = bar_width / 2

# Two bars per model: CV accuracy on the left, F1 score on the right.
ax.bar(index - half_width, accuracies, width=bar_width, label='CV Accuracy', color='skyblue', alpha=0.8)
ax.bar(index + half_width, f1_scores, width=bar_width, label='F1 Score', color='lightgreen', alpha=0.8)

# Value labels slightly above each bar, one series at a time.
for series, shift in ((accuracies, -half_width), (f1_scores, half_width)):
    for pos, value in enumerate(series):
        ax.text(pos + shift, value + 0.02, f'{value:.2f}', ha='center', fontsize=9)

# Axis labels, title, ticks and legend.
ax.set_xlabel('Models', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Model Performance Comparison - F1 Score', fontsize=14, fontweight='bold')
ax.set_xticks(index)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.set_ylim(0, 1.1)
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()

In [None]:

# Which features carry the most weight in the first principal component?
top_n = 20  # number of highest-loading features to report and plot

# Standardise before PCA so the loadings correspond to what the model
# pipelines see (each of them runs StandardScaler before PCA). Without scaling,
# the first component is driven by whichever features have the largest raw
# variance.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=1, random_state=42).fit(X_scaled)

# Loadings of the first component. Stored under their own name so that the
# integer `components` used by the modelling cell is not replaced by an array.
pc1_loadings = pca.components_[0]

# One row per feature, ranked by absolute loading (the sign is irrelevant for
# importance).
component_df = (
    pd.DataFrame({'Feature': X.columns, 'Weight': np.abs(pc1_loadings)})
    .sort_values('Weight', ascending=False)
)

# Display top features
top_rows = component_df.head(top_n)
print(f"Top {top_n} features contributing to the first PCA component:")
print(top_rows)

# Horizontal bar chart; rows are reversed so the largest weight is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_rows['Feature'][::-1], top_rows['Weight'][::-1])
ax.set_xlabel('Absolute Weight in First Principal Component')
ax.set_title(f'Top {top_n} Features in PCA')
fig.tight_layout()
plt.show()


# Compare the mean value of those top features between the two label groups.
top_features = top_rows['Feature'].values

# Restrict the original frame to the top features plus the label column.
top_features_df = df[[*top_features, 'healthy']]

# Calculate mean values for each group (healthy == 1 vs healthy == 0)
is_healthy = top_features_df['healthy'] == 1
is_unhealthy = top_features_df['healthy'] == 0
healthy_means = top_features_df.loc[is_healthy, top_features].mean()
unhealthy_means = top_features_df.loc[is_unhealthy, top_features].mean()

# Create a comparison dataframe: one row per feature, one column per group
comparison_df = pd.DataFrame({
    'Feature': top_features,
    'Healthy': healthy_means.values,
    'Non-Healthy': unhealthy_means.values
})

# Melt to long format (Feature, Group, Mean Expression) for seaborn's hue grouping
melted_df = pd.melt(comparison_df, id_vars=['Feature'], var_name='Group', value_name='Mean Expression')

# Create the plot
fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(x='Mean Expression', y='Feature', hue='Group', data=melted_df, ax=ax)
# The title follows top_n instead of a hard-coded "20".
ax.set_title(f'Mean Expression of Top {top_n} Features by Health Status', fontsize=14)
ax.set_xlabel('Mean Expression Value', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
# Add the legend before tight_layout so it is included in the layout pass.
ax.legend(title='Group')
fig.tight_layout()
plt.show()