# Exploring the overlap of Polycystic Ovary Syndrome and Type II Diabetes through clustering of metabolic risk factors

Applied Physics 157 | Gavan, Mesias

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# Preprocessing

In [None]:
# Read the raw metabolic risk table.
df = pd.read_csv('metabolic risk assessment.csv')

# Identifier columns are removed; names that are absent are skipped.
identifier_columns = ['Sl. No', 'Patient File No.', 'Unnamed: 0']
df = df.drop(columns=identifier_columns, errors='ignore')

# Binary-encode both diagnoses. Any label outside a mapping becomes NaN.
pcos_codes = {'Yes': 1, 'No': 0}
diabetes_codes = {'positive': 1, 'negative': 0}

df['PCOS'] = df['PCOS'].map(pcos_codes)

# Diabetes labels are matched after trimming whitespace and lower-casing;
# the text column is replaced by the numeric 'Diabetes' column.
normalised_status = df['Diabetes_Status'].astype(str).str.strip().str.lower()
df['Diabetes'] = normalised_status.map(diabetes_codes)
df = df.drop(columns=['Diabetes_Status'], errors='ignore')

## Filtering for Likely Females and Strictly Females

### Likely Females

In [None]:
# A row counts as "likely female" when its PCOS field holds any value,
# or when Gestational_Diabetes equals 1.
has_pcos_record = df['PCOS'].notna()
has_gestational_diabetes = df['Gestational_Diabetes'].eq(1)
df['Likely_Female'] = has_pcos_record | has_gestational_diabetes

# Work on an independent copy so later column additions leave `df` untouched.
df_females = df.loc[df['Likely_Female']].copy()

## Feature Engineering

In [None]:
def bmi_category(bmi):
    """Map a BMI value to a weight category.

    Bands (upper bounds exclusive): below 18.5 is 'Underweight', below 25
    is 'Normal', below 30 is 'Overweight', anything else is 'Obese'.

    Args:
        bmi: Body-mass index as a number, or a missing value (None/NaN).

    Returns:
        The category name, or NaN when ``bmi`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # BMI would fall through to the final branch and be labelled 'Obese'.
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

def bp_category(sys, dia):
    """Classify a blood-pressure reading.

    Uses inclusive lower bounds: 'Hypertension' when systolic >= 140 or
    diastolic >= 90, 'Prehypertension' when systolic >= 120 or
    diastolic >= 80, otherwise 'Normal'. (The previous strict ``>`` tests
    placed readings of exactly 140/90 and 120/80 one category too low.)

    Args:
        sys: Systolic pressure, or a missing value (None/NaN).
        dia: Diastolic pressure, or a missing value (None/NaN).

    Returns:
        The category name, or NaN when a missing reading prevents a
        definite classification.
    """
    sys_missing = pd.isna(sys)
    dia_missing = pd.isna(dia)

    # One available reading in the hypertensive range is decisive on its
    # own, whatever the other value is.
    if (not sys_missing and sys >= 140) or (not dia_missing and dia >= 90):
        return 'Hypertension'

    # Below that, a missing reading could still belong to a higher category,
    # so report it as unknown instead of silently returning 'Normal'.
    if sys_missing or dia_missing:
        return np.nan

    if sys >= 120 or dia >= 80:
        return 'Prehypertension'
    return 'Normal'

# Weight category derived from each row's BMI.
df_females['BMI_cat'] = df_females['BMI'].map(bmi_category)

# 1 when HOMA_IR is above 2.5, otherwise 0.
df_females['Insulin_Resistant'] = df_females['HOMA_IR'].gt(2.5).astype(int)

# The blood-pressure category needs both readings, so it is evaluated
# one row at a time.
df_females['BP_cat'] = df_females.apply(
    lambda row: bp_category(
        row['Blood_Pressure_Systolic'],
        row['Blood_Pressure_Diastolic'],
    ),
    axis=1,
)

# Dimensionality Reduction

In [None]:
# Keep the numeric columns only; the two diagnoses are set aside as targets
# and everything else becomes the feature matrix.
df_numeric = df_females.select_dtypes(include=[np.number])
target_pcos = df_numeric['PCOS']
target_diabetes = df_numeric['Diabetes']
features_df = df_numeric.drop(columns=['PCOS', 'Diabetes'], errors='ignore')

# Fill missing entries with the column mean, then standardise each feature.
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit(features_df).transform(features_df)
scaler = StandardScaler()
X_scaled = scaler.fit(features_imputed).transform(features_imputed)

# Project the scaled features onto 10 principal components.
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# Clustering

In [None]:
# Partition the PCA scores into 4 clusters with k-means.
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
# to 'auto', so leaving it implicit makes the cluster assignments depend on
# the installed version (and raises a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)
df_females['Cluster'] = clusters

# t-SNE: 2-D embedding of the same PCA scores, stored for plotting
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_pca)
df_females['tSNE-1'] = X_tsne[:, 0]
df_females['tSNE-2'] = X_tsne[:, 1]

# Human-readable diagnosis labels for the legend.
# Rows whose PCOS or Diabetes value is NaN receive a NaN label.
df_females['Diabetes_Label'] = df_females['Diabetes'].map({0: 'Diabetes: Negative', 1: 'Diabetes: Positive'})
df_females['PCOS_Label'] = df_females['PCOS'].map({0: 'PCOS: No', 1: 'PCOS: Yes'})

# custom color palette for t-SNE datapoints (keyed by Diabetes_Label)
custom_palette = {
    'Diabetes: Negative': '#8dd3c7',  # Teal
    'Diabetes: Positive': '#e78ac3'   # Pink
}

# 2D visualization with t-SNE: colour encodes diabetes, marker shape encodes PCOS
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x='tSNE-1', y='tSNE-2',
    hue='Diabetes_Label', style='PCOS_Label',
    data=df_females, palette=custom_palette, s=40, alpha=0.7
)
plt.title("t-SNE: Diabetes (color) & PCOS (shape) - Likely Females")
plt.legend(title='Diagnosis')
plt.show()

# Clustering Analysis (correlation, average rates, co-incidence matrix)

In [None]:
diagnosis_cols = ['PCOS', 'Diabetes']

# Correlation between the PCOS and Diabetes flags inside each cluster
print("=== Correlation between PCOS and Diabetes within Clusters (Likely Females) ===")
for cluster_id, group in df_females.groupby('Cluster'):
    # A correlation is only defined when both flags vary within the cluster.
    both_vary = all(group[col].nunique() > 1 for col in diagnosis_cols)
    if not both_vary:
        print(f"Cluster {cluster_id}: Not computable")
        continue
    corr = group[diagnosis_cols].corr().loc['PCOS', 'Diabetes']
    print(f"Cluster {cluster_id}: {corr:.2f}")

# Counts of every (PCOS, Diabetes) combination per cluster
co_matrix = pd.crosstab(
    index=df_females['Cluster'],
    columns=[df_females['PCOS'], df_females['Diabetes']],
    rownames=['Cluster'],
    colnames=['PCOS', 'Diabetes'],
)
print("\n=== PCOS-Diabetes Co-incidence Matrix by Cluster (Likely Females) ===")
print(co_matrix)

# Mean of each 0/1 flag per cluster, i.e. the share of positive cases
cluster_summary = df_females.groupby('Cluster')[diagnosis_cols].mean()
print("=== Cluster-wise Average Rates (Likely Females) ===")
print(cluster_summary)

# Classification

In [None]:
# Rows without a diabetes label cannot be used for supervised training;
# passing NaN targets to the estimators would raise.
has_label = target_diabetes.notna()
X_labelled = features_df.loc[has_label]
y_labelled = target_diabetes.loc[has_label].astype(int)

# 80:20 train/test split, performed on the raw (un-imputed, un-scaled)
# features so that preprocessing can be fitted on the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_labelled, y_labelled, test_size=0.2, random_state=42
)

# Fit the imputer and scaler on the training split and merely apply them to
# the test split; fitting them on all rows would leak test-set statistics
# into training and make the reported scores optimistic.
clf_imputer = SimpleImputer(strategy='mean')
clf_scaler = StandardScaler()
X_train = clf_scaler.fit_transform(clf_imputer.fit_transform(X_train_raw))
X_test = clf_scaler.transform(clf_imputer.transform(X_test_raw))

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# XGBoost
# The target is binary, so the binary 'logloss' metric is used instead of
# the multiclass 'mlogloss'.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# classification reports for each model (as nested dicts)
report_lr = classification_report(y_test, lr_preds, output_dict=True)
report_rf = classification_report(y_test, rf_preds, output_dict=True)
report_xgb = classification_report(y_test, xgb_preds, output_dict=True)

# Bonus: Analysis for Strictly Females

In [None]:
# Strictly female defined as PCOS = 1 OR Gestational Diabetes = 1
strict_females = df[(df['PCOS'] == 1) | (df['Gestational_Diabetes'] == 1)].copy()

# numeric features; the two diagnoses are kept aside as targets
df_numeric = strict_females.select_dtypes(include=[np.number])
features_df = df_numeric.drop(columns=['PCOS', 'Diabetes'], errors='ignore')
target_pcos = df_numeric['PCOS']
target_diabetes = df_numeric['Diabetes']

# impute (column mean) and scale numeric data
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_df)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_imputed)

# DIMENSIONALITY REDUCTION
# Principal Component Analysis
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# clustering with 4 clusters using KMeans
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
# to 'auto', so leaving it implicit makes the cluster assignments depend on
# the installed version (and raises a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)
strict_females['Cluster'] = clusters

# t-SNE: 2-D embedding of the PCA scores
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_pca)
strict_females['tSNE-1'] = X_tsne[:, 0]
strict_females['tSNE-2'] = X_tsne[:, 1]

# t-SNE labels (rows with a NaN diagnosis receive a NaN label)
strict_females['Diabetes_Label'] = strict_females['Diabetes'].map({0: 'Diabetes: Negative', 1: 'Diabetes: Positive'})
strict_females['PCOS_Label'] = strict_females['PCOS'].map({0: 'PCOS: No', 1: 'PCOS: Yes'})

# cluster-wise average rates (mean of each 0/1 flag per cluster)
cluster_summary = strict_females.groupby('Cluster')[['PCOS', 'Diabetes']].mean()

# co-incidence matrix: counts of every (PCOS, Diabetes) combination per cluster
co_matrix = pd.crosstab(strict_females['Cluster'], [strict_females['PCOS'], strict_females['Diabetes']],
                        rownames=['Cluster'], colnames=['PCOS', 'Diabetes'])

# PCOS and Type 2 Diabetes diagnoses correlation within clusters;
# only defined when both flags vary inside the cluster
correlations = {}
for cluster_id, group in strict_females.groupby('Cluster'):
    if group['PCOS'].nunique() > 1 and group['Diabetes'].nunique() > 1:
        correlations[cluster_id] = round(group[['PCOS', 'Diabetes']].corr().iloc[0, 1], 2)
    else:
        correlations[cluster_id] = 'Not computable'

# Classification Models (strict-female subset)
# The models are trained BEFORE the results summary is assembled, so the
# summary reports this subset's scores rather than stale reports left over
# from an earlier cell.

# Rows without a diabetes label cannot be used for supervised training;
# passing NaN targets to the estimators would raise.
has_label = target_diabetes.notna()
X_labelled = features_df.loc[has_label]
y_labelled = target_diabetes.loc[has_label].astype(int)

# 80:20 split on the raw features so preprocessing is fitted on the
# training rows only (no test-set statistics leak into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_labelled, y_labelled, test_size=0.2, random_state=42
)
clf_imputer = SimpleImputer(strategy='mean')
clf_scaler = StandardScaler()
X_train = clf_scaler.fit_transform(clf_imputer.fit_transform(X_train_raw))
X_test = clf_scaler.transform(clf_imputer.transform(X_test_raw))

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# XGBoost Classifier
# The target is binary, so the binary 'logloss' metric is used instead of
# the multiclass 'mlogloss'.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# classification reports for each model (as nested dicts)
report_lr = classification_report(y_test, lr_preds, output_dict=True)
report_rf = classification_report(y_test, rf_preds, output_dict=True)
report_xgb = classification_report(y_test, xgb_preds, output_dict=True)

# results for clustering & classification analysis
# The third entry is the XGBoost report: no SVM is trained anywhere in this
# notebook, and the previous reference to `report_svm` raised NameError.
strict_results = {
    "num_strict_entries": len(strict_females),
    "classification_reports": {
        "Logistic Regression": report_lr,
        "Random Forest": report_rf,
        "XGBoost": report_xgb
    },
    "cluster_summary": cluster_summary,
    "co_matrix": co_matrix,
    "correlations": correlations
}

# Last expression of the cell, so the notebook displays the summary.
strict_results

# Bonus: Feature Importance Ranking

## Feature Importance for PCOS

In [None]:
# Candidate predictors for PCOS. The list is defined before the CSV is read
# because `usecols` and the feature selection below both depend on it
# (it was previously referenced before being assigned).
selected_features = [
 'Fasting_Blood_Glucose',
 'Postprandial_Blood_Glucose',
 'HbA1c',
 'Random_Blood_Glucose',
 'BMI',
 'Waist_Circumference',
 'Triglyceride_Levels',
 'Blood_Pressure_Systolic',
 'Blood_Pressure_Diastolic',
 'LDL_Cholesterol',
 'HDL_Cholesterol',
 'CRP_Levels',
 'Insulin_Levels',
 'HOMA_IR',
 'OGTT',
 'Creatinine_Levels',
 'eGFR',
 'Microalbuminuria',
 'Uric_Acid_Levels',
 'Fructosamine_Levels',
 'ALT',
 'AST',
 'C_Peptide',
 'Proinsulin_Levels',
 'Family_History_of_Diabetes',
 'Gestational_Diabetes',
 'Hypertension',
 'Physical_Activity',
 'Smoking',
 'Alcohol_Consumption',
 'Obesity',
 'Diet',
 'Sleep_Apnea',
 'Diabetes_Status'
]

# Load only the needed columns, once, into a frame of its own so the
# preprocessed `df` built at the top of the notebook is not overwritten.
meta_df = "metabolic risk assessment.csv"
meta_df_subset = pd.read_csv(meta_df, usecols=selected_features + ["PCOS"], low_memory=False)

# clean dataframe: keep only rows with no missing value in any loaded column
meta_df_subset = meta_df_subset.dropna()

# separate selected features from target variable
# (.copy() so the encoding below writes to an independent frame rather than
# a slice of meta_df_subset, which triggers SettingWithCopyWarning)
X_subset = meta_df_subset[selected_features].copy()
y_subset = meta_df_subset["PCOS"]

# encode target variable; `le` stays fitted on the target so `le.classes_`
# can still translate encoded predictions back to the original labels
le = LabelEncoder()
y_subset_encoded = le.fit_transform(y_subset)

# encode any categorical features in X_subset, one fresh encoder per column
categorical_cols_X = X_subset.select_dtypes(include='object').columns
for col in categorical_cols_X:
    X_subset[col] = LabelEncoder().fit_transform(X_subset[col])

# train Random Forest
rf_subset = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_subset.fit(X_subset, y_subset_encoded)

# feature importances, largest first
feature_importance_subset = pd.Series(rf_subset.feature_importances_, index=selected_features).sort_values(ascending=False)

# Output top features
print("Top Features Predicting PCOS in Likely Females:")
print(feature_importance_subset)

In [None]:
# Bar chart of the importance scores: score on the x-axis, feature name on
# the y-axis, in the order stored in feature_importance_subset.
importance_scores = feature_importance_subset.values
feature_names = feature_importance_subset.index

plt.figure(figsize=(12, 6))
sns.barplot(
    x=importance_scores,
    y=feature_names,
)
plt.title("Feature Importance for 'PCOS' Prediction")
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.show()

## Feature Importance for Type II Diabetes

In [None]:
# Define relevant features for selection
selected_features = [
 'PCOS',
 'Fasting_Blood_Glucose',
 'Postprandial_Blood_Glucose',
 'HbA1c',
 'Random_Blood_Glucose',
 'BMI',
 'Waist_Circumference',
 'Triglyceride_Levels',
 'Blood_Pressure_Systolic',
 'Blood_Pressure_Diastolic',
 'LDL_Cholesterol',
 'HDL_Cholesterol',
 'CRP_Levels',
 'Insulin_Levels',
 'HOMA_IR',
 'OGTT',
 'Creatinine_Levels',
 'eGFR',
 'Microalbuminuria',
 'Uric_Acid_Levels',
 'Fructosamine_Levels',
 'ALT',
 'AST',
 'C_Peptide',
 'Proinsulin_Levels',
 'Family_History_of_Diabetes',
 'Gestational_Diabetes',
 'Hypertension',
 'Physical_Activity',
 'Smoking',
 'Alcohol_Consumption',
 'Obesity',
 'Diet',
 'Sleep_Apnea',
]

meta_df = "metabolic risk assessment.csv"
meta_df_subset = pd.read_csv(meta_df, usecols=selected_features + ["Diabetes_Status"], low_memory=False)

meta_df_subset.dropna(inplace=True)

# separate selected features from target variable
X_subset = meta_df_subset[selected_features]
y_subset = meta_df_subset["Diabetes_Status"]


le = LabelEncoder()
y_subset_encoded = le.fit_transform(y_subset)
categorical_cols_X = X_subset.select_dtypes(include='object').columns

# encode labels to each categorical column in X_subset
for col in categorical_cols_X:
    X_subset[col] = le.fit_transform(X_subset[col])

# train Random Forest model on this subset for feature importance
rf_subset = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# fit model with the processed feature data and encoded target variable
rf_subset.fit(X_subset, y_subset_encoded)


feature_importance_subset = pd.Series(rf_subset.feature_importances_, index=selected_features).sort_values(ascending=False)
feature_importance_subset

In [None]:
# Bar chart of the importance scores: score on the x-axis, feature name on
# the y-axis, in the order stored in feature_importance_subset.
importance_scores = feature_importance_subset.values
feature_names = feature_importance_subset.index

plt.figure(figsize=(12, 6))
sns.barplot(
    x=importance_scores,
    y=feature_names,
)
plt.title("Feature Importance for 'Diabetes' Prediction")
plt.xlabel("Feature Importance Score")
plt.ylabel("Features")
plt.show()