In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, f1_score, precision_recall_curve, auc
import pickle

In [None]:
# Load data
# Read the train and test CSV files, in that order, into DataFrames.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine that has the files under D:\PCOS.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (r"D:\PCOS\train.csv", r"D:\PCOS\test.csv")
)

In [None]:
# Data Overview
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each report.
print("Train Data Overview:")
train_df.info()
print("Test Data Overview:")
test_df.info()


In [None]:
# Preview the first rows of the raw training data
train_df.head()

In [None]:
# Summary statistics for every column; include='all' covers the non-numeric ones too
train_df.describe(include='all')

In [None]:
# (rows, columns) of the training data
train_df.shape

In [None]:
# Count the missing entries in each training column and print the tally
missing_per_column = train_df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
# For every column that has at least one missing entry, show how often each
# observed value occurs (useful for choosing an imputation strategy).
columns_with_missing = [
    col for col in train_df.columns if train_df[col].isnull().any()
]
for col in columns_with_missing:
    print(train_df[col].value_counts())

In [None]:

# Data Cleaning Function
def clean_data(df):
    """Return a cleaned copy of a raw PCOS survey DataFrame.

    The cleaning steps are:

    1. Column names are lower-cased and spaces are replaced by underscores.
    2. The textual age brackets in ``age`` are replaced by a numeric midpoint;
       brackets that are not in the lookup table become NaN.
    3. Free-text variants in the yes/no columns (``hirsutism``,
       ``hormonal_imbalance``, ``conception_difficulty``,
       ``insulin_resistance``) are collapsed to ``"Yes"`` / ``"No"``.
    4. The long ``exercise_type`` descriptions are shortened to compact labels.
    5. ``exercise_benefit`` is collapsed to ``"Yes"`` / ``"No"``.

    The input frame is not modified, and the function is idempotent: calling it
    on an already-cleaned frame returns an equal frame (a numeric ``age``
    column is left as it is instead of being mapped to NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. After name standardisation it must contain the columns
        ``age``, ``hirsutism``, ``hormonal_imbalance``,
        ``conception_difficulty``, ``insulin_resistance``, ``exercise_type``
        and ``exercise_benefit``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame (including its column names) is
    # never changed behind its back.
    df = df.copy()

    # Standardize Column Names
    df.columns = df.columns.str.replace(" ", "_").str.lower()

    # Convert Age Categories to Midpoint
    # The table also lists the irregular bracket spellings ("30-25",
    # "Less than 20-25") with the midpoint assigned to each.
    age_mapping = {
        "20-25": 22.5, "15-20": 17.5, "Less than 20": 17.5,
        "35-44": 39.5, "25-30": 27.5, "45 and above": 47.5,
        "30-35": 32.5, "30-25": 27.5, "30-40": 35, "Less than 20-25": 17.5
    }
    # Only translate when the column still holds bracket labels. A numeric
    # column means the midpoints were already applied; mapping them again
    # would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(df['age']):
        # Series.map leaves NaN for any label that is not a key of the dict.
        df['age'] = df['age'].map(age_mapping)

    # Standardize Yes/No Responses
    yes_no_cols = ['hirsutism', 'hormonal_imbalance', 'conception_difficulty', 'insulin_resistance']
    replace_dict = {
        "No, Yes, not diagnosed by a doctor": "No",
        "Yes Significantly": "Yes",
        "Yes, diagnosed by a doctor": "Yes"
    }
    df[yes_no_cols] = df[yes_no_cols].replace(replace_dict)

    # Shorten the exercise-type descriptions. Truncated answers
    # (e.g. "Cardio (e.g.") and answers combined with "None" are mapped too.
    exercise_cols = ['exercise_type']
    exercise_replace_dict = {
        "No Exercise": "No exercise",
        "Cardio (e.g., running, cycling, swimming)": "Cardio",
        "Cardio (e.g.": "Cardio",
        "Flexibility and balance (e.g., yoga, pilates)": "Flexibility & Balance",
        "Strength training (e.g., weightlifting, resistance exercises)": "Strength training",
        "Cardio (e.g., running, cycling, swimming), Strength training (e.g., weightlifting, resistance exercises)": "Cardio & Strength training",
        "Cardio (e.g., running, cycling, swimming), Flexibility and balance (e.g., yoga, pilates)": "Cardio & Flexibility & Balance",
        "High-intensity interval training (HIIT)": "HIIT",
        "Cardio (e.g., running, cycling, swimming), Strength training (e.g., weightlifting, resistance exercises), Flexibility and balance (e.g., yoga, pilates)": "Cardio & Strength training & Flexibility & Balance",
        "Strength training (e.g., weightlifting, resistance exercises), Flexibility and balance (e.g., yoga, pilates)": "Strength training & Flexibility & Balance",
        "Flexibility and balance (e.g., yoga, pilates), None": "No exercise",
        "Cardio (e.g., running, cycling, swimming), None": "No exercise",
        "Strength training": "Strength training",
        "Strength training (e.g.": "Strength training",
        "Somewhat": "No exercise",
        "Flexibility and balance (e.g.": "Flexibility & Balance"
    }
    df[exercise_cols] = df[exercise_cols].replace(exercise_replace_dict)

    # Collapse the four-level benefit scale to Yes/No
    exercise_benefit_cols = ['exercise_benefit']
    exercise_benefit_mapping = {
        "Not at All": "No",
        "Not Much": "No",
        "Somewhat": "Yes",
        "Yes Significantly": "Yes"
    }
    df[exercise_benefit_cols] = df[exercise_benefit_cols].replace(exercise_benefit_mapping)

    return df

# Apply Cleaning to Train & Test Data
# Both frames go through the same routine, train first and then test.
train_df, test_df = (clean_data(frame) for frame in (train_df, test_df))


In [None]:
# Keep only training rows whose weight lies in the 30-120 kg range (inclusive).
# Rows with a missing weight fail the range test and are dropped as well.
# .copy() makes the result an independent frame, so the column assignments and
# fills in later cells do not trigger SettingWithCopyWarning.
train_df = train_df[train_df['weight_kg'].between(30, 120)].copy()


In [None]:
# Distribution of the age midpoints, with a kernel-density overlay
age_ax = sns.histplot(train_df['age'], kde=True, bins=30)
age_ax.set_title('Histogram of Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
plt.show()


In [None]:
# Box plot of body weight for each PCOS class
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x=train_df['pcos'], y=train_df['weight_kg'], ax=ax)
ax.set_title("Weight vs. PCOS")
plt.show()


In [None]:
# Distribution of body weight, with a kernel-density overlay
weight_ax = sns.histplot(train_df['weight_kg'], kde=True, bins=30)
weight_ax.set_title('Histogram of Weight')
weight_ax.set_xlabel('Weight')
weight_ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of training rows per PCOS class (shows any class imbalance)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='pcos', data=train_df, ax=ax)
ax.set_title("PCOS Class Distribution")
plt.show()

In [None]:
# Fill numerical columns with median
# Each frame is imputed from its own column medians.
# NOTE(review): the test set uses its own statistics here rather than the
# training set's - confirm that this is intended.
train_df = train_df.fillna(train_df.median(numeric_only=True))
test_df = test_df.fillna(test_df.median(numeric_only=True))

# Fill categorical columns with mode
# mode().iloc[0] is the most frequent value of every column; after the median
# fill only non-numeric columns can still contain gaps.
train_df = train_df.fillna(train_df.mode().iloc[0])
test_df = test_df.fillna(test_df.mode().iloc[0])

# Check if any missing values remain
train_missing = train_df.isnull().sum()
test_missing = test_df.isnull().sum()
print("Missing values in train:\n", train_missing)
print("Missing values in test:\n", test_missing)


In [None]:
# Names of the object-dtype (text) columns in the training data
categorical_cols_train = train_df.select_dtypes(include=['object']).columns

# Label-encode every categorical column, keeping one fitted encoder per column
# so the test data can be encoded with the same codes later.
label_encoders = {}
for col in categorical_cols_train:
    # Any value still missing becomes the explicit category 'Unknown'
    filled = train_df[col].fillna('Unknown')
    encoder = LabelEncoder()
    # Values are cast to str so the encoder always sees one uniform type
    train_df[col] = encoder.fit_transform(filled.astype(str))
    label_encoders[col] = encoder

# Collected after encoding, so this also contains the freshly encoded
# categorical columns.
numerical_cols = train_df.select_dtypes(include=['number']).columns

# Safety net: fill any remaining numeric gaps with the column median.
# (No scaling happens in this cell.)
for col in numerical_cols:
    train_df[col] = train_df[col].fillna(train_df[col].median())



In [None]:
# One count plot per (label-encoded) categorical column
for col in categorical_cols_train:
    fig, ax = plt.subplots(figsize=(8, 4))
    # hue mirrors x so each bar gets its own palette colour; the legend would
    # only repeat the x tick labels, so it is switched off.
    sns.countplot(x=train_df[col], hue=train_df[col], palette='viridis', legend=False, ax=ax)
    plt.xticks(rotation=45)
    ax.set_title(f'Distribution of {col}')
    plt.show()


In [None]:
# Feature matrix and target: 'id' is only an identifier, 'pcos' is the label
X = train_df.drop(columns=['id', 'pcos'])
y = train_df['pcos']

# 4. Standardise the feature columns.
# Only X is scaled. train_df itself is left alone: numerical_cols was collected
# after label encoding and therefore also covers 'id' and the 'pcos' target,
# so scaling train_df[numerical_cols] would overwrite the class labels with
# standardised floats (re-running this cell would then yield a continuous y).
# The scaler has to end up fitted on X anyway, because it is re-used to
# transform the test features later.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation statistics leak into the scaling; consider fitting on
# the training fold only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)


In [None]:
# Train/Test Split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=54545,
)

In [None]:
# Train Models with Hyperparameter Tuning
# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated with 5-fold CV.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced']
}

grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=54545),
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,          # use every available CPU core
    scoring='roc_auc',  # candidates are ranked by cross-validated ROC AUC
)
grid_search_rf.fit(X_train, y_train)

# Get the best model from grid search
rf = grid_search_rf.best_estimator_

# Persist the tuned model. The interactive prediction cell at the end of the
# notebook loads "pcos_model.pkl"; without this step that file is never
# written and a fresh Restart & Run All fails there with FileNotFoundError.
with open("pcos_model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

# Evaluate Models

def evaluate_model(model, X_val, y_val, model_name):
    """Print validation metrics for a fitted binary classifier and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``
    X_val : validation features
    y_val : true validation labels
    model_name : str
        Label used in the printed report and in the plot title.

    Returns
    -------
    tuple
        ``(roc_curve(y_val, scores), roc_auc)`` - the ROC curve arrays as
        returned by ``roc_curve`` together with the ROC AUC value.
    """
    # Score of the second class (column 1 of predict_proba) and hard labels
    positive_scores = model.predict_proba(X_val)[:, 1]
    predicted_labels = model.predict(X_val)

    accuracy = accuracy_score(y_val, predicted_labels)
    f1 = f1_score(y_val, predicted_labels)
    roc_auc = roc_auc_score(y_val, positive_scores)

    # Area under the precision-recall curve
    pr_precision, pr_recall, _ = precision_recall_curve(y_val, positive_scores)
    pr_auc = auc(pr_recall, pr_precision)

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}, F1 Score: {f1:.4f}, PR AUC: {pr_auc:.4f}")
    print(classification_report(y_val, predicted_labels))

    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_val, predicted_labels),
        display_labels=["No PCOS", "PCOS"],
    )
    matrix_display.plot(cmap="Blues")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()

    return roc_curve(y_val, positive_scores), roc_auc

# Score the tuned forest on the held-out validation split
rf_fpr_tpr, rf_roc_auc = evaluate_model(
    model=rf,
    X_val=X_val,
    y_val=y_val,
    model_name="Random Forest",
)

In [None]:
# 3. ROC Curve Comparison
# rf_fpr_tpr holds the arrays returned by roc_curve: index 0 is the false
# positive rate, index 1 the true positive rate.
false_pos_rate, true_pos_rate = rf_fpr_tpr[0], rf_fpr_tpr[1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(false_pos_rate, true_pos_rate, label=f'Random Forest (AUC={rf_roc_auc:.2f})', color='blue')
# Diagonal reference line for a classifier that guesses at random
ax.plot([0, 1], [0, 1], '--', color='black')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()



In [None]:
# Rank the features from most to least important
importance_order = np.argsort(rf.feature_importances_)[::-1]
ranked_names = np.array(X.columns)[importance_order]
ranked_scores = rf.feature_importances_[importance_order]

fig, ax = plt.subplots()
ax.barh(ranked_names, ranked_scores, color='blue')
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature Name")
ax.set_title("Feature Importances - Random Forest")
# barh draws the first entry at the bottom; flip the axis so the most
# important feature ends up at the top.
ax.invert_yaxis()
plt.show()

In [None]:
# (rows, columns) of the test data
test_df.shape

In [None]:
# Preview the first rows of the test data. head must be called: the bare
# attribute is only the bound method, not the preview.
test_df.head()

In [None]:
# Keep the ids for the submission file
test_ids = test_df['id']

# Encode a copy so test_df itself stays un-encoded. Re-running this cell then
# gives the same result instead of mapping already-encoded integers to -1.
test_encoded = test_df.copy()

# Ensure only categorical columns that exist in test_df are selected
columns_to_encode = [col for col in categorical_cols_train if col in test_encoded.columns]

# Fill missing values in categorical test data
test_encoded[columns_to_encode] = test_encoded[columns_to_encode].fillna('Unknown')

for col in columns_to_encode:
    le = label_encoders[col]
    # LabelEncoder assigns each class its position in classes_, so a lookup
    # table built once per column replaces one le.transform() call per row.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    # Values are cast to str exactly as during fitting; categories that were
    # never seen in training get the sentinel code -1.
    test_encoded[col] = (
        test_encoded[col].astype(str).map(class_to_code).fillna(-1).astype('int64')
    )

X_test = test_encoded.drop(columns=['id'], errors='ignore')

# Put the features in exactly the column order the scaler and model were
# fitted on (raises KeyError if a training feature is missing from the test data).
X_test = X_test[X.columns]
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Make Predictions
# Column 1 of predict_proba is used as the predicted PCOS probability
test_predictions = rf.predict_proba(X_test)[:, 1]

# Save Submission
submission = pd.DataFrame({'ID': test_ids, 'PCOS': test_predictions.round(4)})
submission.to_csv(r"D:\PCOS\submission.csv", index=False)

# Print confirmation
print(r"Output saved to 'D:\PCOS\submission.csv'")

In [None]:
# Load the trained model
# NOTE: unpickling executes code stored in the file, so only load a
# pcos_model.pkl that comes from a trusted source.
with open("pcos_model.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

# Define feature columns
# Column names for the single-row frame built from the user's answers below;
# the answers are collected in this order.
# NOTE(review): presumably this must match the names and order of the columns
# the model was trained on - confirm against X.columns.
feature_columns = ['age', 'weight_kg', 'hormonal_imbalance', 'hyperandrogenism',
                   'hirsutism', 'conception_difficulty', 'insulin_resistance',
                   'exercise_frequency', 'exercise_type', 'exercise_duration',
                   'sleep_hours', 'exercise_benefit']

# Function to convert Yes/No input to 1/0
def yes_no_input(prompt):
    """Ask a yes/no question until a valid answer is given.

    The prompt is shown with " (Yes/No): " appended. Answers are compared
    case-insensitively after stripping surrounding whitespace.

    Returns 1 for "yes" and 0 for "no"; any other answer prints a hint and
    asks again.
    """
    answer_codes = {'yes': 1, 'no': 0}
    while True:
        reply = input(prompt + " (Yes/No): ").strip().lower()
        if reply in answer_codes:
            return answer_codes[reply]
        print("Invalid input. Please enter Yes or No.")

# Get user input
# One answer is collected per entry of feature_columns, in the same order.
# NOTE(review): the numeric codes offered in the prompts are assumed to match
# the codes produced by the fitted label encoders - verify against
# label_encoders[col].classes_ for each column.
print("Enter patient details to predict PCOS probability:")
inputs = []
inputs.append(float(input("Age: ")))
inputs.append(float(input("Weight (kg): ")))
inputs.append(yes_no_input("Hormonal Imbalance"))
inputs.append(yes_no_input("Hyperandrogenism"))
inputs.append(yes_no_input("Hirsutism"))
inputs.append(yes_no_input("Conception Difficulty"))
inputs.append(yes_no_input("Insulin Resistance"))
inputs.append(int(input("Exercise Frequency (days/week): ")))
inputs.append(int(input("Exercise Type (0-No Exercise, 1-Cardio, 2-Strength training, 3-Flexibility, 4-HIIT): ")))
inputs.append(int(input("Exercise Duration (0-Not Applicable, 1-Less than 30 minutes, 2-30 minutes, 3-45 minutes, 4-More than 30 minutes, 5-20 minutes, 6-Less than 6 hours, 7-30 minutes to 1 hour): ")))
inputs.append(int(input("Sleep Hours per Day (0-3-4 hours, 1-Less than 6 hours, 2-6-8 hours, 3-9-12 hours, 4-More than 12 hours): ")))
# 'exercise_benefit' is the 12th entry of feature_columns. Without this answer
# only 11 values are collected and the DataFrame constructor below raises
# ValueError (12 columns passed, data had 11).
inputs.append(yes_no_input("Exercise Benefit"))

sample_input = pd.DataFrame([inputs], columns=feature_columns)

# The random forest in this notebook is trained on StandardScaler-transformed
# features (the test set is scaled the same way before predict_proba), so the
# raw answers go through the same fitted scaler before prediction.
sample_scaled = pd.DataFrame(scaler.transform(sample_input), columns=feature_columns)

# Make prediction
# Column 1 of predict_proba is used as the PCOS probability
prediction = rf.predict_proba(sample_scaled)[:, 1][0]

# Show result
print(f"\nPredicted PCOS Probability: {round(prediction, 4)}")

# Interpretation
if prediction >= 0.5:
    print("High risk of PCOS. Please consult a doctor.")
else:
    print("Low risk of PCOS.")
