**Step 1: Load and Explore Data**

In [None]:
# Step 1: Load and Explore Data
import pandas as pd

# Location of the dataset CSV. The '/content/' prefix looks like a Colab upload
# directory; change this single constant to run the notebook elsewhere.
DATA_PATH = '/content/Disease_symptom_and_patient_profile_dataset.csv'

# Load dataset
data = pd.read_csv(DATA_PATH)

# Inspect the dataset
print(data.head())
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
data.info()

# Check demographic distribution
print("Gender Distribution:\n", data['Gender'].value_counts())
print("Age Group Distribution:\n", data['Age Group'].value_counts())


       Disease Fever Cough Fatigue Difficulty Breathing  Age Group  Gender  \
0    Influenza   Yes    No     Yes                  Yes         19  Female   
1  Common Cold    No   Yes     Yes                   No         25  Female   
2       Eczema    No   Yes     Yes                   No         25  Female   
3       Asthma   Yes   Yes      No                  Yes         25    Male   
4       Asthma   Yes   Yes      No                  Yes         25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0

**Step 2: Preprocess the Data**

In [None]:
# Step 2: Preprocess the Data
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical (object-dtype) columns as integer codes, in place: later
# steps read the encoded values straight from `data` (e.g. data['Gender'],
# data['Disease']).
# A separate encoder is fitted and kept per column so that integer codes can be
# mapped back to their original labels later, e.g.
#     label_encoders['Disease'].inverse_transform([6])
# A single shared encoder would only remember the last column it was fitted on.
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Separate the feature columns from the target.
# 'Prediction' is added to `data` by Step 6; it is excluded here (if present) so
# that re-running this cell later cannot leak model predictions into the features.
feature_frame = data.drop(columns=['Disease', 'Prediction'], errors='ignore')

# Scale the features.
# The column names must come from the frame that was actually scaled: 'Disease'
# is the FIRST column of `data`, so labelling the result with data.columns[:-1]
# (everything except the LAST column) would shift every name by one position.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)

# Define features and target variable
X = data_scaled
y = data['Disease']  # Assuming 'Disease' is the target variable


**Step 3: Handle Class Imbalance**

In [None]:
# Step 3: Handle Class Imbalance
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Report how many samples each class has before any resampling
print("Original class distribution:", Counter(y))

# Preferred strategy: SMOTE with the smallest neighbourhood (k_neighbors=1).
# If SMOTE rejects the data with a ValueError, fall back to plain random
# oversampling. Either way the result is bound to X_resampled / y_resampled.
try:
    smote = SMOTE(k_neighbors=1, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("Class distribution after SMOTE:", Counter(y_resampled))

except ValueError as smote_error:
    print(f"SMOTE failed: {smote_error}")
    print("Applying Random Oversampling as an alternative...")

    # Fallback: duplicate existing minority samples instead of synthesising new ones
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)
    print("Class distribution after Random Oversampling:", Counter(y_resampled))


Original class distribution: Counter({6: 23, 101: 16, 77: 14, 32: 10, 69: 10, 51: 10, 56: 8, 13: 8, 84: 8, 53: 7, 55: 7, 24: 6, 37: 6, 1: 6, 4: 6, 42: 6, 92: 6, 31: 6, 60: 6, 57: 6, 75: 6, 81: 5, 112: 5, 111: 5, 70: 5, 90: 5, 61: 5, 58: 5, 26: 5, 27: 5, 82: 5, 2: 5, 20: 4, 48: 3, 113: 3, 108: 3, 62: 3, 30: 2, 93: 2, 65: 2, 59: 2, 91: 2, 16: 2, 46: 2, 109: 2, 115: 2, 104: 2, 18: 2, 36: 2, 63: 2, 45: 2, 110: 2, 67: 2, 49: 2, 71: 2, 72: 1, 98: 1, 9: 1, 15: 1, 23: 1, 52: 1, 73: 1, 7: 1, 21: 1, 39: 1, 74: 1, 22: 1, 25: 1, 100: 1, 0: 1, 11: 1, 28: 1, 43: 1, 35: 1, 41: 1, 54: 1, 64: 1, 8: 1, 50: 1, 68: 1, 79: 1, 14: 1, 83: 1, 95: 1, 97: 1, 3: 1, 38: 1, 96: 1, 99: 1, 34: 1, 80: 1, 85: 1, 5: 1, 40: 1, 66: 1, 47: 1, 87: 1, 102: 1, 12: 1, 76: 1, 86: 1, 19: 1, 88: 1, 105: 1, 10: 1, 78: 1, 107: 1, 29: 1, 33: 1, 17: 1, 89: 1, 94: 1, 44: 1, 103: 1, 106: 1, 114: 1})
SMOTE failed: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1
Applying Random Oversampling a

**Step 4: Train the Machine Learning Model**

In [None]:
# Step 4: Train the Machine Learning Model
from sklearn.ensemble import RandomForestClassifier

# Build a random forest with a fixed seed, then fit it on the
# oversampled features/labels produced in Step 3.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_resampled, y=y_resampled)


**Step 5: Evaluate Model Performance**

In [None]:
# Step 5: Evaluate Model Performance
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on the original (pre-oversampling) samples.
# NOTE: this notebook never creates a hold-out split. The model was fitted on
# an oversampled copy of these same rows (Steps 3-4), so everything reported
# below is an in-sample (training) score, not a test-set score, and should be
# read as an optimistic upper bound.
y_pred = model.predict(X)

# Evaluate performance.
# zero_division=0 makes the handling of classes that are never predicted
# explicit (their precision is reported as 0) instead of emitting an
# undefined-metric warning for each of them.
print("Classification Report:\n", classification_report(y, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
print("Accuracy Score:", accuracy_score(y, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.86      1.00      0.92         6
           2       1.00      0.80      0.89         5
           3       0.50      1.00      0.67         1
           4       1.00      0.67      0.80         6
           5       1.00      1.00      1.00         1
           6       1.00      0.87      0.93        23
           7       0.50      1.00      0.67         1
           8       0.25      1.00      0.40         1
           9       0.33      1.00      0.50         1
          10       1.00      1.00      1.00         1
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         8
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         2
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Step 6: Assess Fairness Across Demographics**

In [None]:
# Step 6: Assess Fairness Across Demographics

def print_accuracy_by_group(frame, group_col, label):
    """Print the prediction accuracy for every distinct value of one column.

    Args:
        frame: DataFrame holding the true labels in 'Disease', the model
            output in 'Prediction', and the grouping column.
        group_col: Name of the column whose distinct values define the groups.
        label: Text printed in front of each group value.

    Groups are visited in order of first appearance in `group_col`.
    """
    for value in frame[group_col].unique():
        group = frame[frame[group_col] == value]
        accuracy = accuracy_score(group['Disease'], group['Prediction'])
        print(f"{label} {value} - Accuracy: {accuracy}")


# Add predictions next to the demographic columns for evaluation
data['Prediction'] = y_pred

# Analyze performance by gender.
# 'Gender' was label-encoded in Step 2, so the groups print as integer codes.
print("\nPerformance by Gender:")
print_accuracy_by_group(data, 'Gender', 'Gender')

# Analyze performance by age group
print("\nPerformance by Age Group:")
print_accuracy_by_group(data, 'Age Group', 'Age Group')



Performance by Gender:
Gender 0 - Accuracy: 0.8409090909090909
Gender 1 - Accuracy: 0.8786127167630058

Performance by Age Group:
Age Group 19 - Accuracy: 1.0
Age Group 25 - Accuracy: 0.8571428571428571
Age Group 28 - Accuracy: 1.0
Age Group 29 - Accuracy: 0.9090909090909091
Age Group 30 - Accuracy: 0.8571428571428571
Age Group 31 - Accuracy: 1.0
Age Group 32 - Accuracy: 1.0
Age Group 35 - Accuracy: 0.8333333333333334
Age Group 38 - Accuracy: 0.9285714285714286
Age Group 39 - Accuracy: 1.0
Age Group 40 - Accuracy: 0.8064516129032258
Age Group 42 - Accuracy: 0.9375
Age Group 43 - Accuracy: 1.0
Age Group 45 - Accuracy: 0.7380952380952381
Age Group 48 - Accuracy: 1.0
Age Group 50 - Accuracy: 0.8529411764705882
Age Group 52 - Accuracy: 1.0
Age Group 55 - Accuracy: 1.0
Age Group 56 - Accuracy: 1.0
Age Group 57 - Accuracy: 1.0
Age Group 60 - Accuracy: 0.8
Age Group 65 - Accuracy: 0.8260869565217391
Age Group 70 - Accuracy: 0.875
Age Group 80 - Accuracy: 1.0
Age Group 85 - Accuracy: 1.0
Age 

**Step 7: Mitigate Bias**

In [None]:
# Step 7: Bias Mitigation with Fairlearn (Demographic Parity)
!pip install fairlearn  # Ensure Fairlearn is installed
!pip install scikit-learn  # Ensure scikit-learn is installed

from fairlearn.reductions import GridSearch, DemographicParity
from fairlearn.metrics import MetricFrame, selection_rate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

# Ensure target labels (y) are binary (0, 1)
print("Unique values in y before encoding:", np.unique(y))

# Handle missing values in y
y = np.nan_to_num(y, nan=0)  # Replace NaN with 0 for NumPy array

# Ensure y contains only binary values (0, 1) for binary classification
unique_values = np.unique(y)
print("Unique values in y after handling missing data:", unique_values)

if len(unique_values) > 2:
    print("Warning: y contains more than two classes, which may not be suitable for binary classification.")

# Convert target labels (y) to binary (0, 1) if they are not already binary
if not set(unique_values).issubset({0, 1}):
    # Assuming we want to make y binary (if it's not already binary)
    # We collapse it to two classes, 0 and 1 (e.g., mapping all classes to 1 except for the first one)
    y = (y != unique_values[0]).astype(int)

# Check the unique values again to ensure y is binary (0, 1)
print("Unique values in y after encoding:", np.unique(y))

# Define the sensitive feature (e.g., 'Gender') - Ensure 'Gender' is numeric (already encoded)
sensitive_feature = data['Gender']  # Ensure Gender is numeric (encoded earlier)

# Define the fairness constraint for demographic parity
fairness_constraint = DemographicParity()

# Initialize Fairlearn's GridSearch with the fairness constraint
mitigator = GridSearch(estimator=LogisticRegression(random_state=42, max_iter=1000),
                       constraints=fairness_constraint)

# Fit the mitigated model using GridSearch with fairness constraints
mitigator.fit(X, y, sensitive_features=sensitive_feature)

# **Obtain predictions directly from the GridSearch object**
y_mitigated_pred = mitigator.predict(X)

# Evaluate fairness using MetricFrame
metric_frame = MetricFrame(
    metrics={"accuracy": accuracy_score, "selection_rate": selection_rate},
    y_true=y,
    y_pred=y_mitigated_pred,
    sensitive_features=sensitive_feature
)

# Print fairness metrics by group
print("\nFairness Metrics by Group:")
print(metric_frame.by_group)

# Evaluate overall model performance
print("\nClassification Report After Bias Mitigation:")
print(classification_report(y, y_mitigated_pred))

# Calculate overall accuracy
accuracy = accuracy_score(y, y_mitigated_pred)
print(f"Overall Accuracy After Bias Mitigation: {accuracy:.2f}")


Unique values in y before encoding: [0 1]
Unique values in y after handling missing data: [0 1]
Unique values in y after encoding: [0 1]

Fairness Metrics by Group:
        accuracy  selection_rate
Gender                          
0        1.00000             1.0
1        0.99422             1.0

Classification Report After Bias Mitigation:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00       348

    accuracy                           1.00       349
   macro avg       0.50      0.50      0.50       349
weighted avg       0.99      1.00      1.00       349

Overall Accuracy After Bias Mitigation: 1.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Step 8: Compare and Report Results**

In [None]:
# Step 8: Compare Results
from sklearn.metrics import f1_score

# Each set of predictions must be scored against the labels it was trained on:
#   * y_pred (Step 5) holds multiclass disease codes, so its ground truth is
#     data['Disease']. Step 7 rebinds `y` to a binary target, so scoring
#     y_pred against `y` here would compare multiclass codes with 0/1 labels.
#   * y_mitigated_pred (Step 7) is binary, so its ground truth is the binary `y`.
# NOTE: the two scores therefore describe different tasks (multiclass vs.
# binary) and are not a like-for-like before/after comparison.
f1_before = f1_score(data['Disease'], y_pred, average='weighted', zero_division=0)
f1_after = f1_score(y, y_mitigated_pred, average='weighted', zero_division=0)

# Compare F1-scores
print("\nF1 Score Before Mitigation:", f1_before)
print("F1 Score After Mitigation:", f1_after)



F1 Score Before Mitigation: 0.041233840483204866
F1 Score After Mitigation: 0.9957040612037673
