# Feature Engineering for Ocular Disease Dataset

In [1]:
import pandas as pd

# Relative location of the raw CSV; adjust if the data lives elsewhere.
dataset_path = "ocular-disease-recognition/full_df.csv"

# Parse the CSV into the raw DataFrame that every later cell derives from.
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Report the (rows, columns) shape, then preview the first five rows.
print("Dataset loaded successfully. Shape:", df.shape)
df.head(5)






In [2]:
# Remove the 'ID' identifier column. errors='ignore' makes this a no-op
# (instead of a KeyError) when the column is absent, so the cell can be re-run.
df_cleaned = df.drop('ID', axis=1, errors='ignore')

print("Dropped ID column. New shape:", df_cleaned.shape)
df_cleaned.head(5)






### **Dropping Irrelevant Columns**
- The **ID** column is a unique identifier and does not provide predictive value.
- Removing it prevents unnecessary noise in our model.

In [3]:
# Impute missing values column by column:
#   - numeric columns     -> median
#   - every other dtype   -> mode (most frequent value)
# The numeric check uses pandas' dtype helper rather than `dtype == 'object'`,
# so non-numeric columns that are not plain `object` (e.g. category or a
# dedicated string dtype) are routed to the mode instead of failing in median().
for col in df_cleaned.columns:
    column = df_cleaned[col]

    # Nothing to impute in this column.
    if not column.isna().any():
        continue

    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        # mode() drops NaN, so a column with no observed values has no mode;
        # indexing [0] on the empty result would raise.
        if modes.empty:
            continue
        fill_value = modes.iloc[0]

    # A numeric column with no observed values yields a NaN median; filling
    # with it would be a no-op, so leave the column untouched.
    if pd.isna(fill_value):
        continue

    df_cleaned[col] = column.fillna(fill_value)  # Assign back

print("Missing values filled using mode (categorical) and median (numerical).")
# Remaining null count per column (non-zero only for columns with no observed values).
df_cleaned.isnull().sum()






### **Handling Missing Values**
- Numerical features are filled using the **median**, which is robust against outliers.
- Categorical features are filled with the **mode** (most frequent value), ensuring consistency.
- This prevents missing data from impacting model training.

In [4]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so df_cleaned stays available in its un-encoded form.
df_encoded = df_cleaned.copy()

# One dedicated LabelEncoder per object-dtype column, keyed by column name,
# kept so the integer codes can be mapped back to the original values later.
label_encoders = {
    name: LabelEncoder()
    for name in df_encoded.select_dtypes(include=['object']).columns
}

# Fit each encoder on its column and replace the column with the integer codes.
for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df_encoded[col])

print("Categorical variables encoded.")
df_encoded.head(5)








### **Encoding Categorical Variables**
- **Label Encoding** is applied to categorical variables, converting them into numerical representations.
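- This lets models consume categorical information, but the integer codes are arbitrary: they imply an ordering that does not exist in the data, which can mislead linear and distance-based models.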
- Label encoders are stored for future use (e.g., inverse transforming predictions).

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns left out of scaling. Note: any of these that were object dtype have
# already been label-encoded by the encoding step above, so at this point they
# are only exempt from standardization, not from encoding.
exclude_cols = ['Left-Fundus', 'Right-Fundus', 'labels']

# Every numeric column of the encoded frame except the excluded ones.
# Index.difference returns the names in sorted order.
numeric_frame = df_encoded.select_dtypes(include=['number'])
numerical_cols = numeric_frame.columns.difference(exclude_cols)

# Standardize on a copy so df_encoded keeps the unscaled values.
df_scaled = df_encoded.copy()
scaler = StandardScaler()
scaler.fit(df_scaled[numerical_cols])
df_scaled[numerical_cols] = scaler.transform(df_scaled[numerical_cols])

print("Numerical features standardized.")
df_scaled.head(5)





### **Feature Scaling**
- **Standardization (Z-score normalization)** is applied to numerical features.
- This ensures that features are on the same scale, preventing some from dominating the model.
- Standardization is beneficial for algorithms that rely on distance measures (e.g., SVM, k-NN).

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Columns carried through untouched by variance-based selection. They are
# re-attached from df_encoded below, i.e. in their unscaled form.
exclude_cols = ['Left-Fundus', 'Right-Fundus', 'labels']

# Candidate features: every numeric column except the excluded ones.
numerical_cols = df_encoded.select_dtypes(include=['number']).columns.difference(exclude_cols)

# Fit the variance filter on the PRE-scaling values. After StandardScaler every
# non-constant column has unit variance, so thresholding the scaled data could
# only ever remove exactly-constant columns and the 0.01 threshold would have
# no effect on near-constant ones.
selector = VarianceThreshold(threshold=0.01)  # Remove features with low variance
selector.fit(df_encoded[numerical_cols])
kept_cols = numerical_cols[selector.get_support()]

# Take the surviving columns from the scaled frame. Selecting from the
# DataFrame (instead of rebuilding one from a bare array) keeps the original
# row index, so the index-aligned concat below cannot misalign rows.
df_selected = df_scaled[kept_cols].copy()

# Add back the excluded columns
df_selected = pd.concat([df_selected, df_encoded[exclude_cols]], axis=1)

print("Low-variance features removed. New shape:", df_selected.shape)
df_selected.head()




### **Feature Selection (Low Variance Removal)**
- Features with very low variance contribute little to the predictive power of the model.
- Using **VarianceThreshold**, we remove features that have nearly constant values across samples.
- This improves computational efficiency and can help reduce overfitting.

In [13]:
# Persist the engineered feature table to CSV; index=False omits the row index
# so the file contains only the feature columns.
output_csv = "processed_ocular_disease.csv"
df_selected.to_csv(output_csv, index=False)
print("Feature engineering complete. Processed dataset saved as 'processed_ocular_disease.csv'.")




### **Final Processed Dataset**
- The final dataset is saved as `processed_ocular_disease.csv` for use in modeling.
- All irrelevant columns have been dropped, missing values handled, categorical features encoded, and numerical values standardized.
- The dataset is now **ready for machine learning!** 