# Feature Engineering for Ocular Disease Dataset

In [1]:
import pandas as pd

# Load the dataset (Update the path if necessary)
dataset_path = "ocular-disease-recognition/full_df.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(dataset_path)

# Display basic dataset information
print("Dataset loaded successfully. Shape:", df.shape)
df.head()


Dataset loaded successfully. Shape: (6392, 19)


Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [2]:
# Drop non-informative columns (ID column)
df_cleaned = df.drop(columns=['ID'], errors='ignore')

print("Dropped ID column. New shape:", df_cleaned.shape)
df_cleaned.head()


Dropped ID column. New shape: (6392, 18)


Unnamed: 0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


### **Dropping Irrelevant Columns**
- The **ID** column is a unique identifier and does not provide predictive value.
- Removing it prevents unnecessary noise in our model.

In [3]:
# Handling missing values by filling with median for numerical and mode for categorical
for col in df_cleaned.columns:
    if df_cleaned[col].dtype == 'object':
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode()[0])  # Assign back
    else:
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())  # Assign back

print("Missing values filled using mode (categorical) and median (numerical).")
df_cleaned.isnull().sum()


Missing values filled using mode (categorical) and median (numerical).


Patient Age                  0
Patient Sex                  0
Left-Fundus                  0
Right-Fundus                 0
Left-Diagnostic Keywords     0
Right-Diagnostic Keywords    0
N                            0
D                            0
G                            0
C                            0
A                            0
H                            0
M                            0
O                            0
filepath                     0
labels                       0
target                       0
filename                     0
dtype: int64

### **Handling Missing Values**
- Numerical features are filled using the **median**, which is robust against outliers.
- Categorical features are filled with the **mode** (most frequent value), ensuring consistency.
- This prevents missing data from impacting model training.

In [4]:
from sklearn.preprocessing import LabelEncoder

# Columns to exclude from encoding and scaling
exclude_cols = ['Left-Fundus', 'Right-Fundus']

# Encode categorical columns
df_encoded = df_cleaned.copy()
label_encoders = {}

for col in df_encoded.select_dtypes(include=['object']).columns.difference(exclude_cols):
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le  # Store encoder for later use

print("Categorical variables encoded.")
df_encoded.head()


Categorical variables encoded.


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,69,0,0_left.jpg,0_right.jpg,4,151,0,0,0,1,0,0,0,0,1,6,7,1
1,57,1,1_left.jpg,1_right.jpg,146,151,1,0,0,0,0,0,0,0,1169,6,7,1169
2,42,1,2_left.jpg,2_right.jpg,76,110,0,1,0,0,0,0,0,1,2920,2,6,2920
3,53,1,4_left.jpg,4_right.jpg,94,102,0,1,0,0,0,0,0,1,5432,2,6,5432
4,50,0,5_left.jpg,5_right.jpg,119,110,0,1,0,0,0,0,0,0,5626,2,6,5626


### **Encoding Categorical Variables**
- **Label Encoding** is applied to categorical variables, converting them into numerical representations.
- This ensures that models can process categorical information effectively.
- Label encoders are stored for future use (e.g., inverse transforming predictions).

In [5]:
from sklearn.preprocessing import StandardScaler

# Columns to exclude from encoding and scaling
exclude_cols = ['Left-Fundus', 'Right-Fundus', 'labels']

# Select numerical columns excluding the specified columns
numerical_cols = df_encoded.select_dtypes(include=['number']).columns.difference(exclude_cols)

scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[numerical_cols] = scaler.fit_transform(df_scaled[numerical_cols])

print("Numerical features standardized.")
df_scaled.head()

Numerical features standardized.


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0.950134,-1.074076,0_left.jpg,0_right.jpg,-2.358246,0.669366,-0.699735,-0.705199,-0.257336,3.860116,-0.229189,-0.181108,-0.224231,-0.574942,-1.731238,6,0.726358,-1.731238
1,-0.073161,0.931033,1_left.jpg,1_right.jpg,0.613359,0.669366,1.429112,-0.705199,-0.257336,-0.25906,-0.229189,-0.181108,-0.224231,-0.574942,-1.098248,6,0.726358,-1.098248
2,-1.35228,0.931033,2_left.jpg,2_right.jpg,-0.851516,-0.183236,-0.699735,1.418039,-0.257336,-0.25906,-0.229189,-0.181108,-0.224231,1.739307,-0.149305,2,0.308344,-0.149305
3,-0.41426,0.931033,4_left.jpg,4_right.jpg,-0.474834,-0.349598,-0.699735,1.418039,-0.257336,-0.25906,-0.229189,-0.181108,-0.224231,1.739307,1.212056,2,0.308344,1.212056
4,-0.670083,-1.074076,5_left.jpg,5_right.jpg,0.048336,-0.183236,-0.699735,1.418039,-0.257336,-0.25906,-0.229189,-0.181108,-0.224231,-0.574942,1.317193,2,0.308344,1.317193


### **Feature Scaling**
- **Standardization (Z-score normalization)** is applied to numerical features.
- This ensures that features are on the same scale, preventing some from dominating the model.
- Standardization is beneficial for algorithms that rely on distance measures (e.g., SVM, k-NN).

In [8]:
from sklearn.feature_selection import VarianceThreshold

# Columns to exclude from encoding and scaling
exclude_cols = ['Left-Fundus', 'Right-Fundus', 'labels']

# Select numerical columns excluding the specified columns
numerical_cols = df_encoded.select_dtypes(include=['number']).columns.difference(exclude_cols)

selector = VarianceThreshold(threshold=0.01)  # Remove features with low variance
df_selected = pd.DataFrame(selector.fit_transform(df_scaled[numerical_cols]), columns=numerical_cols[selector.get_support()])

# Add back the excluded columns
df_selected = pd.concat([df_selected, df_encoded[exclude_cols]], axis=1)

print("Low-variance features removed. New shape:", df_selected.shape)
df_selected.head()


Low-variance features removed. New shape: (6392, 18)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,A,C,D,G,H,Left-Diagnostic Keywords,M,N,O,Patient Age,Patient Sex,Right-Diagnostic Keywords,filename,filepath,target,Left-Fundus,Right-Fundus,labels
0,-0.229189,3.860116,-0.705199,-0.257336,-0.181108,-2.358246,-0.224231,-0.699735,-0.574942,0.950134,-1.074076,0.669366,-1.731238,-1.731238,0.726358,0_left.jpg,0_right.jpg,6
1,-0.229189,-0.25906,-0.705199,-0.257336,-0.181108,0.613359,-0.224231,1.429112,-0.574942,-0.073161,0.931033,0.669366,-1.098248,-1.098248,0.726358,1_left.jpg,1_right.jpg,6
2,-0.229189,-0.25906,1.418039,-0.257336,-0.181108,-0.851516,-0.224231,-0.699735,1.739307,-1.35228,0.931033,-0.183236,-0.149305,-0.149305,0.308344,2_left.jpg,2_right.jpg,2
3,-0.229189,-0.25906,1.418039,-0.257336,-0.181108,-0.474834,-0.224231,-0.699735,1.739307,-0.41426,0.931033,-0.349598,1.212056,1.212056,0.308344,4_left.jpg,4_right.jpg,2
4,-0.229189,-0.25906,1.418039,-0.257336,-0.181108,0.048336,-0.224231,-0.699735,-0.574942,-0.670083,-1.074076,-0.183236,1.317193,1.317193,0.308344,5_left.jpg,5_right.jpg,2


### **Feature Selection (Low Variance Removal)**
- Features with very low variance contribute little to the predictive power of the model.
- Using **VarianceThreshold**, we remove features that have nearly constant values across samples.
- This helps improve computational efficiency and avoids overfitting.

In [7]:
# Save cleaned and processed dataset
df_selected.to_csv("processed_ocular_disease.csv", index=False)
print("Feature engineering complete. Processed dataset saved as 'processed_ocular_disease.csv'.")


Feature engineering complete. Processed dataset saved as 'processed_ocular_disease.csv'.


### **Final Processed Dataset**
- The final dataset is saved as `processed_ocular_disease.csv` for use in modeling.
- All irrelevant columns have been dropped, missing values handled, categorical features encoded, and numerical values standardized.
- The dataset is now **ready for machine learning!** 