In [2]:
import zipfile
import os

with zipfile.ZipFile("images.zip", "r") as zip_ref:
    zip_ref.extractall(".")

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

import xgboost as xgb

In [4]:


IMG_SIZE = (128, 128)
image_dir = 'images'

image_data = []
image_labels = []
image_filenames = []

for class_label in ['infected', 'not_infected']:
    folder_path = os.path.join(image_dir, class_label)
    for i, filename in enumerate(os.listdir(folder_path)):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            filepath = os.path.join(folder_path, filename)
            img = load_img(filepath, target_size=IMG_SIZE)
            img_array = img_to_array(img) / 255.0
            image_data.append(img_array)
            image_labels.append(1 if class_label == 'infected' else 0)
            image_filenames.append(f"{class_label}_{i}")

# Create a labels dataframe
labels_df = pd.DataFrame({
    'filename': image_filenames,
    'label': image_labels
})

X_img = np.array(image_data)
y_img = np.array(image_labels)


In [5]:
df = pd.read_csv('hormone_data.csv')

# Binary columns
binary_cols = ['Pregnant(Y/N)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
               'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
               'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'PCOS (Y/N)']
ri_cols = ['Cycle(R/I)']

# Replace values
df[binary_cols] = df[binary_cols].replace({'Y': 1, 'N': 0})
df[ri_cols] = df[ri_cols].replace({'R': 1, 'I': 0})

# Drop non-numeric
df = df.drop(columns=['Blood Group'])

# Clean missing values
df = df.replace("?", np.nan).dropna()
df = df.apply(pd.to_numeric, errors='coerce').dropna()

X_tab = df.drop(columns=['PCOS (Y/N)'])
y_tab = df['PCOS (Y/N)'].astype(int).values

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_tab_scaled = scaler.fit_transform(X_tab)

# Now align both datasets to the minimum length
min_len = min(len(X_img), len(X_tab_scaled))
X_img, y_img = X_img[:min_len], y_img[:min_len]
X_tab_scaled, y_tab = X_tab_scaled[:min_len], y_tab[:min_len]


In [6]:
X_img_train, X_img_test, y_img_train, y_img_test = train_test_split(X_img, y_img, test_size=0.2, random_state=42)

cnn_model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D(),
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_img_train, y_img_train, epochs=10, batch_size=32, validation_split=0.1)

# Predict probabilities
cnn_preds = cnn_model.predict(X_img_test).flatten()



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# -------------------- STEP 4: TRAIN XGBOOST FOR HORMONE DATA --------------------
X_tab_train, X_tab_test, y_tab_train, y_tab_test = train_test_split(X_tab_scaled, y_tab, test_size=0.2, random_state=42)

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_tab_train, y_tab_train)

xgb_preds = xgb_model.predict_proba(X_tab_test)[:, 1]

# -------------------- STEP 5: LATE FUSION (Weighted Average) --------------------
final_preds = (0.5 * cnn_preds) + (0.5 * xgb_preds)
final_labels = (final_preds > 0.5).astype(int)

# -------------------- EVALUATION --------------------
acc = accuracy_score(y_tab_test, final_labels)
print("Late Fusion Accuracy:", acc)

Late Fusion Accuracy: 0.5035971223021583


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# Save models in Colab
cnn_model.save('cnn_model_late.h5')

import pickle
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

with open('xgb_feature_names.pkl', 'wb') as f:
    pickle.dump(X_tab.columns.tolist(), f)
    

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [12]:
import pandas as pd

# Fill in values for one patient — make sure to provide ALL 43 feature values in correct order
data = {
    'Age (yrs)': 28,
    'Weight (Kg)': 44.6,
    'Height(Cm)': 152,
    'BMI': 19.3,
    'Pulse rate(bpm) ': 78,
    'RR (breaths/min)': 22,
    'Hb(g/dl)': 10.48,
    'Cycle(R/I)': 'R',                # Again, encode as used during training
    'Cycle length(days)': 5,
    'Marraige Status (Yrs)': 7,
    'Pregnant(Y/N)': 0,
    'No. of abortions': 0,
    '  I   beta-HCG(mIU/mL)': 1.99,
    'II    beta-HCG(mIU/mL)': 1.99,
    'FSH(mIU/mL)': 7.95,
    'LH(mIU/mL)': 3.68,
    'FSH/LH': 2.16,
    'Hip(inch)': 36,
    'Waist(inch)': 30,
    'Waist:Hip Ratio': 0.83,
    'TSH (mIU/L)': 0.68,
    'AMH(ng/mL)': 2.07,
    'PRL(ng/mL)': 45.16,
    'Vit D3 (ng/mL)': 17.1,
    'PRG(ng/mL)': 0.57,
    'RBS(mg/dl)': 92,
    'Weight gain(Y/N)': 0,
    'hair growth(Y/N)': 0,
    'Skin darkening (Y/N)': 0,
    'Hair loss(Y/N)': 0,
    'Pimples(Y/N)': 0,
    'Fast food (Y/N)': 1,
    'Reg.Exercise(Y/N)': 0,
    'BP _Systolic (mmHg)': 110,
    'BP _Diastolic (mmHg)': 80,
    'Follicle No. (L)': 3,
    'Follicle No. (R)': 3,
    'Avg. F size (L) (mm)': 18,
    'Avg. F size (R) (mm)': 18,
    'Endometrium (mm)': 8.5,
    'PCOS (Y/N)': 0
}

# Create DataFrame
df = pd.DataFrame([data])

# Save to CSV
df.to_csv("hormone_sample_xgb.csv", index=False)