In [None]:
import os
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18, ResNet18_Weights
from PIL import Image
import numpy as np
import pandas as pd


In [None]:
# Run on Apple's MPS backend when PyTorch reports it as available, otherwise on the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained ResNet18 with its last child module (the classification layer) dropped,
# so a forward pass returns pooled features instead of class scores.
backbone = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
backbone_layers = list(backbone.children())
model = torch.nn.Sequential(*backbone_layers[:-1])
model.eval()


In [None]:


# Set device: prefer CUDA, then Apple MPS, and fall back to the CPU.
# Checking only MPS would leave CUDA machines running on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load pretrained ResNet18 & remove classification head
model = resnet18(weights=ResNet18_Weights.DEFAULT).to(device)
model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove last layer
model.eval()  # inference mode: freezes batch-norm statistics

# Define transformation (ResNet format): fixed 224x224 input, tensor conversion,
# then per-channel normalisation with the mean/std values listed below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

#  extract image features
def extract_features(image_path):
    """Return the feature vector the headless ResNet18 produces for one image.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        A 1-D NumPy array holding the model output for the image, flattened
        and moved to the CPU.
    """
    # The context manager closes the underlying file as soon as the pixel data
    # has been converted; a bare Image.open() leaves the handle open, which
    # adds up when looping over a whole directory.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert("RGB")

    # Add a leading batch dimension of 1 and move the tensor to the model's device.
    batch = transform(rgb_image).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no gradient bookkeeping
        features = model(batch).flatten().cpu().numpy()
    return features

# Process all images
image_dir = "/Users/tony/Documents/research_projects/rest_net_tab/ocular/tab_rest/processed_images"
image_features_path = "/Users/tony/Documents/research_projects/rest_net_tab/ocular/tab_rest/image_feature.csv"
image_extensions = ('.png', '.jpg', '.jpeg')

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the saved CSV reproducible from run to run.
image_features = {
    img_name: extract_features(os.path.join(image_dir, img_name))
    for img_name in sorted(os.listdir(image_dir))
    if img_name.lower().endswith(image_extensions)
}

# Convert to DataFrame (one row per image, indexed by file name) & save for future use
image_features_df = pd.DataFrame.from_dict(image_features, orient='index')
image_features_df.index.name = 'image_id'
image_features_df.to_csv(image_features_path)

# Report the file name that was actually written (the old message named a different file).
print(f" Image feature extraction complete! Saved as '{os.path.basename(image_features_path)}'.")


In [None]:
#Load the metadata and merge with Images features
import pandas as pd
# Read the per-patient metadata table.
metadata = pd.read_csv('/Users/tony/Documents/research_projects/rest_net_tab/ocular/tab_rest/full_df.csv')

# Columns repeated unchanged on every row the melt produces.
melt_id_columns = [
    "ID", "Patient Age", "Patient Sex",
    "Left-Diagnostic Keywords", "Right-Diagnostic Keywords",
    "N", "D", "G", "C", "A", "H", "M", "O",
    "filepath", "labels", "target", "filename",
]

# Unpivot the two fundus columns: each input row becomes one row per eye, with the
# image file name in "image_id" and the source column name in "Eye".
# NOTE(review): "filename", "labels" and "target" are carried onto both eye rows;
# if they describe a single image in full_df.csv, half of the melted rows pair a
# label with the other eye's image. Confirm against the CSV.
metadata = metadata.melt(
    id_vars=melt_id_columns,
    value_vars=["Left-Fundus", "Right-Fundus"],
    var_name="Eye",
    value_name="image_id",
)

# Force the join key to string dtype.
metadata["image_id"] = metadata["image_id"].astype(str)

# Read back the saved image features (image_id plus one column per feature).
image_features = pd.read_csv("/Users/tony/Documents/research_projects/rest_net_tab/ocular/tab_rest/image_feature.csv")

# Left join keeps every metadata row; rows without a matching image get NaN features.
df = metadata.merge(image_features, how="left", on="image_id")
df = df.drop(columns=["filepath"])

# Preview the merged dataset.
df.head()


In [None]:
# Discard rows in which every column is NaN, then preview the result.
df = df.dropna(how="all")
df.head()

In [None]:
# Rows where at least one of the last 512 columns of df is null.
feature_block = df.iloc[:, -512:]
has_missing_feature = feature_block.isna().any(axis="columns")
missing_feature_rows = df.loc[has_missing_feature]
print("Rows with missing features:", missing_feature_rows.shape[0])


In [None]:
# Distinct image ids among the rows flagged as having missing features.
missing_image_ids = missing_feature_rows["image_id"]
missing_images = missing_image_ids.unique()
print("Missing images:", missing_images)


In [None]:
# Column-wise means of every numeric column, used as the fill value for that column.
num = df.select_dtypes(include=['number']).mean()

# fillna(..., inplace=True) returns None, which left proc_data empty; build the
# filled frame explicitly and bind both names to it so later cells that read df
# still see the imputed data.
# NOTE(review): this also mean-fills missing image features and label columns —
# confirm that is intended rather than dropping those rows.
df = df.fillna(num)
proc_data = df

In [None]:
# Show the current column labels of df.
merged_columns = df.columns
print(merged_columns)

In [None]:
# df[''].unique()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from tabpfn import TabPFNClassifier


# Features: Metadata + Image features
feature_columns = ['Patient Age', 'Patient Sex'] + [str(i) for i in range(512)]  # Adjust for your ResNet features
# .copy() gives an independent frame, so the encoding below neither warns about
# chained assignment nor risks writing through to df.
X = df[feature_columns].copy()

# Encode 'Patient Sex' (0 = male, 1 = female). Both the abbreviations and the
# spelled-out labels used by the dfa encoding cell are accepted; with only
# 'M'/'F' mapped, 'Male'/'Female' values would silently become NaN.
sex_codes = {'M': 0, 'F': 1, 'Male': 0, 'Female': 1}
X['Patient Sex'] = X['Patient Sex'].map(sex_codes)

# Targets: All diagnostic labels
target_columns = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']
y = df[target_columns]



In [None]:
# Columns left out of the modelling frame dfa.
dropped_columns = [
    'ID',
    'Left-Diagnostic Keywords',
    'Right-Diagnostic Keywords',
    'filename',
    'labels',
]
dfa = df.drop(columns=dropped_columns)

In [None]:
# Encoding of the categorical 'Patient Sex' column (0 = male, 1 = female).
# The feature-selection cell maps the abbreviations 'M'/'F' while this cell mapped
# only 'Male'/'Female'; accepting both spellings keeps the two encodings in
# agreement and avoids silent NaN for whichever form the CSV uses.
sex_codes = {'Male': 0, 'Female': 1, 'M': 0, 'F': 1}
dfa['Patient Sex'] = df['Patient Sex'].map(sex_codes)

In [None]:
import ast

# Each 'target' cell is parsed from its string form and reduced to the index of
# its largest entry. ast.literal_eval only parses Python literals, so text read
# from the CSV is never executed as code (which eval would do).
dfa['target'] = df['target'].apply(lambda raw: np.argmax(ast.literal_eval(raw)))

In [None]:
# List the columns of dfa that still have object dtype.
object_columns = dfa.select_dtypes(include=['object'])
object_columns.columns


In [None]:
# Remove the per-disease label columns, which must not be used as training inputs.
# drop() returns a new frame, so the result has to be bound back to dfa; without
# the assignment these columns stay in dfa and end up among the scaled/PCA
# features, leaking the labels into the model inputs.
# 'Eye' and 'image_id' are left for the next cell, which drops them itself.
# errors="ignore" keeps this cell safe to re-run.
dfa = dfa.drop(columns=['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O'], errors="ignore")
dfa.head()

In [None]:
# Remove the eye-side and image-name columns from dfa in place.
non_feature_columns = ['Eye', 'image_id']
dfa.drop(columns=non_feature_columns, inplace=True)

# further processing of dataset

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every column of dfa other than the target and the two patient fields.
excluded_from_scaling = ['target', 'Patient Age', 'Patient Sex']
features = dfa.drop(columns=excluded_from_scaling)

scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Wrap the scaled array in a frame that keeps the original column labels.
scaled_df = pd.DataFrame(data=scaled_features, columns=features.columns)

In [None]:
# Apply PCA to reduce the scaled features to n_pca_components principal components.
# The count is defined once so the model and the column labels cannot drift apart.
n_pca_components = 98  # Adjust based on the dataset

# random_state pins the decomposition when scikit-learn picks a randomized SVD
# solver, so repeated runs yield the same components.
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(scaled_df)

# Convert PCA output to DataFrame
X_pca_df = pd.DataFrame(X_pca, columns=[f"PCA_{i}" for i in range(n_pca_components)])

# Check how much variance is retained
explained_variance = np.sum(pca.explained_variance_ratio_)
print(f"Explained Variance: {explained_variance:.4f}")  # Should be high (e.g., >90%)


In [None]:
# Add the target and the two patient columns back next to the PCA components.
# X_pca_df was built from a bare array and therefore has a fresh 0..n-1 index,
# while dfa keeps whatever index survived earlier row filtering. Assigning by
# position (same row order) avoids label-based alignment, which would insert NaN
# or attach the wrong rows if dfa's index had gaps.
assert len(X_pca_df) == len(dfa), "PCA output and dfa must have the same number of rows"
for column in ("target", "Patient Age", "Patient Sex"):
    X_pca_df[column] = dfa[column].to_numpy()
dfe = X_pca_df

In [None]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(sampling_strategy="auto",random_state=42)
# x_train_s,y_train_s = smote.fit_resample(X_train,y_train)

In [None]:

from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# The class label is the "target" column; every other column of dfe is a predictor.
y = dfe["target"]
X = dfe.drop(columns=["target"])

# Stratified split: 1000 rows for training, all remaining rows for testing.
split_parts = train_test_split(X, y, train_size=1000, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts



In [None]:

# Fit a TabPFN classifier on the training split, on the CPU and with the
# pretraining-limit check switched off.
tabpfn_options = dict(device="cpu", ignore_pretraining_limits=True)
tabr = TabPFNClassifier(**tabpfn_options)
tabr.fit(X_train, y_train)


In [None]:
# Predict the held-out rows and report plain accuracy.
pred = tabr.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print("Accuracy", accuracy)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Draw the confusion matrix of the held-out predictions and title the figure.
confusion_display = ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=pred)
plt.title("Hybrid Model Confusion Matrix")
plt.show()

In [None]:
# Multi-label variant: one TabPFN classifier per diagnostic flag in target_columns.
# The earlier split only produced a single-class y, and X_train_reduced /
# y_train_reduced were never defined, so the per-flag targets are built here for
# the same rows as X_train / X_test.
# NOTE(review): this assumes X_train / X_test still carry the default positional
# row labels of the PCA frame, which was built row-for-row from df — confirm
# before trusting the scores.
y_multilabel = df[target_columns].astype(int)
y_train_multi = y_multilabel.iloc[X_train.index.to_numpy()]
y_test_multi = y_multilabel.iloc[X_test.index.to_numpy()]

# Train
classifier = MultiOutputClassifier(TabPFNClassifier(device='cpu'))
classifier.fit(X_train, y_train_multi)

# Evaluate (example: F1-score per class)
from sklearn.metrics import f1_score
y_pred = classifier.predict(X_test)
for i, col in enumerate(target_columns):
    # Column i of the prediction matrix corresponds to target_columns[i].
    print(f"F1 for {col}: {f1_score(y_test_multi.iloc[:, i], y_pred[:, i])}")