# In [None]:  (Jupyter notebook cell marker left over from export)
import os 
import torch 
import numpy as np 
import pandas as pd

from math import ceil
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from mmpfn.models.tabpfn_v2 import TabPFNClassifier

# Number of principal components kept from the image embeddings. The PCA
# output is wrapped in a DataFrame with default integer column labels, so the
# image features are addressed as columns 0 .. pca_dim - 1.
pca_dim = 128

# Model inputs: the tabular columns first, then the integer labels of the
# PCA-reduced image features. The order matters because the categorical
# indices below are positions within this list.
col_features = [
    "Age",
    "Breed1",
    "Breed2",
    "Color1",
    "Color2",
    "Color3",
    "Dewormed",
    "Fee",
    "FurLength",
    "Gender",
    "Health",
    "MaturitySize",
    "PhotoAmt",
    "State",
    "Sterilized",
    "Type",
    "Vaccinated",
    "VideoAmt",
    "Quantity",
] + list(range(pca_dim))

# Identifier / free-text columns that are not part of col_features.
# The rescuer column is spelled "RescuerID" in the PetFinder data; the
# previous spelling "RescureID" could never match it.
col_exclude = [
    "PetID",
    "RescuerID",
    "Description",
    "Name",
]

# Label column to predict.
col_target = "AdoptionSpeed"

# Subset of col_features that should be treated as categorical.
cat_features = [
    "Breed1",
    "Breed2",
    "Color1",
    "Color2",
    "Color3",
    "Dewormed",
    "FurLength",
    "Gender",
    "Health",
    "MaturitySize",
    "State",
    "Sterilized",
    "Type",
    "Vaccinated",
]

# Positional indices of the categorical columns inside col_features; this is
# what gets passed to the classifier as categorical_features_indices.
cat_features_index = [col_features.index(feature) for feature in cat_features]

# Root directory of the PetFinder adoption-prediction dataset. Defined before
# first use so the CSV path and the image directory share one source of truth.
datasets_dir = "datasets/petfinder-adoption-prediction"

train = pd.read_csv(os.path.join(datasets_dir, "train", "train.csv"))
train["PetID"] = train["PetID"].astype(str)

# Build the ID lookup once; testing membership against a set is O(1) per file
# instead of scanning the whole PetID array for every image.
_known_pet_ids = set(train["PetID"])

# Image files are named "<PetID>-<ImageNumber>.jpg"; keep only the JPEGs whose
# PetID prefix appears in the training table.
train_images = [
    f
    for f in os.listdir(os.path.join(datasets_dir, "train_images"))
    if f.endswith(".jpg") and f.split("-")[0] in _known_pet_ids
]
train_images_df = pd.DataFrame(
    {
        "PetID": [f.split("-")[0] for f in train_images],
        "ImageNumber": [f.split("-")[1].split(".")[0] for f in train_images],
    }
)

# Only the first photo of each pet is used.
train_images_df = train_images_df[train_images_df["ImageNumber"] == "1"]

# Left-merge then drop rows without a match: this keeps exactly the pets that
# have a "-1.jpg" photo while preserving the row order of the training table.
train = train.merge(train_images_df, on="PetID", how="left")
# .copy() makes the filtered frame independent, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
train = train[train["ImageNumber"].notna()].copy()
train["ImagePath"] = train["PetID"] + "-1.jpg"

# Precomputed image embeddings, stored as a NumPy array on disk.
file_path = "train_image_features.npy"
try:
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
    # objects, which can execute code. Only load files from a trusted source.
    image_features = np.load(file_path, allow_pickle=True)
except (OSError, ValueError, EOFError) as e:
    # Fail here with the real cause instead of printing it and crashing later
    # with a NameError on the undefined `image_features`.
    raise RuntimeError(
        f"Could not load image features from {file_path!r}"
    ) from e

# Keep the entry at index 0 along axis 1, turning the 3-D array into a 2-D
# (rows, features) matrix.
# NOTE(review): presumably index 0 is a CLS/global token embedding - confirm.
image_features = image_features[:, 0, :]

# The features are attached to `train` purely by row position, so the two must
# have the same number of rows; otherwise pd.concat(axis=1) would silently pad
# the shorter side with NaN.
train = train.reset_index(drop=True)
if len(image_features) != len(train):
    raise ValueError(
        f"Image feature rows ({len(image_features)}) do not match "
        f"training rows ({len(train)}); cannot align them by position."
    )

# Reduce the embeddings to pca_dim components. Wrapping the result in a
# DataFrame yields integer column labels 0 .. pca_dim - 1.
pca = PCA(n_components=pca_dim)
image_features_pca = pca.fit_transform(image_features)
train = pd.concat([train, pd.DataFrame(image_features_pca)], axis=1)

# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split (and therefore the
# reported accuracy) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    train[col_features], train[col_target], test_size=0.2
)

# Replace missing feature values with 0 on BOTH splits so the test data gets
# the same preprocessing as the training data. fillna returns new frames,
# which also avoids assigning into a slice of `train`.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Convert to tensors for the classifier. X_test_ was previously never
# created, which made the predict call below fail with a NameError.
X_train_ = torch.tensor(X_train.to_numpy())
y_train_ = torch.tensor(y_train.to_numpy())
X_test_ = torch.tensor(X_test.to_numpy())

# Fit TabPFN on the GPU using the checkpoint under ./parameters (resolved
# against the current working directory).
model = TabPFNClassifier(
    random_state=0,
    device="cuda",
    ignore_pretraining_limits=True,
    categorical_features_indices=cat_features_index,
    model_path=f"{Path().absolute()}/parameters/tabpfn-v2-classifier.ckpt",
).fit(X_train_, y_train_)

y_pred = model.predict(X_test_)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)