In [None]:
import pandas as pd
import numpy as np
import dotenv
import os
from pathlib import Path
from ktools.utils.find_kaggle_datasets import find_competition_info

In [None]:
# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Fail early with an actionable message: os.getenv would return None for a
# missing variable and Path(None) then raises an opaque TypeError.
if "DATA_DIR" not in os.environ:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; export it or add it to a .env file."
    )
data_dir = Path(os.environ["DATA_DIR"])

# find_competition_info yields four values, unpacked here as the train / test /
# sample-submission CSV paths and the name of the target column.
train_csv_path, test_csv_path, sample_sub_csv_path, target_col_name = find_competition_info(data_dir / "diabetes_prediction")

# The first CSV column becomes the DataFrame index.
train_df = pd.read_csv(train_csv_path, index_col=0)
test_df = pd.read_csv(test_csv_path, index_col=0)

In [None]:
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score


def adversarial_validation(train: pd.DataFrame, test: pd.DataFrame):
    """Cross-validate a classifier that tries to tell ``train`` rows from ``test`` rows.

    Rows of ``train`` are labelled 1 and rows of ``test`` are labelled 0. The
    two frames are stacked (train first) and a 5-fold stratified
    cross-validation produces one out-of-fold score per row; the AUC of each
    fold is printed.

    Neither input frame is modified.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames to compare. An existing ``is_train`` column, if present, is
        ignored as a feature.

    Returns
    -------
    predictions : np.ndarray
        Out-of-fold predicted probability of the ``is_train == 1`` class for
        every row of the stacked frame.
    y : pd.Series
        The ``is_train`` labels in the same (train-then-test) order.
    """
    # pd.concat builds a new frame, so the caller's DataFrames stay untouched
    # (previously an 'is_train' column was written into both of them).
    combined = pd.concat([train, test], ignore_index=True)

    X = combined.drop(columns=['is_train'], errors='ignore')
    # NOTE(review): only object/category columns are kept as features, so all
    # numeric columns are discarded here — confirm this is intentional.
    X = X.select_dtypes(include=['category', 'object']).astype('category')
    y = pd.Series([1] * len(train) + [0] * len(test), name='is_train')

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    predictions = np.empty(len(y))

    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = XGBClassifier(enable_categorical=True, random_state=42)
        model.fit(X_train, y_train)

        # AUC is a ranking metric: score it on the positive-class probability,
        # not on thresholded 0/1 labels.
        y_pred = model.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)
        print(f'Fold AUC: {auc:.4f}')

        predictions[val_index] = y_pred

    return predictions, y

In [None]:
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

def av(X1, X2):
    """Return the mean cross-validated ROC AUC of a classifier separating X1 from X2.

    Rows of ``X1`` are labelled 1 and rows of ``X2`` are labelled 0. Columns
    with object dtype are converted to pandas categoricals before the frames
    are handed to the classifier.
    """
    stacked = pd.concat([X1, X2])

    # Collect the object-dtype columns and cast them to 'category' in one go.
    to_category = {}
    for col in stacked.columns:
        if stacked[col].dtype == 'object':
            to_category[col] = 'category'
    stacked = stacked.astype(to_category)

    # Origin labels in the same order as the stacked rows.
    origin = np.array([1] * len(X1) + [0] * len(X2))

    classifier = XGBClassifier(enable_categorical=True, n_jobs=4, random_state=0)
    folds = StratifiedKFold(5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(
        classifier, stacked, origin,
        n_jobs=1, cv=folds, scoring='roc_auc',
    )
    return fold_scores.mean()

# train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv', index_col='id')
# test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv', index_col='id')

# Remove the label column before comparing train features with test_df.
# drop(..., errors='ignore') keeps this cell re-runnable; the previous
# in-place pop raised KeyError when the cell was executed a second time.
train_df = train_df.drop(columns=['diagnosed_diabetes'], errors='ignore')

# Fractions of train_df to skip from the top; each remaining tail is compared
# against test_df with the adversarial classifier.
start = np.array([0, 0.2])
scores = [av(train_df.iloc[int(r * len(train_df)):], test_df) for r in start]

fig, ax = plt.subplots(figsize=(9, 3))
ax.plot(start, scores, '.-')
ax.set_xlabel(r'more samples $\leftarrow\quad$ starting row $\quad\rightarrow$ less samples')
ax.set_ylabel('Mean CV ROC AUC')
ax.set_title('Adversarial Validation AUC')
plt.show()

In [None]:
# Stack train on top of test. The label column is dropped explicitly (a no-op
# if an earlier cell already removed it) so it can never end up among the
# adversarial features, whichever cells were executed before this one.
combined = pd.concat([
    train_df.drop(columns=['diagnosed_diabetes'], errors='ignore'),
    test_df,
])
# Object columns are cast to 'category' for the classifier's categorical support.
X = combined.astype({c: 'category' for c in combined.columns if combined[c].dtype == 'object'})
# 1 = row comes from train_df, 0 = row comes from test_df (same order as `combined`).
y = pd.Series([1] * len(train_df) + [0] * len(test_df))

In [None]:
from sklearn.base import BaseEstimator


# Out-of-fold adversarial scores: each row of X is scored by a model fitted on
# the other folds only.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
predictions = np.empty(len(y))

for train_index, val_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # NOTE(review): scale_pos_weight=3/7 presumably mirrors the test:train
    # size ratio — confirm against the actual row counts.
    model: BaseEstimator = XGBClassifier(
        enable_categorical=True,
        random_state=0,
        scale_pos_weight=3/7,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of label 1 (train row).
    y_pred = model.predict_proba(X_val)[:, 1]
    predictions[val_index] = y_pred

    auc = roc_auc_score(y_val, y_pred)
    print(f'Fold AUC: {auc:.4f}')

In [None]:
# Absolute gap between each row's 0/1 origin label and its out-of-fold score.
deviation = (y - predictions).abs()

In [None]:
import matplotlib.pyplot as plt

# Distribution of |label - out-of-fold score| over all stacked rows.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(deviation, bins=50, color='blue', alpha=0.7)
ax.set_xlabel('|is_train label - out-of-fold prediction|')
ax.set_ylabel('Number of rows')
ax.set_title('Adversarial validation: deviation of predictions from labels')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Origin labels (1 = train row, 0 = test row) versus the out-of-fold scores
# thresholded at 0.5.
actual = y
predicted = (predictions >= 0.5).astype(int)

cm = confusion_matrix(actual, predicted)

# Render the matrix as a heat map with integer cell counts.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=np.unique(actual),
)
disp.plot(cmap='Blues', ax=ax, values_format='d')
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Text summary: the diagonal of the matrix holds the correctly classified rows.
print("Confusion Matrix:")
print(cm)
print(f"\nAccuracy: {cm.trace() / cm.sum():.2%}")
print(f"Total samples: {cm.sum()}")

In [None]:
# Train rows (actual == 1) that the adversarial classifier labelled as test (predicted == 0).
mask = (actual == 1) & (predicted == 0)

In [None]:
# Positional indices (into the stacked train+test rows) where `mask` is True.
most_similar_idcs_to_test = np.arange(len(y))[np.asarray(mask)]

In [None]:
# Display the positional indices of the train rows that were classified as test.
most_similar_idcs_to_test

In [None]:
# Histogram of the flagged row positions, restricted to positions above 660000.
# NOTE(review): 660000 is a hard-coded cut-off — confirm it still suits the data.
fig, ax = plt.subplots(figsize=(10, 6))
values, bins, _ = ax.hist(
    most_similar_idcs_to_test[most_similar_idcs_to_test > 660000],
    bins=300, color='blue', alpha=0.7,
)
ax.set_yscale('log')
ax.set_xlabel('Row position in the stacked train+test frame')
ax.set_ylabel('Rows per bin (log scale)')
ax.set_title('Train rows classified as test, positions above 660000')
plt.show()

In [None]:
# Per-bin counts returned by the histogram call.
values

In [None]:
# Bin edges returned by the histogram call (one more entry than `values`).
bins

In [None]:
# Left edges of the histogram bins whose count is exactly 52.
# NOTE(review): 52 looks like a value read off the `values` output — confirm.
bins[:-1][values == 52]

In [None]:
# Column to smooth along the row order of train_df, and the window length in rows.
target_col = 'physical_activity_minutes_per_week'
window_size = 100

# Entries before the first complete window come out as NaN.
rolling_mean = train_df[target_col].rolling(window_size).mean()

In [None]:
# Rolling mean of the selected column plotted against the train_df index.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(train_df.index, rolling_mean, label='Rolling Mean', color='tab:blue')
ax.set_xlabel('train_df index')
ax.set_ylabel(target_col)
ax.set_title(f'Rolling mean of {target_col} (window={window_size})')
ax.legend()
plt.show()