In [12]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from scipy.stats import norm
from scipy import stats
%matplotlib inline

# **Prediction of hypertension disease using TabPfn model**

# dataset Preparation and Exploration

In [13]:
demog = pd.read_csv("/Users/tony/Documents/research_projects/TabPFN/NHES dataset/demographic.csv")
quest = pd.read_csv("/Users/tony/Documents/research_projects/TabPFN/NHES dataset/questionnaire.csv")
exam = pd.read_csv("/Users/tony/Documents/research_projects/TabPFN/NHES dataset/Examination.csv")
labs = pd.read_csv("/Users/tony/Documents/research_projects/TabPFN/NHES dataset/labs.csv")

In [14]:
#merging the dataset
data = pd.merge(demog,quest,on="SEQN")
data = pd.merge(data,exam,on="SEQN")
data = pd.merge(data,labs,on='SEQN')
print(data.head())

    SEQN  SDDSRVYR  RIDSTATR  RIAGENDR  RIDAGEYR  RIDAGEMN  RIDRETH1  \
0  73557         8         2         1        69       NaN         4   
1  73558         8         2         1        54       NaN         3   
2  73559         8         2         1        72       NaN         3   
3  73560         8         2         1         9       NaN         3   
4  73561         8         2         2        73       NaN         3   

   RIDRETH3  RIDEXMON  RIDEXAGM  ...  URXUTL  URDUTLLC  URXUTU  URDUTULC  \
0         4       1.0       NaN  ...     NaN       NaN     NaN       NaN   
1         3       1.0       NaN  ...     NaN       NaN     NaN       NaN   
2         3       2.0       NaN  ...     NaN       NaN     NaN       NaN   
3         3       1.0     119.0  ...   0.062       0.0   0.238       0.0   
4         3       1.0       NaN  ...     NaN       NaN     NaN       NaN   

   URXUUR  URDUURLC  URXPREG  URXUAS  LBDB12  LBDB12SI  
0     NaN       NaN      NaN     NaN   524.0     386.

In [15]:
pd.isnull(data['SEQN'].unique().sum())

False

In [16]:
data['LBXTC'].unique()

array([167., 170., 126., 168., 201., 226.,  nan, 278., 173., 148., 131.,
       154., 182., 135., 225., 202., 196., 198., 159., 166., 192., 185.,
       189., 165., 151., 177., 150., 179., 227., 203., 161., 133., 200.,
       141., 256., 171., 164., 238., 162., 186., 172., 274., 175., 204.,
       195., 140., 210., 191., 271., 123., 146., 127., 193., 181., 206.,
       176., 241., 142., 223., 188., 125., 187., 160., 243., 272., 184.,
       230., 254., 174., 276., 255., 208., 214., 109., 149., 132., 213.,
       155., 144., 120., 199., 152., 130., 209.,  98., 239., 234., 143.,
       222., 246., 153., 121., 233., 183., 190., 261., 147., 106., 118.,
       129., 220., 178., 250., 163., 157., 117., 245., 158., 122., 169.,
       240., 145., 253., 237., 236., 180., 207., 216., 103., 138.,  90.,
       205., 215., 116., 258., 219., 211., 156., 108., 113., 128., 247.,
       137., 224., 134., 194., 249., 267., 235., 279., 248., 266., 301.,
       139., 232., 252., 212., 251., 218., 197., 27

In [17]:
data.isnull().sum()

SEQN           0
SDDSRVYR       0
RIDSTATR       0
RIAGENDR       0
RIDAGEYR       0
            ... 
URDUURLC    7149
URXPREG     8552
URXUAS      7151
LBDB12      4497
LBDB12SI    4497
Length: 1645, dtype: int64

In [18]:
sel_data = data[[
    'SEQN',
    'RIDAGEYR',
    'RIAGENDR',
    'RIDRETH3',
    'SMQ020',
    'ALQ130',
    'BMXBMI',
    'LBXTC',
    'DIQ010',
    'BPXSY1',
    'BPXDI1'
]]

In [19]:
#rename the selected account
sel_data = sel_data.rename(columns={
    
    'RIDAGEYR':'Age',
    'RIAGENDR':'Gender',
    'RIDRETH3':'Race',
    'SMQ020':'Smoking',
    'ALQ130':'Alcohol',
    'BMXBMI':'BMI',
    'LBXTC':'Cholesterol',
    'DIQ010':'Diabetes',
    'BPXSY1':'Systolic_BP',
    'BPXDI1':'Diastolic_BP'
})

In [23]:
#selected_data.isnull().sum()

In [24]:
sel_data.loc[:,'Hypertension'] = (
    (sel_data['Systolic_BP']>=140)|(sel_data['Diastolic_BP']>=90).astype(int)
)

#save the dataset
sel_data.to_csv('Hypertension1_data.csv',index=False)

In [25]:
#selected_data.head()
df = pd.read_csv('/Users/tony/Documents/research_projects/TabPFN/NHES dataset/model/Hypertension1_data.csv')
df.head()

Unnamed: 0,SEQN,Age,Gender,Race,Smoking,Alcohol,BMI,Cholesterol,Diabetes,Systolic_BP,Diastolic_BP,Hypertension
0,73557,69,1,4,1.0,1.0,26.7,167.0,1.0,122.0,72.0,False
1,73558,54,1,3,1.0,4.0,28.6,170.0,1.0,156.0,62.0,True
2,73559,72,1,3,1.0,,28.9,126.0,1.0,140.0,90.0,True
3,73560,9,1,3,,,17.1,168.0,2.0,108.0,38.0,False
4,73561,73,2,3,2.0,,19.7,201.0,2.0,136.0,86.0,False


In [26]:
#prepare the dataset
from sklearn.model_selection import train_test_split

features = df[['Age','Gender','Race','Smoking','Alcohol','BMI','Cholesterol','Diabetes']]
target = df['Hypertension']

X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.3,random_state = 42)

In [27]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Define feature names
num_features = ['Age', 'BMI', 'Cholesterol']
cat_features = ['Smoking', 'Diabetes']

# Impute numerical features
num_imputer = SimpleImputer(strategy='median')
X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train[num_features]), columns=num_features)
X_test_num = pd.DataFrame(num_imputer.transform(X_test[num_features]), columns=num_features)

# Impute categorical features
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_features]), columns=cat_features)
X_test_cat = pd.DataFrame(cat_imputer.transform(X_test[cat_features]), columns=cat_features)

# Concatenate numerical and categorical features
X_train_imputed = pd.concat([X_train_num, X_train_cat], axis=1)
X_test_imputed = pd.concat([X_test_num, X_test_cat], axis=1)


# Now, you can safely create new features
X_train_imputed['Age_BMI'] = X_train_imputed['Age'] * X_train_imputed['BMI']
X_train_imputed['Cholesterol_BMI'] = X_train_imputed['Cholesterol'] * X_train_imputed['BMI']

X_test_imputed['Age_BMI'] = X_test_imputed['Age'] * X_test_imputed['BMI']
X_test_imputed['Cholesterol_BMI'] = X_test_imputed['Cholesterol'] * X_test_imputed['BMI']


In [28]:
X_train_imputed['Age_BMI'] = X_train_imputed['Age']*X_train_imputed["BMI"]
X_train_imputed["Cholesterol_BMI"] = X_train_imputed["Cholesterol"]*X_train_imputed['BMI']

X_test_imputed['Age_BMI'] = X_test_imputed['Age']*X_test_imputed["BMI"]
X_test_imputed["Cholesterol_BMI"] = X_test_imputed["Cholesterol"]*X_test_imputed['BMI']

In [29]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_imputed)
X_test_poly = poly.transform(X_test_imputed)


In [40]:
def create_domain_features(df):
    """Add Hypertension Risk, Pulse Pressure, etc."""
    # Hypertension Risk Score
    df['Hypertension_Risk'] = 0.3*df['Age'] + 0.5*df['BMI'] + 0.2*df['Cholesterol']
    
    # Pulse Pressure
    if 'Systolic_BP' in df.columns and 'Diastolic_BP' in df.columns:
        df['Pulse_Pressure'] = df['Systolic_BP'] - df['Diastolic_BP']
    
    # Sodium-to-Potassium Ratio (if columns exist)
    if 'DR1TSODI' in df.columns and 'DR1TPOTA' in df.columns:
        df['Na_K_Ratio'] = df['DR1TSODI'] / (df['DR1TPOTA'] + 1e-8)  # Avoid division by zero
    
    return df

# Apply to both splits
X_train_domain = create_domain_features(X_train_imputed)
X_test_domain = create_domain_features(X_test_imputed)

In [None]:
#scaled dataset
x_train_scaled = scaler.fit_transform(X_train_domain)
x_test_scaled = scaler.transform(X_test_domain)

In [41]:
#correlation of tha dataset
corr = X_train.corrwith(y_train)
print(corr)

high_train_corrF = ['Age','BMI','Cholesterol']
X_trainC = X_train[high_train_corrF]
X_testC = X_test[high_train_corrF]

Age            0.396685
Gender        -0.029222
Race           0.013840
Smoking       -0.039448
Alcohol        0.002067
BMI            0.175629
Cholesterol    0.158048
Diabetes      -0.097621
dtype: float64


# *Balancing the dataset using SMOTE*

In [None]:
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler

#undersamp = RandomUnderSampler(sampling_strategy='majority',random_state=42)

smote = SMOTE()
ada = ADASYN(random_state=42)
X_trainR,y_trainR = smote.fit_resample(X_train_domain,y_train)
#X_trainR,y_trainR = undersamp.fit_resample(X_train_imputed,y_train)

# *Training start here*

In [None]:
from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# *Train using TabPFN*

In [47]:
#Train the TabPFN
tabpfn = TabPFNClassifier(device='cpu')

tabpfn.fit(x_train_scaled,y_train)
predictions = tabpfn.predict(x_test_scaled)
print("Accuracy",accuracy_score(y_test,predictions))

Accuracy 0.9086277173913043


In [None]:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tabpfn,X_train,y_train,cv=5,scoring='accuracy')
print("CV",scores.mean())

In [None]:

import shap

X = np.array(X_test)
explainer = shap.Explainer(tabpfn.predict_proba(X_test),X_trainR)
shap_value = explainer(X_trainR)
shap.summary_plot(shap_value,X_train)

# *Train using logistic regression*


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(X_train_domain)
# x_test_scaled = scaler.transform(X_test_domain)
lr = LogisticRegression()
lr.fit(X_train_domain,y_train)

y_pred = lr.predict(X_test_domain)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.9035326086956522


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    objectives='binary:logistic',
    eval_metrics = 'logloss',
    use_label_encoder=False,
    n_estimators=100,
    learning_rate = 0.1,
    max_depth=5,
    random_state=42
)

xgb_model.fit(x_train_scaled,y_train)

#predict class labels
y_pred = xgb_model.predict(x_test_scaled)
print("Accuracy:",accuracy_score(y_test,y_pred))

Accuracy: 0.9042119565217391


Parameters: { "eval_metrics", "objectives", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [38]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_max = scaler.fit_transform(X_train_imputed)
X_test_max = scaler.transform(X_test_imputed)


In [None]:
#explanations of a dataset
import shap

explainer = shap.Explainer(xgb_model)
shap_values = explainer(x_train_scaled)

shap.summary_plot(shap_values,x_train_scaled)


In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
specificity = confusion_matrix(y_test, predictions)[0, 0] / (confusion_matrix(y_test, predictions)[0, 0] + confusion_matrix(y_test, predictions)[0, 1])

print(accuracy)
print(precision)
print(recall)

0.9086277173913043
0.0
0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# *Comparison of the models*

In [None]:
from sklearn.metrics import accuracy_score
from mlxtend.evaluate import paired_ttest

# Get predictions from all models
y_pred_tabpfn = predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)

# Compare TabPFN vs. XGBoost
t_stat, p_value = paired_ttest(
    y_pred_tabpfn, y_pred_xgb, y_test,
    method="auto"
)
print(f"TabPFN vs. XGBoost p-value: {p_value}")