In [2]:
import pandas as pd

df = pd.read_csv("../data/raw/noshowappointments.csv")

# Repeat key cleaning & feature steps so this notebook is self-contained
df = df[df['Age'] >= 0]

df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

# ✅ Simple, safe way to get days between scheduling and appointment
df['WaitingDays'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days

# Day of week for appointment
df['ApptWeekday'] = df['AppointmentDay'].dt.day_name()

# Binary target: 1 = no-show, 0 = showed
df['NoShow'] = (df['No-show'] == "Yes").astype(int)

df.head()


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,WaitingDays,ApptWeekday,NoShow
0,29872500000000.0,5642903,F,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,No,-1,Friday,0
1,558997800000000.0,5642503,M,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,No,-1,Friday,0
2,4262962000000.0,5642549,F,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,No,-1,Friday,0
3,867951200000.0,5642828,F,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,-1,Friday,0
4,8841186000000.0,5642494,F,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,No,-1,Friday,0


In [3]:
from sklearn.model_selection import train_test_split

feature_cols = [
    'Age', 'WaitingDays',
    'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received',
    'Gender', 'ApptWeekday'
]

X = df[feature_cols]
y = df['NoShow']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((88420, 10), (22106, 10))

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

numeric_features = ['Age', 'WaitingDays']
binary_features = ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
categorical_features = ['Gender', 'ApptWeekday']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

binary_transformer = 'passthrough'

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('bin', binary_transformer, binary_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

clf


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('bin', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [5]:
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.86      0.69      0.77     17642
           1       0.32      0.57      0.41      4464

    accuracy                           0.67     22106
   macro avg       0.59      0.63      0.59     22106
weighted avg       0.75      0.67      0.70     22106

ROC AUC: 0.6689204982489245


In [6]:
import joblib
from pathlib import Path

models_dir = Path("../models")
models_dir.mkdir(exist_ok=True)

joblib.dump(clf, models_dir / "no_show_model.joblib")


['../models/no_show_model.joblib']