In [None]:
!pip install pandas scikit-learn matplotlib seaborn joblib


In [None]:
from google.colab import files
# Open Colab's file-upload widget and keep whatever the call returns in `uploaded`.
# NOTE(review): presumably this is a manual alternative to the Kaggle download
# in the next cell (uploading the dataset CSV by hand) — confirm; `uploaded`
# is not read by any cell visible here.
uploaded = files.upload()


In [None]:
!pip install kaggle
from google.colab import files

# Upload kaggle.json (your API key file)
files.upload()

# Make directory for Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d joniarroba/noshowappointments
!unzip noshowappointments.zip


In [None]:
import pandas as pd

# Load the raw appointments file produced by the download/upload cells.
df = pd.read_csv("KaggleV2-May-2016.csv")

# Standardize column names: trimmed, lower-case, with "-" and " " turned into
# "_". regex=False makes the literal replacement explicit, independent of the
# pandas version's default.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace("-", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

# Parse dates
df['scheduledday'] = pd.to_datetime(df['scheduledday'])
df['appointmentday'] = pd.to_datetime(df['appointmentday'])

# Target encoding: 1 when the raw value is 'Yes', otherwise 0
df['no_show'] = (df['no_show'] == 'Yes').astype(int)

# Drop invalid ages (keep 0..115 inclusive). The explicit .copy() gives an
# independent frame, so the column assignments below do not write into a
# slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['age'].between(0, 115)].copy()

# Feature engineering
# Wait is measured in whole calendar days, so both timestamps are truncated to
# midnight before subtracting. Subtracting the raw timestamps and flooring
# with .dt.days under-counts whenever scheduledday carries a time of day:
# e.g. booked 18:00, seen at 00:00 the next day -> 6 hours -> 0 days, and a
# same-day booking -> -1.
# NOTE(review): this dataset appears to store appointmentday as date-only
# (midnight) and scheduledday with a time — confirm against the raw file.
# Negative waits (appointment dated before scheduling) are still clipped to 0.
df['wait_days'] = (
    df['appointmentday'].dt.normalize() - df['scheduledday'].dt.normalize()
).dt.days.clip(lower=0)
df['appt_weekday'] = df['appointmentday'].dt.day_name()
# weekday: Monday=0 ... Sunday=6, so >= 5 means Saturday or Sunday
df['is_weekend'] = df['appointmentday'].dt.weekday >= 5
# Bands: [0,12] child, (12,19] teen, (19,59] adult, (59,120] senior
df['age_band'] = pd.cut(df['age'], bins=[0,12,19,59,120],
                        labels=['child','teen','adult','senior'], include_lowest=True)

df.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# No-show rate split by the sms_received flag (barplot aggregates no_show per group)
sms_ax = sns.barplot(data=df, x='sms_received', y='no_show')
sms_ax.set_title("No-Show Rate by SMS Received")
plt.show()

# No-show rate per appointment weekday, bars laid out Monday through Sunday
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
weekday_ax = sns.barplot(data=df, x='appt_weekday', y='no_show', order=weekday_order)
weekday_ax.set_title("No-Show Rate by Weekday")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Model inputs. The two numeric columns are passed through untouched; every
# other feature is one-hot encoded.
features = ['gender','age','age_band','neighbourhood','scholarship','hipertension',
            'diabetes','alcoholism','handcap','sms_received','wait_days','appt_weekday','is_weekend']
num_cols = ['age','wait_days']
cat_cols = list(filter(lambda name: name not in num_cols, features))

X = df[features]
y = df['no_show']

# Hold out 20% for evaluation, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Categories not seen during fit are ignored at predict time instead of raising.
pre = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', 'passthrough', num_cols),
])

# Depth- and leaf-size-limited tree with balanced class weights.
tree = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=50,
    class_weight='balanced',
    random_state=42,
)
pipe = Pipeline(steps=[('pre', pre), ('clf', tree)])
pipe.fit(X_train, y_train)

# Hard labels plus the probability column for class 1 on the held-out rows.
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=3))
print("ROC-AUC:", roc_auc_score(y_test, proba))


In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) to disk
joblib.dump(pipe, "tree_model.pkl")

# Scored hold-out set: the test features plus the true label, the predicted
# label and the predicted probability, written without the index column.
results = X_test.assign(
    actual_no_show=y_test.values,
    predicted_no_show=pred,
    predicted_proba=proba,
)
results.to_csv("predictions.csv", index=False)


In [None]:
from google.colab import files

# Hand both saved artifacts to the browser, model first and predictions second
for artifact in ("tree_model.pkl", "predictions.csv"):
    files.download(artifact)


In [None]:
# Import here as well, like the other cells that use `files`, so this cell
# does not depend on an earlier cell having been executed.
from google.colab import files

# Triggers a browser download of predictions.csv.
files.download("predictions.csv")