In [None]:

!pip install -q scikit-learn


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder




In [None]:


# Ask for a file upload through Colab, then load the first uploaded CSV.
from google.colab import files

uploaded = files.upload()

# Uploaded file names whose extension is .csv (case-insensitive), in upload order.
csv_names = [name for name in uploaded if name.lower().endswith('.csv')]
if not csv_names:
    raise FileNotFoundError("No CSV uploaded.")

fname = csv_names[0]
df = pd.read_csv(fname)
print("Loaded:", fname, "shape:", df.shape)

# Last expression of the cell: rendered as a preview table.
df.head()



Saving KaggleV2-May-2016.csv to KaggleV2-May-2016.csv
Loaded: KaggleV2-May-2016.csv shape: (110527, 14)


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [None]:

# Normalise column labels into lowercase, underscore-separated names.
normalized_names = df.columns.astype(str).str.strip()

# Collapse every run of hyphens/whitespace into a single underscore ...
normalized_names = normalized_names.str.replace(r'[-\s]+', '_', regex=True)
# ... then drop any character that is not an ASCII letter, digit or underscore.
normalized_names = normalized_names.str.replace(r'[^0-9a-zA-Z_]', '', regex=True)

df.columns = normalized_names.str.lower()
print("Columns:", df.columns.tolist())


Columns: ['patientid', 'appointmentid', 'gender', 'scheduledday', 'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show']


In [None]:
# Structural overview of the frame: size, missing values, duplicates, dtypes.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("Shape:", df.shape)
print("\nMissing values per column:\n", missing_per_column)
print("\nDuplicate rows:", duplicate_row_count)
print("\nColumn dtypes:\n", df.dtypes)


Shape: (110527, 14)

Missing values per column:
 patientid         0
appointmentid     0
gender            0
scheduledday      0
appointmentday    0
age               0
neighbourhood     0
scholarship       0
hipertension      0
diabetes          0
alcoholism        0
handcap           0
sms_received      0
no_show           0
dtype: int64

Duplicate rows: 0

Column dtypes:
 patientid         float64
appointmentid       int64
gender             object
scheduledday       object
appointmentday     object
age                 int64
neighbourhood      object
scholarship         int64
hipertension        int64
diabetes            int64
alcoholism          int64
handcap             int64
sms_received        int64
no_show            object
dtype: object


In [None]:
# Remove fully duplicated rows, then renumber the index from zero.
deduplicated = df.drop_duplicates()
df = deduplicated.reset_index(drop=True)
print("After dropping duplicates shape:", df.shape)


After dropping duplicates shape: (110527, 14)


In [None]:

# Timestamp columns that are actually present in the frame.
date_cols = [c for c in ("scheduledday", "appointmentday") if c in df.columns]

# Parse each one; values that cannot be parsed are coerced to NaT.
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

if not date_cols:
    print("Date columns not found. Please check column names.")
else:
    # Drop rows where any parsed date is missing and report how many went.
    before = len(df)
    df = df.dropna(subset=date_cols).reset_index(drop=True)
    dropped = before - len(df)
    print(f"Dropped {dropped} rows due to invalid/missing dates. New shape: {df.shape}")


Dropped 0 rows due to invalid/missing dates. New shape: (110527, 14)


In [None]:

# Whole-day gap between scheduling and appointment; the time of day is
# stripped from both timestamps before subtracting.
scheduled_date = df["scheduledday"].dt.normalize()
appointment_date = df["appointmentday"].dt.normalize()
df["waiting_days"] = (appointment_date - scheduled_date).dt.days

is_negative_wait = df["waiting_days"] < 0
neg_count = int(is_negative_wait.sum())
print("Rows with negative waiting_days:", neg_count)

# Keep only rows whose waiting time is zero or positive.
non_negative_wait = df["waiting_days"] >= 0
df = df[non_negative_wait].reset_index(drop=True)
print("After removing negative waits shape:", df.shape)


Rows with negative waiting_days: 5
After removing negative waits shape: (110522, 15)


In [None]:
# Keep only rows whose age is present and inside the accepted range.
MIN_AGE, MAX_AGE = 0, 100  # inclusive bounds on accepted ages

if "age" in df.columns:
    before = df.shape[0]
    # Series.between is inclusive on both ends and evaluates to False for
    # missing values, so a single mask rejects NaN, negative and > MAX_AGE rows.
    valid_age = df["age"].between(MIN_AGE, MAX_AGE)
    # Renumber the index after filtering, as the other row-filtering cells in
    # this notebook do, so the frame keeps a contiguous 0..n-1 index.
    df = df[valid_age].reset_index(drop=True)
    after = df.shape[0]
    print(f"Removed {before-after} rows due to invalid ages. New shape: {df.shape}")
else:
    print("No 'age' column found.")


Removed 8 rows due to invalid ages. New shape: (110514, 15)


In [None]:

# Find the target column: first by plain substring match, then by a looser
# comparison that ignores underscores, hyphens and letter case.
tcols = [name for name in df.columns if "no" in name and "show" in name]
if len(tcols) == 0:
    tcols = [
        name for name in df.columns
        if "noshow" in name.replace("_", "").replace("-", "").lower()
    ]
if len(tcols) == 0:
    raise ValueError("Could not find a 'No-show' column. Columns: " + ", ".join(df.columns))

# The first candidate wins.
tcol = tcols[0]
print("Using target column:", tcol)

# Translate the textual labels (trimmed, lowercased) into a 0/1 flag;
# anything not listed here maps to NaN.
label_to_flag = {"yes": 1, "no": 0, "y": 1, "n": 0, "1": 1, "0": 0}
normalized_labels = df[tcol].astype(str).str.strip().str.lower()
df["no_show_flag"] = normalized_labels.map(label_to_flag)

# Rows whose label could not be translated are reported and dropped.
unmapped = df["no_show_flag"].isnull()
failed = unmapped.sum()
if failed:
    print(f"Warning: {failed} rows have unmapped target values and will be removed.")
    df = df[~unmapped].reset_index(drop=True)

df["no_show_flag"] = df["no_show_flag"].astype(int)
flag_counts = df["no_show_flag"].value_counts()
print("Target distribution (0=attended,1=missed):\n", flag_counts)


Using target column: no_show
Target distribution (0=attended,1=missed):
 no_show_flag
0    88203
1    22311
Name: count, dtype: int64


In [None]:
# Text columns to encode, restricted to those present in the frame.
cat_columns = [name for name in ("gender", "neighbourhood") if name in df.columns]
for name in cat_columns:
    # Missing entries become the literal "Unknown"; everything is cast to str.
    df[name] = df[name].fillna("Unknown").astype(str)

print("Categorical columns detected for encoding:", cat_columns)

# Add an integer-coded "<column>_enc" companion for each categorical column.
for c in cat_columns:
    enc_col = f"{c}_enc"
    le = LabelEncoder()
    df[enc_col] = le.fit_transform(df[c])
    shown_classes = list(le.classes_)[:10]
    print(f"Encoded {c} -> {enc_col}; classes: {shown_classes} (showing up to 10)")


Categorical columns detected for encoding: ['gender', 'neighbourhood']
Encoded gender -> gender_enc; classes: ['F', 'M'] (showing up to 10)
Encoded neighbourhood -> neighbourhood_enc; classes: ['AEROPORTO', 'ANDORINHAS', 'ANTÔNIO HONÓRIO', 'ARIOVALDO FAVALESSA', 'BARRO VERMELHO', 'BELA VISTA', 'BENTO FERREIRA', 'BOA VISTA', 'BONFIM', 'CARATOÍRA'] (showing up to 10)


In [None]:
# Report the overall shape and up to the first 40 column names.
final_shape = df.shape
leading_columns = df.columns.tolist()[:40]
print("Final dataset shape (IDs kept):", final_shape)
print("Columns sample:", leading_columns)

Final dataset shape (IDs kept): (110514, 18)
Columns sample: ['patientid', 'appointmentid', 'gender', 'scheduledday', 'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension', 'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show', 'waiting_days', 'no_show_flag', 'gender_enc', 'neighbourhood_enc']


In [None]:
# Show the final schema and a short preview of the frame.
print("Final dtypes:\n", df.dtypes)
preview = df.head()
display(preview)

# Write the frame to CSV without the index column, then hand the file to
# Colab's download helper.
clean_name = "healthcare_cleaned.csv"
df.to_csv(clean_name, index=False)
print("Saved cleaned file:", clean_name)

files.download(clean_name)


Final dtypes:
 patientid                        float64
appointmentid                      int64
gender                            object
scheduledday         datetime64[ns, UTC]
appointmentday       datetime64[ns, UTC]
age                                int64
neighbourhood                     object
scholarship                        int64
hipertension                       int64
diabetes                           int64
alcoholism                         int64
handcap                            int64
sms_received                       int64
no_show                           object
waiting_days                       int64
no_show_flag                       int64
gender_enc                         int64
neighbourhood_enc                  int64
dtype: object


Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handcap,sms_received,no_show,waiting_days,no_show_flag,gender_enc,neighbourhood_enc
0,29872500000000.0,5642903,F,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,JARDIM DA PENHA,0,1,0,0,0,0,No,0,0,0,39
1,558997800000000.0,5642503,M,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,0,0,0,0,0,No,0,0,1,39
2,4262962000000.0,5642549,F,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,MATA DA PRAIA,0,0,0,0,0,0,No,0,0,0,45
3,867951200000.0,5642828,F,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,0,0,0,54
4,8841186000000.0,5642494,F,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,JARDIM DA PENHA,0,1,1,0,0,0,No,0,0,0,39


Saved cleaned file: healthcare_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>