In [165]:
# IMPORTING REQUIRED LIBRARIES

import numpy as np
import pandas as pd

In [166]:
# LOAD THE DATA
# Read the raw appointment records from the CSV file into `df`.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the file exists at exactly this location.
csv_path = r"D:\Datasets\KaggleV2-May-2016.csv"
df = pd.read_csv(csv_path)

In [167]:
# Preview the first five rows to see the columns and the raw value formats.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [168]:
# Summarise column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [169]:
# (rows, columns) of the freshly loaded frame.
df.shape

(110527, 14)

In [170]:
# CHECK THE MISSING VALUES
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

#### No Missing Values

In [171]:
# CHECK FOR DUPLICATE ROWS
# Lists rows that are exact duplicates of an earlier row; an empty result
# means there are none. Nothing is dropped by this cell.

df.loc[df.duplicated()]

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show


#### No duplicate rows in the dataset

In [172]:
# Inspect the dtypes pandas inferred on load, before any conversion.
df.dtypes

PatientId         float64
AppointmentID       int64
Gender             object
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show            object
dtype: object

In [173]:
# Fixing data types: the ID columns are identifiers, not quantities, so they
# are stored as strings. Casting to int64 first removes the float formatting
# from PatientId (note that any fractional part would be truncated).
for id_col in ('PatientId', 'AppointmentID'):
    df[id_col] = df[id_col].astype('int64').astype('str')

In [174]:
# Fixing date formats: parse the timestamp strings of both date columns
# into pandas datetimes.
for date_col in ("AppointmentDay", "ScheduledDay"):
    df[date_col] = pd.to_datetime(df[date_col])

In [175]:
# Confirm the conversions: ID columns should now be object (strings) and
# both date columns datetime.
df.dtypes

PatientId                      object
AppointmentID                  object
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Neighbourhood                  object
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                      int64
Handcap                         int64
SMS_received                    int64
No-show                        object
dtype: object

In [176]:
# Normalise every object (text) column: trim leading/trailing whitespace,
# convert to Title Case, then collapse runs of whitespace into one space.
text_columns = df.select_dtypes(include="object").columns
for col in text_columns:
    df[col] = (
        df[col]
        .str.strip()
        .str.title()
        .str.replace(r"\s+", " ", regex=True)
    )

In [177]:
# Check the result of the text clean-up on the first five rows.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296,5642903,F,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997776694438,5642503,M,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962299951,5642549,F,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951213174,5642828,F,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186448183,5642494,F,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,1,1,0,0,0,No


In [178]:
# Standardize Gender Column (F → Female, M → Male)
# The values are cast to text, trimmed and upper-cased so the lookup below is
# case-insensitive, then expanded to the full label.
# The upper-cased full labels are mapped too: without them, re-running this
# cell would turn "Female"/"Male" into "FEMALE"/"MALE" and leave them that way.
gender_labels = {
    "M": "Male",
    "F": "Female",
    "MALE": "Male",
    "FEMALE": "Female",
}

df['Gender'] = (
    df['Gender']
    .astype(str)
    .str.strip()
    .str.upper()
    .replace(gender_labels)  # whole-value replacement, not substring
)

In [179]:
# Confirm Gender now holds the full labels.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296,5642903,Female,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997776694438,5642503,Male,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962299951,5642549,Female,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951213174,5642828,Female,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186448183,5642494,Female,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,1,1,0,0,0,No


In [180]:
# Convert to Category Type
# (Helpful for analysis + saves memory)
# Gender only takes a small set of labels, so a categorical column stores
# each label once and keeps compact codes per row.
df['Gender'] = df['Gender'].astype('category')

In [181]:
# Sanity-check the recoded column: label frequencies first, then the first
# few values.
gender_counts = df['Gender'].value_counts()
gender_preview = df[['Gender']].head()
print(gender_counts)
print(gender_preview)

Gender
Female    71840
Male      38687
Name: count, dtype: int64
   Gender
0  Female
1    Male
2  Female
3  Female
4  Female


In [182]:
# Summary statistics for Age; check min and max for implausible values.
df['Age'].describe()

count    110527.000000
mean         37.088874
std          23.110205
min          -1.000000
25%          18.000000
50%          37.000000
75%          55.000000
max         115.000000
Name: Age, dtype: float64

In [183]:
# How many rows have a negative age? The shape is (matching rows, columns).
df.loc[df['Age'].lt(0)].shape

(1, 14)

In [184]:
# Keep only the rows with a non-negative age.
# .copy() makes the result an independent frame rather than a selection from
# the unfiltered one, so later in-place edits are never flagged as writes to
# a slice (SettingWithCopyWarning).
df = df[df['Age'] >= 0].copy()

In [185]:
# Verify the row count after removing the negative-age rows.
df.shape

(110526, 14)

In [186]:
# Quick look at the frame after the age filter.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296,5642903,Female,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997776694438,5642503,Male,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962299951,5642549,Female,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951213174,5642828,Female,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186448183,5642494,Female,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,1,1,0,0,0,No


In [187]:
# Fix the misspelt "Hipertension" header and replace the hyphen in "No-show"
# with an underscore. One non-mutating rename replaces the two inplace=True
# calls; keys not present (e.g. already renamed) are ignored by default, so
# the cell is safe to re-run.
df = df.rename(columns={"Hipertension": "Hypertension", "No-show": "No_show"})
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handcap,SMS_received,No_show
0,29872499824296,5642903,Female,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997776694438,5642503,Male,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962299951,5642549,Female,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951213174,5642828,Female,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186448183,5642494,Female,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,1,1,0,0,0,No


In [188]:

# Clean all column names uniformly: trim surrounding whitespace, then use
# underscores in place of spaces.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(" ", "_")

In [189]:
# Final check of the cleaned column names and values.
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handcap,SMS_received,No_show
0,29872499824296,5642903,Female,2016-04-29 18:38:08+00:00,2016-04-29 00:00:00+00:00,62,Jardim Da Penha,0,1,0,0,0,0,No
1,558997776694438,5642503,Male,2016-04-29 16:08:27+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,0,0,0,0,0,No
2,4262962299951,5642549,Female,2016-04-29 16:19:04+00:00,2016-04-29 00:00:00+00:00,62,Mata Da Praia,0,0,0,0,0,0,No
3,867951213174,5642828,Female,2016-04-29 17:29:31+00:00,2016-04-29 00:00:00+00:00,8,Pontal De Camburi,0,0,0,0,0,0,No
4,8841186448183,5642494,Female,2016-04-29 16:07:23+00:00,2016-04-29 00:00:00+00:00,56,Jardim Da Penha,0,1,1,0,0,0,No
