In [17]:
import pandas as pd

# Read the appointments CSV into a DataFrame.
# NOTE(review): the path is absolute and specific to one Windows machine;
# the notebook will not run elsewhere until it is adjusted.
df = pd.read_csv(
    r"C:\Users\user\Desktop\data.csv",
)

# Quick look at the freshly loaded data: dimensions, column dtypes, first rows.
print("Initial Shape:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nFirst 5 Rows:\n", df.head(5))

# Bare last expression so the notebook renders the frame as its cell output.
df

Initial Shape: (110527, 14)

Data Types:
 PatientId         float64
AppointmentID       int64
Gender             object
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show            object
dtype: object

First 5 Rows:
       PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56 

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,2.987250e+13,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,5.589978e+14,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4.262962e+12,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,8.679512e+11,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8.841186e+12,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2.572134e+12,5651768,F,2016-05-03T09:15:35Z,2016-06-07T00:00:00Z,56,MARIA ORTIZ,0,0,0,0,0,1,No
110523,3.596266e+12,5650093,F,2016-05-03T07:27:33Z,2016-06-07T00:00:00Z,51,MARIA ORTIZ,0,0,0,0,0,1,No
110524,1.557663e+13,5630692,F,2016-04-27T16:03:52Z,2016-06-07T00:00:00Z,21,MARIA ORTIZ,0,0,0,0,0,1,No
110525,9.213493e+13,5630323,F,2016-04-27T15:09:23Z,2016-06-07T00:00:00Z,38,MARIA ORTIZ,0,0,0,0,0,1,No


In [8]:
# Count null entries in every column; a column with 0 has no gaps.
# (isna() performs the same check as isnull().)
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)


Missing Values:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


In [9]:
# Flag each row that exactly repeats an earlier row (the first occurrence is
# not flagged), then total the flags to get the number of duplicate rows.
duplicates = df.duplicated(keep='first').sum()
print(f"Duplicate rows: {duplicates}")

Duplicate rows: 0


In [10]:
# Keep the first occurrence of every row and discard exact repeats.
# The result is rebound to `df`, so later cells see the de-duplicated frame.
df = df.drop_duplicates(keep='first')
print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (110527, 14)


In [11]:
# Normalise the column labels: trim surrounding whitespace, lowercase, and
# turn spaces into underscores. Hyphens are left as they are, so a label such
# as 'No-show' becomes 'no-show' rather than 'no_show'.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

print("Updated column names:\n", df.columns)


Updated column names:
 Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no-show'],
      dtype='object')


In [12]:
# Parse both timestamp columns from strings into pandas datetime values.
date_columns = ['scheduledday', 'appointmentday']
for column in date_columns:
    df[column] = pd.to_datetime(df[column])

# Confirm the new dtypes of the converted columns.
print(df[date_columns].dtypes)


scheduledday      datetime64[ns, UTC]
appointmentday    datetime64[ns, UTC]
dtype: object


In [13]:
# Relabel the raw attendance flag as a readable `status` column.
# The source column is named 'no-show', so 'No' means the patient attended
# and 'Yes' means the appointment was missed.
status_labels = {'No': 'Showed Up', 'Yes': 'No Show'}

# Series.map() converts any value that is not a key of the dict into NaN
# without complaint. Fail loudly instead, so unexpected or missing labels
# cannot silently turn into a null status.
unmapped = df.loc[~df['no-show'].isin(list(status_labels)), 'no-show'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected values in 'no-show': {list(unmapped)}")

# Build the result in one non-mutating step instead of drop(inplace=True):
# `status` is appended as the last column, then the original flag is removed.
df = df.assign(status=df['no-show'].map(status_labels)).drop(columns=['no-show'])

# Check result
print(df['status'].value_counts())


status
Showed Up    88208
No Show      22319
Name: count, dtype: int64


In [15]:
# Report the observed age range to spot impossible values.
print("Age range:", df['age'].min(), "to", df['age'].max())

# Show any rows with an invalid (negative) age before removing them.
invalid_ages = df[df['age'] < 0]
print("Invalid ages:\n", invalid_ages)

# Keep only rows with a non-negative age. The explicit .copy() makes `df` an
# independent frame: a boolean-mask filter that removes rows is otherwise
# tracked by pandas as a copy of a slice, and later column assignments on it
# would emit SettingWithCopyWarning.
df = df[df['age'] >= 0].copy()
print("Shape after removing invalid ages:", df.shape)


Age range: -1 to 115
Invalid ages:
           patientid  appointmentid gender              scheduledday  \
99832  4.659432e+14        5775010      F 2016-06-06 08:58:13+00:00   

                 appointmentday  age neighbourhood  scholarship  hipertension  \
99832 2016-06-06 00:00:00+00:00   -1         ROMÃO            0             0   

       diabetes  alcoholism  handcap  sms_received     status  
99832         0           0        0             0  Showed Up  
Shape after removing invalid ages: (110526, 14)


In [16]:
# Column dtypes after all cleaning steps.
print(df.dtypes)

# Row and column count of the cleaned frame.
print("Final dataset shape:", df.shape)

# Summary statistics across every column; include='all' covers the
# non-numeric columns as well as the numeric ones.
summary = df.describe(include='all')
print(summary)


patientid                     float64
appointmentid                   int64
gender                         object
scheduledday      datetime64[ns, UTC]
appointmentday    datetime64[ns, UTC]
age                             int64
neighbourhood                  object
scholarship                     int64
hipertension                    int64
diabetes                        int64
alcoholism                      int64
handcap                         int64
sms_received                    int64
status                         object
dtype: object
Final dataset shape: (110526, 14)
           patientid  appointmentid  gender  \
count   1.105260e+05   1.105260e+05  110526   
unique           NaN            NaN       2   
top              NaN            NaN       F   
freq             NaN            NaN   71839   
mean    1.474934e+14   5.675304e+06     NaN   
min     3.921784e+04   5.030230e+06     NaN   
25%     4.172536e+12   5.640285e+06     NaN   
50%     3.173184e+13   5.680572e+06     NaN 