In [41]:
import pandas as pd


In [71]:
### Load the dataset
df = pd.read_csv('/content/KaggleV2-May-2016.csv')

In [72]:
### View the top 5 rows
print(df.head())


### Get column names

print(df.columns.tolist())

      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1         0           0        0      

In [73]:
## Strip whitespace and convert to lowercase

df.columns = df.columns.str.strip().str.lower().str.replace('-', '_')


In [74]:
# Check for missing values

print(df.isnull().sum())


patientid         0
appointmentid     0
gender            0
scheduledday      0
appointmentday    0
age               0
neighbourhood     0
scholarship       0
hipertension      0
diabetes          0
alcoholism        0
handcap           0
sms_received      0
no_show           0
dtype: int64


In [75]:
# Check for duplicates
print(df.duplicated().sum())

0


In [76]:
# Convert to datetime
df['scheduledday'] = pd.to_datetime(df['scheduledday'])
df['appointmentday'] = pd.to_datetime(df['appointmentday'])


In [78]:
print(df['scheduledday'].head())

0   2016-04-29 18:38:08+00:00
1   2016-04-29 16:08:27+00:00
2   2016-04-29 16:19:04+00:00
3   2016-04-29 17:29:31+00:00
4   2016-04-29 16:07:23+00:00
Name: scheduledday, dtype: datetime64[ns, UTC]


In [82]:
# Rename columns for clarity
df.rename(columns={
    'hipertension': 'hypertension',
    'handcap': 'handicap',
    'no_show': 'no_show'
}, inplace=True)


In [83]:
df.column = [col.replace('_', ' ').title() for col in df.columns]


In [84]:
print(df.head())

      Patientid  Appointmentid Gender              Scheduledday  \
0  2.987250e+13        5642903      F 2016-04-29 18:38:08+00:00   
1  5.589978e+14        5642503      M 2016-04-29 16:08:27+00:00   
2  4.262962e+12        5642549      F 2016-04-29 16:19:04+00:00   
3  8.679512e+11        5642828      F 2016-04-29 17:29:31+00:00   
4  8.841186e+12        5642494      F 2016-04-29 16:07:23+00:00   

             Appointmentday  Age      Neighbourhood  Scholarship  \
0 2016-04-29 00:00:00+00:00   62    JARDIM DA PENHA            0   
1 2016-04-29 00:00:00+00:00   56    JARDIM DA PENHA            0   
2 2016-04-29 00:00:00+00:00   62      MATA DA PRAIA            0   
3 2016-04-29 00:00:00+00:00    8  PONTAL DE CAMBURI            0   
4 2016-04-29 00:00:00+00:00   56    JARDIM DA PENHA            0   

   Hypertension  Diabetes  Alcoholism  Handicap  Sms Received No Show  
0             1         0           0         0             0      No  
1             0         0           0       

In [85]:
# Convert 'age' to integer
df['Age'] = df['Age'].astype(int)


In [86]:
print(df['Age'].head())

0    62
1    56
2    62
3     8
4    56
Name: Age, dtype: int64


In [88]:
# Remove rows with negative ages
df = df[df['Age'] >= 0]


Summary of Changes:

Stripped whitespace and standardized column names.

Handled missing values (if any).

Removed duplicate rows.

Standardized text values in categorical columns.

Converted date columns to datetime format.

Renamed columns for clarity and consistency.

Ensured correct data types for numerical columns.

Removed rows with invalid data (e.g., negative ages).​
GitHub