# Project

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [40]:
# Load the raw appointments export; the path is relative to the kernel's working directory.
DATA_PATH = 'noshowappointments-kagglev2-may-2016.csv'
df = pd.read_csv(DATA_PATH)

In [41]:
# (rows, columns) of the freshly loaded frame.
df.shape

(110527, 14)

In [42]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [43]:
# Peek at the first three rows to see what the raw values look like.
df.head(3)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No


In [97]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output uses the renamed snake_case columns, so this cell was
# last executed after the rename cells further down; a top-to-bottom run will show the
# original CSV column names here.
# NOTE(review): the saved output reports a minimum age of -1 and a maximum handicap of 4
# -- confirm whether these values are valid before analysing those columns.
df.describe()

Unnamed: 0,patient_id,appointment_id,age,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.0,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [77]:
# Number of distinct values in each column.
# NOTE(review): the saved output has 62299 distinct patient ids against 110527 distinct
# appointment ids, i.e. the same patient appears on several rows.
df.nunique()

patient_id          62299
appointment_id     110527
gender                  2
scheduled_day      103549
appointment_day        27
age                   104
neighborhood           81
scholarship             2
hypertension            2
diabetes                2
alcoholism              2
handcap                 5
sms_received            2
no_show                 2
dtype: int64

In [108]:
# Current dtype of every column.
# NOTE(review): the saved output already shows category / datetime columns, so it was
# produced after the conversion cells that appear later in the notebook.
df.dtypes

patient_id                       int64
appointment_id                   int64
gender                        category
scheduled_day      datetime64[ns, UTC]
appointment_day    datetime64[ns, UTC]
age                              int64
neighborhood                    object
scholarship                      int64
hypertension                     int64
diabetes                         int64
alcoholism                       int64
handicap                         int64
sms_received                     int64
no_show                         object
dtype: object

In [109]:
# Checking the actual data types of the ones still marked as object.
# The columns are discovered with select_dtypes() instead of being hard-coded, so the
# cell works whatever the current column names are (the renamed labels do not exist
# yet when the notebook is run top to bottom) and it automatically covers every
# column that is still of object dtype.
for column in df.select_dtypes(include='object').columns:
    # .iloc[0] is positional, so this does not depend on the index containing label 0.
    print(f'{column}: {type(df[column].iloc[0])}')

neighborhood: <class 'str'>
no_show: <class 'str'>


In [46]:
# Number of fully duplicated rows. Series.sum() is vectorized, whereas the built-in
# sum() walks the boolean Series one element at a time in Python.
df.duplicated().sum()

0

In [48]:
# Normalise every column label: trim surrounding whitespace, lower-case it and
# turn hyphens into underscores.
df.columns = [label.strip().lower().replace("-", "_") for label in df.columns]

In [76]:
# Checking the column names: a single row is enough to display the header.
df.head(1)

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handcap,sms_received,no_show
0,29872499824296,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No


In [91]:
# Second renaming pass: add the missing underscores between words and fix the
# spelling of a few labels. Keys that are not present in the frame are ignored.
column_fixes = {
    "patientid": "patient_id",
    "appointmentid": "appointment_id",
    "scheduledday": "scheduled_day",
    "appointmentday": "appointment_day",
    "neighbourhood": "neighborhood",
    "hipertension": "hypertension",
    "handcap": "handicap",
}
df.rename(columns=column_fixes, inplace=True)


In [122]:
# Changing the data type for 'no_show' to boolean with applied mapping.
# Series.map() replaces every value that is not a key of the mapping with NaN, so
# re-running the conversion on an already-boolean column would wipe it out. The dtype
# guard makes the cell safe to re-run, and the explicit check turns an unexpected
# label into an error instead of a silent NaN.
mapping = {'Yes': True, 'No': False}
if not pd.api.types.is_bool_dtype(df['no_show']):
    no_show_flags = df['no_show'].map(mapping)
    if no_show_flags.isna().any():
        raise ValueError("'no_show' contains values other than 'Yes' / 'No'")
    df['no_show'] = no_show_flags.astype(bool)

In [141]:
# Changing the type for multiple parameters to boolean
columns_01 = ['scholarship', 'hypertension', 'diabetes', 'alcoholism', 'sms_received']

# Refuse to convert anything that is not a 0/1 flag: a plain cast would turn any
# non-zero value into True. Already-converted booleans pass this check too
# (True == 1, False == 0).
not_a_flag = ~df[columns_01].isin([0, 1])
if not_a_flag.any().any():
    raise ValueError('expected only 0/1 values in: ' + ', '.join(columns_01))

# One vectorized cast for all flag columns; it is a no-op on columns that are
# already boolean, so the cell can be re-run safely.
df[columns_01] = df[columns_01].astype(bool)

In [143]:
# Checking that the flag columns and 'no_show' are now reported as bool
df.dtypes

patient_id                       int64
appointment_id                   int64
gender                        category
scheduled_day      datetime64[ns, UTC]
appointment_day    datetime64[ns, UTC]
age                              int64
neighborhood                    object
scholarship                       bool
hypertension                      bool
diabetes                          bool
alcoholism                        bool
handicap                         int64
sms_received                      bool
no_show                           bool
dtype: object

In [68]:
# Python-level type of the 'patient_id' value stored at index label 0.
type(df.loc[0, 'patient_id'])

numpy.float64

In [89]:
# Store 'patient_id' as 64-bit integers instead of floats.
# NOTE(review): casting float -> int drops any fractional part without warning --
# confirm that every id is a whole number.
df = df.astype({'patient_id': np.int64})

In [92]:
# Re-check the dtypes after the 'patient_id' cast.
df.dtypes

patient_id                       int64
appointment_id                   int64
gender                        category
scheduled_day      datetime64[ns, UTC]
appointment_day    datetime64[ns, UTC]
age                              int64
neighborhood                    object
scholarship                      int64
hypertension                     int64
diabetes                         int64
alcoholism                       int64
handicap                         int64
sms_received                     int64
no_show                         object
dtype: object

In [87]:
# 'gender' only takes a handful of distinct labels, so keep it as a pandas categorical.
df = df.assign(gender=lambda frame: frame['gender'].astype('category'))

In [88]:
# Parse both timestamp columns from their string form into pandas datetimes.
for day_column in ('scheduled_day', 'appointment_day'):
    df[day_column] = pd.to_datetime(df[day_column])

In [117]:
# Size of each 'no_show' group. count() reports the non-null cells of every other
# column, which is why the same number repeats across the row when nothing is missing.
# NOTE(review): the saved output is keyed by 'No' / 'Yes', so it was produced before
# 'no_show' was mapped to booleans; a top-to-bottom run will show False / True instead.
df.groupby('no_show').count()

Unnamed: 0_level_0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received
no_show,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
No,88208,88208,88208,88208,88208,88208,88208,88208,88208,88208,88208,88208,88208
Yes,22319,22319,22319,22319,22319,22319,22319,22319,22319,22319,22319,22319,22319


In [123]:
# Final look at the column dtypes.
df.dtypes

patient_id                       int64
appointment_id                   int64
gender                        category
scheduled_day      datetime64[ns, UTC]
appointment_day    datetime64[ns, UTC]
age                              int64
neighborhood                    object
scholarship                      int64
hypertension                     int64
diabetes                         int64
alcoholism                       int64
handicap                         int64
sms_received                     int64
no_show                           bool
dtype: object