In [113]:
import pandas as pd
import datetime
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../data/show_no_show.csv', index_col='Unnamed: 0')

In [3]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,,0,0,No


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110182 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  object 
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(7), object(6)
memory usage: 12.6+ MB


In [5]:
df.dropna(inplace=True)

# ScheduledDay & AppointmentDay into datetime

In [6]:
def to_datetime(ex):
    return datetime.datetime(int(ex[0:4]), int(ex[5:7]), int(ex[8:10]), int(ex[11:13]), int(ex[14:16]), int(ex[17:19]))

In [7]:
df['ScheduledDay_hours'] = df['ScheduledDay']

In [8]:
df['ScheduledDay']=df['ScheduledDay'].apply(lambda ex: datetime.date(int(ex[0:4]), int(ex[5:7]), int(ex[8:10])))

In [9]:
df['ScheduledDay_hours']=df['ScheduledDay_hours'].apply(lambda ex: int(ex[11:13]))

In [10]:
df['AppointmentDay']=df['AppointmentDay'].apply(lambda ex: datetime.date(int(ex[0:4]), int(ex[5:7]), int(ex[8:10])))

In [11]:
df['DaysDifference']= df['AppointmentDay']-df['ScheduledDay']

In [12]:
df['DaysDifference_Num'] = df['DaysDifference'].dt.days.astype('int16')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110182 entries, 0 to 110526
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype          
---  ------              --------------   -----          
 0   PatientId           110182 non-null  float64        
 1   AppointmentID       110182 non-null  int64          
 2   Gender              110182 non-null  object         
 3   ScheduledDay        110182 non-null  object         
 4   AppointmentDay      110182 non-null  object         
 5   Age                 110182 non-null  int64          
 6   Neighbourhood       110182 non-null  object         
 7   Scholarship         110182 non-null  int64          
 8   Hipertension        110182 non-null  int64          
 9   Diabetes            110182 non-null  int64          
 10  Alcoholism          110182 non-null  object         
 11  Handcap             110182 non-null  int64          
 12  SMS_received        110182 non-null  int64          
 13  No-show       

In [14]:
df.tail()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,ScheduledDay_hours,DaysDifference,DaysDifference_Num
110522,2572134000000.0,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,0,0,0,,0,1,No,9,35 days,35
110523,3596266000000.0,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,0,0,0,,0,1,No,7,35 days,35
110524,15576630000000.0,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,0,0,0,,0,1,No,16,41 days,41
110525,92134930000000.0,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,0,0,0,,0,1,No,15,41 days,41
110526,377511500000000.0,5629448,F,2016-04-27,2016-06-07,54,MARIA ORTIZ,0,0,0,,0,1,No,13,41 days,41


# Gender into category

In [15]:
df["Alcoholism_Num"] = df["Alcoholism"].replace("None", 0).replace("Low",1).replace("Moderate", 2).replace("High", 3)

In [16]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'ScheduledDay_hours', 'DaysDifference', 'DaysDifference_Num',
       'Alcoholism_Num'],
      dtype='object')

In [17]:
df["No-show"].value_counts()

No     87951
Yes    22231
Name: No-show, dtype: int64

In [18]:
df["Attendance"] = df["No-show"].replace("No", 1).replace("Yes", 0) 

In [19]:
df["No-show_Num"] = df["No-show"].replace("No", 0).replace("Yes", 1) 

In [20]:
df["Gender_Num"] = df["Gender"].replace("M", 0).replace("F", 1) 

In [21]:
df["Neighbourhood"].value_counts()

JARDIM CAMBURI                 7698
MARIA ORTIZ                    5795
RESISTÊNCIA                    4411
JARDIM DA PENHA                3865
ITARARÉ                        3500
                               ... 
ILHA DO BOI                      35
ILHA DO FRADE                    10
AEROPORTO                         8
ILHAS OCEÂNICAS DE TRINDADE       2
PARQUE INDUSTRIAL                 1
Name: Neighbourhood, Length: 81, dtype: int64

In [66]:
df_mean = df[["Neighbourhood","No-show_Num"]].groupby("Neighbourhood").mean("No-show_Num")

In [23]:
df = df.join(df_mean, on = "Neighbourhood", rsuffix = "_Neighbourhood", how = "left")

In [24]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,...,SMS_received,No-show,ScheduledDay_hours,DaysDifference,DaysDifference_Num,Alcoholism_Num,Attendance,No-show_Num,Gender_Num,No-show_Num_Neighbourhood
0,29872500000000.0,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,...,0,No,18,0 days,0,0,1,0,1,0.163001
1,558997800000000.0,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,...,0,No,16,0 days,0,0,1,0,0,0.163001
2,4262962000000.0,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,...,0,No,16,0 days,0,0,1,0,1,0.169782
3,867951200000.0,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,...,0,No,17,0 days,0,0,1,0,1,0.176471
4,8841186000000.0,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,...,0,No,16,0 days,0,0,1,0,1,0.163001


In [25]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'ScheduledDay_hours', 'DaysDifference', 'DaysDifference_Num',
       'Alcoholism_Num', 'Attendance', 'No-show_Num', 'Gender_Num',
       'No-show_Num_Neighbourhood'],
      dtype='object')

In [68]:
X = df[['Gender_Num', 'Age', 'Scholarship', 'Hipertension','Diabetes', 'Handcap', 'SMS_received', 'ScheduledDay_hours', 'DaysDifference_Num', 'Alcoholism_Num', 'No-show_Num_Neighbourhood']]

In [69]:
y = df['No-show_Num']

In [64]:
df["Neighbourhood"].value_counts()

JARDIM CAMBURI                 7698
MARIA ORTIZ                    5795
RESISTÊNCIA                    4411
JARDIM DA PENHA                3865
ITARARÉ                        3500
                               ... 
ILHA DO BOI                      35
ILHA DO FRADE                    10
AEROPORTO                         8
ILHAS OCEÂNICAS DE TRINDADE       2
PARQUE INDUSTRIAL                 1
Name: Neighbourhood, Length: 81, dtype: int64

In [27]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110182 entries, 0 to 110526
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Gender_Num                 110182 non-null  int64  
 1   Age                        110182 non-null  int64  
 2   Scholarship                110182 non-null  int64  
 3   Hipertension               110182 non-null  int64  
 4   Diabetes                   110182 non-null  int64  
 5   Handcap                    110182 non-null  int64  
 6   SMS_received               110182 non-null  int64  
 7   ScheduledDay_hours         110182 non-null  int64  
 8   DaysDifference_Num         110182 non-null  int16  
 9   Alcoholism_Num             110182 non-null  int64  
 10  No-show_Num_Neighbourhood  110182 non-null  float64
dtypes: float64(1), int16(1), int64(9)
memory usage: 9.5 MB


# Feature selection

In [37]:
VIF = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]

In [87]:
VIF # El mas grande nos dice que es el que mas facil de predecir a partir de las otras otras variables explicativas. Sería redundante incluirla. La quitamos y volvemos a calcular el VIF. Quitamos cuando el valor es >6

[2.7453445404091634,
 4.2856623285860245,
 1.1301608445546416,
 1.8883585675680872,
 1.3397864392672458,
 1.0293662555258905,
 1.7449879376172546,
 4.283374204397784,
 1.7302949154104328,
 1.0514054962718808]

In [None]:
# Logistic, SVC, random forest, gaussian NB (iría bien ya que es un problema de reglas)

## Feature selection + Logistic Regression

In [76]:
selector = SelectFromModel(LogisticRegression(max_iter=1000)).fit(X.values[:,:-1],y)

In [79]:
selector.get_support()

array([False, False,  True, False,  True, False,  True, False, False,
        True])

In [82]:
X_new = selector.transform(X.values[:,:-1])

In [80]:
selector.threshold_

0.09159882049807908

In [83]:
selector.estimator_.score(X.values[:,:-1],y)

0.794694233177833

# Train/Test

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

# Gaussian

In [94]:
GNB = GaussianNB()
model = GNB.fit(X, y)

In [98]:
model.score(X,y)

0.7721497159245612

# Dealing with imbalanced data set (undersampled)

In [100]:
under_sampler = RandomUnderSampler()

In [101]:
X_res, y_res = under_sampler.fit_resample(X, y)

In [102]:
y_res.value_counts()

1    22231
0    22231
Name: No-show_Num, dtype: int64

In [106]:
GNB = GaussianNB()
model = GNB.fit(X_res, y_res)

In [107]:
model.score(X_res,y_res)

0.6093293149206064

# Random forest 

In [120]:
forest = RandomForestClassifier()
model = forest.fit(X_train,y_train)
model.score(X_test,y_test)

0.786484804480863