# Overview
The purpose of this analysis is to identify which features in the readmissions dataset contribute most to the target variable (readmitted within 30 days). A chi-square test measures whether there is a statistically significant relationship between each input variable and the output variable. The SelectKBest function then keeps the top k features (k=27), ranked by their chi-square score.<br>
This analysis uses the CSV file produced by the synthetic_feature_selection_input Jupyter notebook.

# Load libraries

In [1]:
# Data handling and plotting libraries.
# NOTE(review): several of the names imported in this cell (e.g. sns, Path, figure,
# rcParams, cm, metrics, LogisticRegression, sm, variance_inflation_factor, time, math)
# are not referenced in the cells visible here; they may be used further down — confirm
# before removing any of them.
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
# Select the ggplot style before overriding individual rcParams below.
plt.style.use('ggplot')
from matplotlib.pyplot import figure
from matplotlib import rcParams
# Default figure size (width, height) in inches for plots created after this point.
matplotlib.rcParams['figure.figsize'] = (12,8)
from sklearn.metrics import confusion_matrix as cm
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import math

from IPython.display import display, HTML
# Inject CSS so the notebook's .container element uses 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Raise pandas display limits so wide/long frames are printed without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Chi-square scoring function and top-k selector used in the feature-selection cell.
from sklearn.feature_selection import chi2, SelectKBest

# Load data

In [2]:
# Read the feature-selection input table; the path is relative to the
# directory the notebook is run from.
synthetic_feature_selection = pd.read_csv(
    filepath_or_buffer='data/synthetic_feature_selection_input.csv'
)

In [3]:
# Recode each age-group column in place: rows equal to 0 stay 0, every other
# value is replaced by that group's code (1-5, youngest to oldest).
# np.where(condition, value if true, value if false)
for age_column, group_code in (
    ('AGE_GROUP_0-64', 1),
    ('AGE_GROUP_65-69', 2),
    ('AGE_GROUP_70-74', 3),
    ('AGE_GROUP_75-79', 4),
    ('AGE_GROUP_80-max', 5),
):
    synthetic_feature_selection[age_column] = np.where(
        synthetic_feature_selection[age_column] == 0, 0, group_code
    )

# Data cleaning

In [4]:
def label_age(row):
    """Return the age-group label for a single row.

    ``row`` is indexed by column name. The age-group columns are checked from
    youngest to oldest against their recoded values (1-5) and the first match
    wins; the label is that code as a string. Returns '0' when no age-group
    column holds its code.
    """
    age_codes = (
        ('AGE_GROUP_0-64', 1),
        ('AGE_GROUP_65-69', 2),
        ('AGE_GROUP_70-74', 3),
        ('AGE_GROUP_75-79', 4),
        ('AGE_GROUP_80-max', 5),
    )
    for column, code in age_codes:
        if row[column] == code:
            return str(code)
    return '0'

In [5]:
def label_gender(row):
    """Return the gender label for a single row.

    ``row`` is indexed by column name. Returns '1' when SEX_IDENT_CD_1 equals 1,
    otherwise '2' when SEX_IDENT_CD_2 equals 2, otherwise '0'.
    """
    # NOTE(review): SEX_IDENT_CD_2 is compared with 2, but unlike the age-group
    # columns no cell visible here recodes it. If it is a 0/1 indicator this
    # branch never matches and those rows fall through to '0' — confirm.
    for column, code in (('SEX_IDENT_CD_1', 1), ('SEX_IDENT_CD_2', 2)):
        if row[column] == code:
            return str(code)
    return '0'

In [6]:
def label_food_insecurity(row):
    """Return the food-insecurity label for a single row.

    ``row`` is indexed by column name. The low / medium / high group columns
    are checked in that order and the first one equal to 1 decides the label
    ('1', '2' or '3'). Returns '0' when none of them equals 1.
    """
    labels_by_column = {
        'FOOD_INSECURITY_GROUP_low': '1',
        'FOOD_INSECURITY_GROUP_medium': '2',
        'FOOD_INSECURITY_GROUP_high': '3',
    }
    for column, label in labels_by_column.items():
        if row[column] == 1:
            return label
    return '0'

In [7]:
def label_no_vehicle_access(row):
    """Return the no-vehicle-access label for a single row.

    ``row`` is indexed by column name. The low / medium / high group columns
    are checked in that order and the first one equal to 1 decides the label
    ('1', '2' or '3'). Returns '0' when none of them equals 1.
    """
    ranked_columns = (
        ('NO_VEHICLE_ACCESS_GROUP_low', '1'),
        ('NO_VEHICLE_ACCESS_GROUP_medium', '2'),
        ('NO_VEHICLE_ACCESS_GROUP_high', '3'),
    )
    # The generator is lazy, so columns after the first match are never read.
    return next(
        (label for column, label in ranked_columns if row[column] == 1),
        '0',
    )

In [8]:
def label_severe_housing_cost_burden(row):
    """Return the severe-housing-cost-burden label for a single row.

    ``row`` is indexed by column name. The low / medium / high group columns
    are checked in that order and the first one equal to 1 decides the label
    ('1', '2' or '3'). Returns '0' when none of them equals 1.
    """
    label = '0'
    if row['SEVERE_HOUSING_COST_BURDEN_GROUP_low'] == 1:
        label = '1'
    elif row['SEVERE_HOUSING_COST_BURDEN_GROUP_medium'] == 1:
        label = '2'
    elif row['SEVERE_HOUSING_COST_BURDEN_GROUP_high'] == 1:
        label = '3'
    return label

In [9]:
# Build the age_group column by labelling every row (axis=1) with label_age.
# The row-wise apply is run once only: the earlier duplicate call discarded
# its result and just repeated a full pass over the frame.
synthetic_feature_selection['age_group'] = synthetic_feature_selection.apply(label_age, axis=1)


In [10]:
# Build the gender column by labelling every row (axis=1) with label_gender.
# The row-wise apply is run once only: the earlier duplicate call discarded
# its result and just repeated a full pass over the frame.
synthetic_feature_selection['gender'] = synthetic_feature_selection.apply(label_gender, axis=1)

In [11]:
# Build the food_insecurity column by labelling every row (axis=1) with
# label_food_insecurity. The row-wise apply is run once only: the earlier
# duplicate call discarded its result and just repeated a full pass over the frame.
synthetic_feature_selection['food_insecurity'] = synthetic_feature_selection.apply(label_food_insecurity, axis=1)

In [12]:
# Build the no_vehicle_access column by labelling every row (axis=1) with
# label_no_vehicle_access. The row-wise apply is run once only: the earlier
# duplicate call discarded its result and just repeated a full pass over the frame.
synthetic_feature_selection['no_vehicle_access'] = synthetic_feature_selection.apply(label_no_vehicle_access, axis=1)

In [13]:
# Build the severe_housing_cost_burden column by labelling every row (axis=1)
# with label_severe_housing_cost_burden. The row-wise apply is run once only:
# the earlier duplicate call discarded its result and just repeated a full
# pass over the frame.
synthetic_feature_selection['severe_housing_cost_burden'] = synthetic_feature_selection.apply(label_severe_housing_cost_burden, axis=1)

In [14]:
# Columns treated as categorical inputs for feature selection (27 in total).
cols_cat = [
    # labels derived in the cells above
    'age_group',
    'food_insecurity',
    'gender',
    'no_vehicle_access',
    'severe_housing_cost_burden',
] + [
    # columns not created in the cells above (presumably already present in
    # the input CSV — verify)
    'ad',
    'afib_flutter',
    'ami_ca',
    'anemia',
    'asthma',
    'breast_ca',
    'ckd',
    'colorectal_ca',
    'copd',
    'dep_bipolar_others',
    'dm',
    'hf_non_ischemic_hd',
    'hlp',
    'ht',
    'htn',
    'ihd',
    'lung_ca',
    'nad',
    'or_with_without_pathological_fx',
    'pneumonia_all_cause',
    'prostate_ca',
    'ra_oa',
]

In [15]:
# Cast every column listed in cols_cat to pandas' category dtype
# (the conversion is done column by column, so each column keeps its own categories).
synthetic_feature_selection[cols_cat] = synthetic_feature_selection[cols_cat].astype("category")

In [16]:
# Encode: replace each categorical column with its integer category codes.
synthetic_feature_selection[cols_cat] = pd.DataFrame(
    {name: synthetic_feature_selection[name].cat.codes for name in cols_cat}
)

In [17]:
# Print the frame summary (shape, dtype counts, memory usage) to check that,
# after encoding, all columns have an integer dtype.
synthetic_feature_selection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49352 entries, 0 to 49351
Columns: 517 entries, SEX_IDENT_CD_1 to severe_housing_cost_burden
dtypes: int32(5), int64(485), int8(27)
memory usage: 184.8 MB


In [18]:
# Split the frame into model inputs and target:
# X holds the encoded categorical columns, y the OUTPUT_LABEL_30 column.
X = synthetic_feature_selection.loc[:, cols_cat]
y = synthetic_feature_selection.loc[:, "OUTPUT_LABEL_30"]

# Feature selection

In [19]:
# Score every input column against the target with the chi-square test and
# list the 27 highest-scoring features with their p-values.
# NOTE(review): k=27 equals the number of columns in cols_cat, so SelectKBest
# drops nothing here and this cell effectively only ranks the features — confirm
# that is intended.
# NOTE(review): scikit-learn documents chi2 for non-negative features such as
# booleans or frequencies; for the multi-level codes (e.g. age_group) the score
# depends on the integer codes chosen — confirm this encoding is acceptable.
selector = SelectKBest(chi2, k=27).fit(X, y)
feature_score = pd.DataFrame(
    {"Score": selector.scores_, "P-Value": selector.pvalues_},
    index=X.columns,
)
feature_score.nlargest(27, "Score")

Unnamed: 0,Score,P-Value
ckd,1782.8706,0.0
age_group,1713.584191,0.0
ihd,1037.379904,1.347006e-227
dm,1011.420315,5.914460000000001e-222
hlp,759.030382,4.364022e-167
prostate_ca,711.788417,8.17082e-157
nad,674.569177,1.013794e-148
ad,612.622586,3.008384e-135
afib_flutter,541.39204,9.388537000000001e-120
lung_ca,404.259219,6.512806999999999e-90
