In [None]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# Load the dataset.
# The path must be a raw string: in a normal literal the backslash sequences
# "\U" and "\N" of this Windows path are parsed as (malformed) unicode escapes
# and the cell fails with a SyntaxError before anything is read.
# NOTE(review): the file name ends in .csv but it is read with read_excel;
# confirm the real file format and switch to pd.read_csv if it is plain CSV.
DATA_PATH = r'C:\Users\acoll\New folder\Healthcare_dataset.csv'
df = pd.read_excel(DATA_PATH)
df.head()

In [None]:
# Size of the frame as (rows, columns)
frame_shape = df.shape
print(frame_shape)
# Number of distinct Ptid values; fewer than the row count would mean repeated ids
unique_ids = df['Ptid'].unique()
print(unique_ids.shape)

In [None]:
# Count of missing entries in every column
df.isna().sum()

In [None]:
# Horizontal bar chart of the missing-value count per column; every bar is
# labelled with the share of rows (in percent) that are missing
missing_counts = df.isna().sum().sort_values()
ax = missing_counts.plot(kind='barh', figsize=(9, 10))
plt.title('Percentage of Missing Values Per Column in Train Set', fontdict={'size': 15})
n_rows = df.shape[0]
for bar in ax.patches:
    bar_len, bar_thickness = bar.get_width(), bar.get_height()
    label = '{:,.0f}%'.format((bar_len / n_rows) * 100)
    # put the label just past the end of the bar, vertically centred on it
    label_pos = (bar.get_x() + bar_len + 0.02, bar.get_y() + bar_thickness / 2)
    ax.annotate(label, label_pos)

In [None]:
# Drop the id column.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after Ptid is already gone no longer raises a
# KeyError.
df = df.drop(columns=['Ptid'], errors='ignore')
df.head()

In [None]:
# Show the dtype pandas holds for each column
df.dtypes

In [None]:
# Names of the object-dtype columns, kept as the categorical column list
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print(len(cat_cols))
cat_cols

In [None]:
# Names of the int64 columns, kept as the numerical column list
num_cols = df.select_dtypes(include=['int64']).columns.tolist()
print(len(num_cols))
num_cols

In [None]:
# Summary statistics of the frame (describe() with its default settings)
df.describe()

In [None]:
# Histograms for the numerical columns
def plot_histogram(df, cols, bins=6):
    """Draw one histogram figure per column.

    Parameters
    ----------
    df : DataFrame that holds the columns to plot.
    cols : iterable of column names; each one gets its own 6x4 figure.
    bins : number of histogram bins (default 6).
    """
    for name in cols:
        fig, ax = plt.subplots(figsize=(6, 4))
        df[name].plot.hist(ax=ax, bins=bins, color='blue')
        ax.set(title='Histogram of ' + name, xlabel=name, ylabel='Number')
        plt.show()


plot_histogram(df, num_cols)

In [None]:
# measuring skewness and kurtosis of numerical columns
def measure_skew_kurtosis(cols, data=None):
    """Print the skewness and kurtosis of each listed column.

    For every name in ``cols`` the column name is printed, followed by a
    one-row table with its 'skew' and 'kurtosis' aggregates.

    Parameters
    ----------
    cols : iterable of column names.
    data : DataFrame to read the columns from. When omitted, the notebook-level
        ``df`` is used, which is what existing one-argument calls rely on.

    Returns
    -------
    None; the results are only printed.
    """
    # Resolve the frame at call time so one-argument calls keep reading the
    # current notebook-level df.
    frame = df if data is None else data
    for col in cols:
        print(col)
        result = frame[[col]].agg(['skew', 'kurtosis']).transpose()
        print(result)
# Report skewness and kurtosis for every column listed in num_cols
measure_skew_kurtosis(num_cols)

In [None]:
# Box plots of the numerical columns, used to spot outliers
def boxplot(df, cols):
    """Show one vertical seaborn box plot per column named in ``cols``."""
    for name in cols:
        sns.set_style('whitegrid')
        ax = sns.boxplot(y=name, data=df)
        ax.set_title('Boxplot of ' + name)
        ax.set_ylabel(name)  # y-axis label
        plt.show()


boxplot(df, num_cols)

In [None]:
# Min-max normalisation of every numerical column
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_min_max = df.copy()  # copy of the frame that the scaler input is read from
for col in num_cols:
    scaled_name = 'min_max_' + col
    data = df_min_max[[col]].to_numpy()
    # fit_transform learns the column's min and max and rescales in one call;
    # the scaled values are stored on df (not on the copy)
    df[scaled_name] = scaler.fit_transform(data)
    measure_skew_kurtosis([scaled_name])  # skewness and kurtosis
    plot_histogram(df, [scaled_name])  # histogram
    boxplot(df, [scaled_name])  # box plot

In [None]:
# Square-root transformation of every numerical column
for col in num_cols:
    sqrt_name = 'sqrt_' + col
    df[sqrt_name] = np.sqrt(df[col])
    measure_skew_kurtosis([sqrt_name])  # skewness and kurtosis
    plot_histogram(df, [sqrt_name])  # histogram
    boxplot(df, [sqrt_name])  # box plot

In [None]:
# log(1 + x) transformation of every numerical column
for col in num_cols:
    log_name = 'log_' + col
    df[log_name] = np.log1p(df[col])
    measure_skew_kurtosis([log_name])  # skewness and kurtosis
    plot_histogram(df, [log_name])  # histogram
    boxplot(df, [log_name])  # box plot

In [None]:
# Summary statistics again, now that the min_max_, sqrt_ and log_ columns have been added to df
df.describe()

In [None]:
# Rebuild the numerical column list so that float64 columns are included as well
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(len(num_cols))
num_cols

In [None]:
# Box plots of the numerical columns split by persistency flag, to spot outliers per class
def boxplot(df, cols):
    """Show, for each column in ``cols``, a box plot grouped by 'Persistency_Flag'.

    NOTE: this reuses the name of the single-variable ``boxplot`` helper defined
    earlier in the notebook, so from this cell on ``boxplot`` refers to this
    grouped version.
    """
    for name in cols:
        sns.set_style('whitegrid')
        ax = sns.boxplot(x='Persistency_Flag', y=name, data=df)
        ax.set_title('Boxplot of ' + name)
        ax.set_ylabel(name)  # y-axis label
        plt.show()


boxplot(df, num_cols)

EDA

1. Qualitative Analysis

In [None]:
# Number of distinct values in each categorical column
df[cat_cols].nunique()

In [None]:
# Frequency table for each categorical variable
def count_value(df, cols):
    """Print a header line and the value counts for every column in ``cols``."""
    for name in cols:
        header = '\n' + 'For column ' + name
        print(header)
        frequencies = df[name].value_counts()
        print(frequencies)
# Print the value counts of every categorical column
count_value(df, cat_cols)

In [None]:
# Bar charts of the value counts of the categorical columns
def plot_catcols(x, df):
    """Plot, for each column named in ``x``, a horizontal bar chart of its value counts.

    Every bar is annotated with that value's percentage of the counted rows,
    rounded to two decimals.

    Side effect: a constant column 'dummy' (all ones) is added to ``df`` and is
    left there after the call.
    """
    df['dummy'] = np.ones(shape=df.shape[0])
    for col in x:
        print(col)
        counts = df[['dummy', col]].groupby([col], as_index=False).count()
        fig, ax = plt.subplots(figsize=(8, 4))
        bars = plt.barh(counts[col], counts.dummy)
        plt.xticks(rotation=90)
        plt.title('Counts for ' + col)
        plt.xlabel('count')
        # share of each value in percent
        total = counts['dummy'].sum()
        percentage = [round((n / total) * 100, 2) for n in counts['dummy'].to_numpy()]
        counts['Percentage'] = percentage
        # write the percentage at the tip of each bar
        for bar, pct_value in zip(bars, percentage):
            tip = (bar.get_x() + bar.get_width(), bar.get_y() + bar.get_height() / 2)
            ax.annotate(f'{pct_value}%', tip)
        plt.show()


plot_catcols(cat_cols, df)

In [None]:
# Remove the persistency flag (the target) from the categorical column list.
# The membership guard keeps the cell idempotent: list.remove raises ValueError
# when the cell is re-run and the name has already been removed.
if 'Persistency_Flag' in cat_cols:
    cat_cols.remove('Persistency_Flag')

In [None]:
from IPython.display import display

In [None]:
# Visualize class separation by categorical features: for every categorical
# column, tabulate and plot its value counts separately for each class.
for col in cat_cols:
    print(' ')
    print(col)
    print(' ')
    # The constant counting column is created locally (assign works on a copy),
    # so this cell no longer depends on the 'dummy' column that plot_catcols
    # leaves behind on df as a side effect.
    counts = (
        df.assign(dummy=1)[['dummy', 'Persistency_Flag', col]]
        .groupby(['Persistency_Flag', col], as_index=False)
        .count()
    )
    display(counts)
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    # (class label in the data, title suffix) for the left and right panel
    panels = [('Non-Persistent', '\n not persistent'), ('Persistent', '\n persistent')]
    for ax, (flag, suffix) in zip(axes, panels):
        subset = counts.loc[counts['Persistency_Flag'] == flag, [col, 'dummy']]
        ax.bar(subset[col], subset['dummy'])
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_title('Counts for ' + col + suffix)
        ax.set_ylabel('count')
    plt.show()

In [None]:
# Encode the target as an integer: 1 for 'Persistent', 0 for anything else.
# The dtype guard makes the cell safe to re-run: comparing the already-encoded
# integers with 'Persistent' would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['Persistency_Flag']):
    df['Persistency_Flag'] = (df['Persistency_Flag'] == 'Persistent').astype(int)

In [None]:
# First rows of the target column
df['Persistency_Flag'].head()

In [None]:
# Mean of the persistency flag over the whole frame
global_persistency = df['Persistency_Flag'].mean()
global_persistency

In [None]:
# Per-category persistency: mean and count of the flag for each value of every
# categorical column, plus the difference from and ratio to the global mean
for col in cat_cols:
    print(col)
    df_group = (
        df.groupby(col)['Persistency_Flag']
        .agg(['mean', 'count'])
        .assign(
            diff=lambda g: g['mean'] - global_persistency,
            risk=lambda g: g['mean'] / global_persistency,
        )
    )
    display(df_group)
    print('\n')  # two blank lines between tables

In [None]:
# df_group is rebound on every pass of the loop over cat_cols, so this previews
# the table of the last column in cat_cols only
df_group.head()

2. Quantitative Analysis

In [None]:
# Mutual information between a categorical variable and the target
from sklearn.metrics import mutual_info_score


def mutual_info_churn_score(series):
    """Return the mutual information score between ``series`` and df's 'Persistency_Flag' column."""
    target = df['Persistency_Flag']
    return mutual_info_score(series, target)

In [None]:
# Mutual information score of every categorical column
mi = df[cat_cols].apply(mutual_info_churn_score)
pd.set_option('display.max_rows', None)  # lift the row limit so every row is shown
ranked_mi = mi.sort_values(ascending=False)  # highest score first
print(ranked_mi)

In [None]:
# Mutual information scores, sorted from highest to lowest, as a one-column DataFrame
mi_score = mi.sort_values(ascending=False)
mi_df = mi_score.to_frame()
mi_df

In [None]:
#quantitative analysis
#chi square test (test of independence)
def chi_square_test(col, data=None, alpha=0.05):
    """Pearson chi-square test of independence between ``col`` and 'Persistency_Flag'.

    Prints the two hypotheses, the test statistic with its p-value, and the
    conclusion at significance level ``alpha``.

    Two defects of the previous version are fixed here:
    * the contingency table was rebuilt for every category inside a loop and
      only the table of the last category ("is this value" vs "is not") was
      actually tested; the full flag-by-category table is used now;
    * the p-value was taken from a normal distribution centred on the degrees
      of freedom; it now comes from the chi-square distribution.

    Parameters
    ----------
    col : name of the categorical column to test.
    data : DataFrame containing ``col`` and 'Persistency_Flag'. Defaults to the
        notebook-level ``df``, which is what existing one-argument calls use.
    alpha : significance level (default 0.05).

    Returns
    -------
    None; the results are only printed.

    Raises
    ------
    ValueError
        Raised by scipy when the contingency table is empty.
    """
    import scipy.stats as stats  # local import, as scipy is not imported at the top of the notebook

    frame = df if data is None else data
    print('Ho:Persistency is not dependent on '+col)
    print('H1:Persistency is dependent on '+col)
    # Observed counts for every (flag value, category) pair. Only values that
    # occur in the data appear, so no row or column total is zero.
    observed = pd.crosstab(frame['Persistency_Flag'], frame[col])
    # correction=False yields the plain Pearson statistic sum((O - E)**2 / E),
    # i.e. what the hand-written loop computed, with no Yates adjustment on
    # 2x2 tables. A table with a single row or column has 0 degrees of freedom
    # and scipy reports statistic 0.0 with p-value 1.0 for it.
    chi_square, p_value, _dof, _expected = stats.chi2_contingency(observed, correction=False)
    chi_square, p_value = float(chi_square), float(p_value)
    conclusion = "Failed to reject the null hypothesis."
    if p_value <= alpha:
        conclusion = "Null Hypothesis is rejected."

    print("chisquare-score is:", chi_square, " and p value is:", p_value)
    print(conclusion)

In [None]:
# Run the chi-square test of independence for each categorical column,
# printing the column name before and a blank separator line after each test
for col in cat_cols:
    print(col)
    chi_square_test(col)
    print(' ')

In [None]:
# Correlation of every numerical column with the persistency flag,
# shown as a one-column DataFrame
num_corr = df[num_cols].corrwith(df['Persistency_Flag'])
corr_df = num_corr.to_frame()
corr_df

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The non-numeric columns are excluded explicitly: from pandas 2.0 on,
# df.corr() no longer drops them silently and raises when it meets strings.
corr = df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale centred at 0
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    # NOTE(review): seaborn documents the two anchor hues as floats in [0, 359];
    # 400 and 3000 are outside that range - confirm the intended palette
    cmap=sns.diverging_palette(400, 3000, n=400),
    square=True
)
# Rotate the x tick labels by 45 degrees and right-align them
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Boolean mask that hides the upper triangle (diagonal included) of the matrix.
# The builtin bool replaces np.bool, an alias that was removed in NumPy 1.24
# and raises AttributeError there.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio
ax = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .7})
# Rotate the x tick labels by 45 degrees and right-align them
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);


In [None]:
#ANOVA test
def ANOVA(df, col='Dexa_Freq_During_Rx'):
    """One-way ANOVA of ``col`` between rows with flag 1 and rows with flag 0.

    Prints the hypotheses, the F statistic, the p-value and the decision at the
    0.05 level.

    Rows where ``col`` is missing are dropped before the test. Previously a
    single NaN made ``f_oneway`` return NaN; ``NaN <= 0.05`` is False, so the
    column was reported as "failed to reject" although no valid test had been
    carried out.

    NOTE(review): ``f_oneway`` tests whether the two group means are equal; it
    says nothing about the direction of an effect, so the "positively
    correlated" wording of the printed hypotheses is looser than the test.
    The messages are left unchanged pending confirmation.

    Parameters
    ----------
    df : DataFrame containing ``col`` and 'Persistency_Flag' (encoded as 0/1).
    col : name of the numerical column to test.

    Returns
    -------
    None; the results are only printed.
    """
    import scipy.stats as stats  # local import, as scipy is not imported at the top of the notebook

    #creating the hypothesis
    print('Ho:Persistency is not positively correlated with ' + col)
    print('H1:Persistency is positively correlated with ' + col)
    pair = df[[col, 'Persistency_Flag']].dropna(subset=[col])
    persistent = pair.loc[pair['Persistency_Flag'] == 1, col].to_numpy()
    not_persistent = pair.loc[pair['Persistency_Flag'] == 0, col].to_numpy()
    #one-way ANOVA
    fvalue, pvalue = stats.f_oneway(persistent, not_persistent)
    print('fvalue='+str(fvalue))
    print('pvalue='+str(pvalue))
    if pvalue <= 0.05:
        print('Null hypothesis is rejected')
    else:
        print('Failed to reject the Null hypothesis')
# Run the ANOVA test for each numerical column, printing the column name
# before and a blank separator line after each test
for col in num_cols:
    print(col)
    ANOVA(df, col)
    print(' ')