In [1]:
import pandas as pd 
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.api as sm
df_temp = pd.read_csv('../db/heart_data_cleaned.csv')


In [2]:
df1_temp =  df_temp[df_temp.columns].replace({'Yes':1, 'No':0, 'Male':1,'Female':0,'No, borderline diabetes':'0','Yes (during pregnancy)':'1' })
df1_temp.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,26.58,1,0,0,20.0,30.0,0,1,65-69,White,1,1,Fair,8.0,1,0,0
1,0,24.21,0,0,0,0.0,0.0,0,0,75-79,White,0,0,Good,6.0,0,0,1
2,0,23.71,0,0,0,28.0,0.0,1,0,40-44,White,0,1,Very good,8.0,0,0,0
3,1,28.87,1,0,0,6.0,0.0,1,0,75-79,Black,0,0,Fair,12.0,0,0,0
4,0,21.63,0,0,0,15.0,0.0,0,0,70-74,White,0,1,Fair,4.0,1,0,1


In [3]:
#SkinCancer
filtered_df = df1_temp[(df1_temp['SkinCancer'] == 0) | (df1_temp['SkinCancer'] == 1)]

proportion_heart_disease = filtered_df.groupby('SkinCancer')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('SkinCancer')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had SkinCancer have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had SkinCancer have heart disease"

contingency_table = pd.crosstab([df1_temp['SkinCancer']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Skin Cancer\n")
print(f"Count of individuals with heart disease who did not had SkinCancer: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had SkinCancer: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had SkinCancer and did not had SkinCancer: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between SkinCancer and heart disease.")
else:
    print("There is no significant relationship between SkinCancer and heart disease.")
    


Analysis of Skin Cancer

Count of individuals with heart disease who did not had SkinCancer: 22143
Count of individuals with heart disease who did had SkinCancer: 4580
81.77% of individuals who did not had SkinCancer have heart disease
18.23% of individuals who did had SkinCancer have heart disease
Percentage difference in individuals with heart disease between those who had SkinCancer and did not had SkinCancer: 9.74%
Chi-square statistic: 2563.202979300436
P-value: 0.0000000000
There is a significant relationship between SkinCancer and heart disease.
	88.16% of individuals who did not had SkinCancer have heart disease
18.23% of individuals who did had SkinCancer have heart disease


In [4]:
#KidneyDisease
filtered_df = df1_temp[(df1_temp['KidneyDisease'] == 0) | (df1_temp['KidneyDisease'] == 1)]

proportion_heart_disease = filtered_df.groupby('KidneyDisease')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('KidneyDisease')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had Asthma have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had KidneyDisease have heart disease"

contingency_table = pd.crosstab([df1_temp['KidneyDisease']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Kidney Disease\n")
print(f"Count of individuals with heart disease who did not had KidneyDisease: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had KidneyDisease: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had KidneyDisease and did not had KidneyDisease: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between KidneyDisease and heart disease.")
else:
    print("There is no significant relationship between KidneyDisease and heart disease.")

Analysis of Kidney Disease

Count of individuals with heart disease who did not had KidneyDisease: 7539
Count of individuals with heart disease who did had KidneyDisease: 3180
87.34% of individuals who did not had Asthma have heart disease
12.66% of individuals who did had KidneyDisease have heart disease
Percentage difference in individuals with heart disease between those who had KidneyDisease and did not had KidneyDisease: 9.77%
Chi-square statistic: 6053.487763540765
P-value: 0.0000000000
There is a significant relationship between KidneyDisease and heart disease.


In [5]:
#Asthma
filtered_df = df1_temp[(df1_temp['Asthma'] == 0) | (df1_temp['Asthma'] == 1)]

proportion_heart_disease = filtered_df.groupby('Asthma')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('Asthma')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had Asthma have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had Asthma have heart disease"

contingency_table = pd.crosstab([df1_temp['Asthma']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Asthma\n")
print(f"Count of individuals with heart disease who did not had Asthma: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had Asthma: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had Asthma and did not had Asthma: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between Asthma and heart disease.")
else:
    print("There is no significant relationship between Asthma and heart disease.")

Analysis of Asthma

Count of individuals with heart disease who did not had Asthma: 34160
Count of individuals with heart disease who did had Asthma: 4481
82.16% of individuals who did not had Asthma have heart disease
17.84% of individuals who did had Asthma have heart disease
Percentage difference in individuals with heart disease between those who had Asthma and did not had Asthma: 4.74%
Chi-square statistic: 439.14652675300977
P-value: 0.0000000000
There is a significant relationship between Asthma and heart disease.


In [6]:
#sleeptime
print("Analysis of Sleep Time\n")
df_heart = df1_temp[['SleepTime', 'HeartDisease']].copy()

X_bmi_heart = sm.add_constant(df_heart['SleepTime'])
y_bmi_heart = df_heart['HeartDisease']

model_bmi_heart = sm.Logit(y_bmi_heart, X_bmi_heart)
result_bmi_heart = model_bmi_heart.fit()


contingency_table = pd.crosstab(df1_temp['HeartDisease'], df1_temp['SleepTime'])

# Perform chi-squared test
chi2, p, _, _ = chi2_contingency(contingency_table)

print(result_bmi_heart.summary())
print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

if (p < 0.5):
    print("There is significant relation between Sleep Time and Heart Disease")
else:
    print("There is no significant relation between Sleep Time and Heart Disease")

Analysis of Sleep Time



Optimization terminated successfully.
         Current function value: 0.297585
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:               285810
Model:                          Logit   Df Residuals:                   285808
Method:                           MLE   Df Model:                            1
Date:                Mon, 04 Dec 2023   Pseudo R-squ.:               9.493e-05
Time:                        20:32:30   Log-Likelihood:                -85053.
converged:                       True   LL-Null:                       -85061.
Covariance Type:            nonrobust   LLR p-value:                 5.852e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4850      0.037    -67.505      0.000      -2.557      -2.413
SleepTime      0.0205      0.

In [7]:
#GenHealth
gen_health_categories = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
df1_temp['GenHealth'] = pd.Categorical(df1_temp['GenHealth'], categories=gen_health_categories, ordered=True)

contingency_table = pd.crosstab(df1_temp["GenHealth"], df1_temp["HeartDisease"])
chi2, p_chi2, _, _ = chi2_contingency(contingency_table)

print("Analysis of General Health\n")
print("Chi-squared test statistic:", chi2)
print("P-value:", p_chi2)
alpha = 0.05
if p_chi2 < alpha:
    print("There is evidence of a significant association between General Health and Heart Disease.")
else:
    print("There is no significant evidence of an association between General Health and Heart Disease.")

Analysis of General Health

Chi-squared test statistic: 19049.415749486772
P-value: 0.0
There is evidence of a significant association between General Health and Heart Disease.


In [8]:
#PhysicalActivity
filtered_df = df1_temp[(df1_temp['PhysicalActivity'] == 0) | (df1_temp['PhysicalActivity'] == 1)]

proportion_heart_disease = filtered_df.groupby('PhysicalActivity')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('PhysicalActivity')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had PhysicalActivity have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had PhysicalActivity have heart disease"

contingency_table = pd.crosstab([df1_temp['PhysicalActivity']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Physical Activity\n")
print(f"Count of individuals with heart disease who did not had PhysicalActivity: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had PhysicalActivity: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had PhysicalActivity and did not had PhysicalActivity: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between PhysicalActivity and heart disease.")
else:
    print("There is no significant relationship between PhysicalActivity and heart disease.")

Analysis of Physical Activity

Count of individuals with heart disease who did not had PhysicalActivity: 204736
Count of individuals with heart disease who did had PhysicalActivity: 16080
35.98% of individuals who did not had PhysicalActivity have heart disease
64.02% of individuals who did had PhysicalActivity have heart disease
Percentage difference in individuals with heart disease between those who had PhysicalActivity and did not had PhysicalActivity: -14.52%
Chi-square statistic: 2747.6073186715375
P-value: 0.0000000000
There is a significant relationship between PhysicalActivity and heart disease.


In [9]:
#Race
contingency_table = pd.crosstab(df1_temp["Race"], df1_temp["HeartDisease"])
chi2, p_chi2, _, _ = chi2_contingency(contingency_table)

print("Analysis of Race\n")
print("Chi-squared test statistic:", chi2)
print("P-value:", p_chi2)

if p_chi2 < 0.05:
    print("There is evidence of a significant association between Race and Heart Disease.")
else:
    print("There is no significant evidence of an association between Race and Heart Disease.")

Analysis of Race

Chi-squared test statistic: 787.3063308412301
P-value: 6.446005992246605e-168
There is evidence of a significant association between Race and Heart Disease.


In [10]:
#Age
contingency_table = pd.crosstab(df1_temp["AgeCategory"], df1_temp["HeartDisease"])

categories = df1_temp["AgeCategory"].unique()
print("Analysis of Age")
for category in categories:
    count_heart_disease = contingency_table.loc[category, 1]
    count_total = contingency_table.loc[category, :].sum()

    count = np.array([count_heart_disease, count_total - count_heart_disease])
    nobs = np.array([count_total, count_total])

    stat, p_ztest = proportions_ztest(count, nobs, alternative='two-sided')

    print(f"\nZ-test for Proportions in Age Category {category}:")
    print("Test Statistic:", stat)
    print("P-value:", p_ztest)

    if p_ztest < 0.05:
        print(f"Age Category {category}. There is evidence of a significant difference in the proportions of heart disease.")
    else:
        print(f"Age Category {category}. There is no significant evidence of a difference in the proportions of heart disease.")

Analysis of Age

Z-test for Proportions in Age Category 65-69:
Test Statistic: -188.3008866243486
P-value: 0.0
Age Category 65-69. There is evidence of a significant difference in the proportions of heart disease.

Z-test for Proportions in Age Category 75-79:
Test Statistic: -120.05214496852876
P-value: 0.0
Age Category 75-79. There is evidence of a significant difference in the proportions of heart disease.

Z-test for Proportions in Age Category 40-44:
Test Statistic: -186.97458705355697
P-value: 0.0
Age Category 40-44. There is evidence of a significant difference in the proportions of heart disease.

Z-test for Proportions in Age Category 70-74:
Test Statistic: -161.25524984581708
P-value: 0.0
Age Category 70-74. There is evidence of a significant difference in the proportions of heart disease.

Z-test for Proportions in Age Category 80 or older:
Test Statistic: -108.39870495931285
P-value: 0.0
Age Category 80 or older. There is evidence of a significant difference in the proporti

In [11]:
#Sex
filtered_df = df1_temp[(df1_temp['Sex'] == 0) | (df1_temp['Sex'] == 1)]
print("Analysis of Gender\n")
proportion_heart_disease = filtered_df.groupby('Sex')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('Sex')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% were females having heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% were males having heart disease"
print(output_sentence)

contingency_table = pd.crosstab(df1_temp["Sex"], df1_temp["HeartDisease"])
count_male_heart_disease = contingency_table.loc[1, 1]
count_female_heart_disease = contingency_table.loc[0, 1]
count_male_total = contingency_table.loc[1, :].sum()
count_female_total = contingency_table.loc[0, :].sum()

count = np.array([count_male_heart_disease, count_female_heart_disease])
nobs = np.array([count_male_total, count_female_total])

stat, p_ztest = proportions_ztest(count, nobs, alternative='two-sided')

print("\nZ-test for Proportions:")
print("Test Statistic:", stat)
print("P-value:", p_ztest)

if p_ztest < 0.05:
    print("\nThere is evidence of a significant difference in the proportions of heart disease between genders.")
else:
    print("\nThere is no significant evidence of a difference in the proportions of heart disease between genders.")


Analysis of Gender

39.35% were females having heart disease
60.65% were males having heart disease

Z-test for Proportions:
Test Statistic: 37.2576033785599
P-value: 7.980467691160784e-304

There is evidence of a significant difference in the proportions of heart disease between genders.


In [12]:
#DiffWalking
filtered_df = df1_temp[(df1_temp['DiffWalking'] == 0) | (df1_temp['DiffWalking'] == 1)]

proportion_heart_disease = filtered_df.groupby('DiffWalking')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('DiffWalking')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had DiffWalking have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had DiffWalking have heart disease"

contingency_table = pd.crosstab([df1_temp['DiffWalking']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Difficulty in Walking\n")
print(f"Count of individuals with heart disease who did not had DiffWalking: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had DiffWalking: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had Difficulty in Walking and did not had Difficulty in Walking: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between DiffWalking and heart disease.")
else:
    print("There is no significant relationship between DiffWalking and heart disease.")

Analysis of Difficulty in Walking

Count of individuals with heart disease who did not had DiffWalking: 31197
Count of individuals with heart disease who did had DiffWalking: 9137
63.62% of individuals who did not had DiffWalking have heart disease
36.38% of individuals who did had DiffWalking have heart disease
Percentage difference in individuals with heart disease between those who had Difficulty in Walking and did not had Difficulty in Walking: 24.41%
Chi-square statistic: 11260.122913747653
P-value: 0.0000000000
There is a significant relationship between DiffWalking and heart disease.


In [13]:
#MentalHealth
print("Analysis of Mental Health\n")
df_heart = df1_temp[['MentalHealth', 'HeartDisease']].copy()

X_bmi_heart = sm.add_constant(df_heart['MentalHealth'])
y_bmi_heart = df_heart['HeartDisease']

model_bmi_heart = sm.Logit(y_bmi_heart, X_bmi_heart)
result_bmi_heart = model_bmi_heart.fit()

print(result_bmi_heart.summary())
contingency_table = pd.crosstab(df1_temp['HeartDisease'], df1_temp['MentalHealth'])

# Perform chi-squared test
chi2, p, _, _ = chi2_contingency(contingency_table)

print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

if (p < 0.5):
    print("There is significant relation between Mental Health and Heart Disease")
else:
    print("There is no significant relation between Mental Health and Heart Disease")

Analysis of Mental Health



Optimization terminated successfully.
         Current function value: 0.297285
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:               285810
Model:                          Logit   Df Residuals:                   285808
Method:                           MLE   Df Model:                            1
Date:                Mon, 04 Dec 2023   Pseudo R-squ.:                0.001105
Time:                        20:32:33   Log-Likelihood:                -84967.
converged:                       True   LL-Null:                       -85061.
Covariance Type:            nonrobust   LLR p-value:                 8.960e-43
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -2.3843      0.007   -320.750      0.000      -2.399      -2.370
MentalHealth     0.0109

In [14]:
#PhysicalHealth
print("Analysis of Physical Health\n")
df_heart = df1_temp[['PhysicalHealth', 'HeartDisease']].copy()

X_bmi_heart = sm.add_constant(df_heart['PhysicalHealth'])
y_bmi_heart = df_heart['HeartDisease']

model_bmi_heart = sm.Logit(y_bmi_heart, X_bmi_heart)
result_bmi_heart = model_bmi_heart.fit()

print(result_bmi_heart.summary())
contingency_table = pd.crosstab(df1_temp['HeartDisease'], df1_temp['PhysicalHealth'])

# Perform chi-squared test
chi2, p, _, _ = chi2_contingency(contingency_table)

print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

if (p < 0.5):
    print("There is significant relation between Physical Health and Heart Disease")
else:
    print("There is no significant relation between Physical Health and Heart Disease")

Analysis of Physical Health

Optimization terminated successfully.
         Current function value: 0.286893
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:               285810
Model:                          Logit   Df Residuals:                   285808
Method:                           MLE   Df Model:                            1
Date:                Mon, 04 Dec 2023   Pseudo R-squ.:                 0.03602
Time:                        20:32:35   Log-Likelihood:                -81997.
converged:                       True   LL-Null:                       -85061.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -2.5927      0.008   -331.061      0.000      -2.608 

In [15]:
#Stroke
filtered_df = df1_temp[(df1_temp['Stroke'] == 0) | (df1_temp['Stroke'] == 1)]

proportion_heart_disease = filtered_df.groupby('Stroke')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('Stroke')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who did not had stroke have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who did had stroke have heart disease"

contingency_table = pd.crosstab([df1_temp['Stroke']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Stroke\n")
print(f"Count of individuals with heart disease who did not had stroke: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who did had stroke: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who had stroke and did not had stroke: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between Stroke and heart disease.")
else:
    print("There is no significant relationship between Stroke and heart disease.")

Analysis of Stroke

Count of individuals with heart disease who did not had stroke: 6842
Count of individuals with heart disease who did had stroke: 3894
84.50% of individuals who did not had stroke have heart disease
15.50% of individuals who did had stroke have heart disease
Percentage difference in individuals with heart disease between those who had stroke and did not had stroke: 12.88%
Chi-square statistic: 10506.697261789051
P-value: 0.0000000000
There is a significant relationship between Stroke and heart disease.


In [16]:
#AlcoholDrinking
filtered_df = df1_temp[(df1_temp['AlcoholDrinking'] == 0) | (df1_temp['AlcoholDrinking'] == 1)]

proportion_heart_disease = filtered_df.groupby('AlcoholDrinking')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('AlcoholDrinking')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who do not drink have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who drink have heart disease"

contingency_table = pd.crosstab([df1_temp['AlcoholDrinking']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Alcohol Drinking\n")
print(f"Count of individuals with heart disease who do not drink: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who drink: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those who drink and do not drink: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between Alcohol Drinking and heart disease.")
else:
    print("There is no significant relationship between Alcohol Drinking and heart disease.")

Analysis of Alcohol Drinking

Count of individuals with heart disease who do not drink: 18063
Count of individuals with heart disease who drink: 1022
95.93% of individuals who do not drink have heart disease
4.07% of individuals who drink have heart disease
Percentage difference in individuals with heart disease between those who drink and do not drink: -2.86%
Chi-square statistic: 300.2789385394036
P-value: 0.0000000000
There is a significant relationship between Alcohol Drinking and heart disease.


In [17]:
#diabetes
filtered_df = df1_temp[(df1_temp['Diabetic'] == 0) | (df1_temp['Diabetic'] == 1)]

proportion_heart_disease = filtered_df.groupby('Diabetic')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('Diabetic')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals without diabetes have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals with diabetes have heart disease"

contingency_table = pd.crosstab([df1_temp['Diabetic']], [df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)

percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Diabetes\n")
print(f"Count of individuals with heart disease without diabetes: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease with diabetes: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between those with and without diabetes: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.10f}")

alpha = 0.05
if p < alpha:
    print("There is a significant relationship between diabetes and heart disease.")
else:
    print("There is no significant relationship between diabetes and heart disease.")

Analysis of Diabetes

Count of individuals with heart disease without diabetes: 32587
Count of individuals with heart disease with diabetes: 8611
65.72% of individuals without diabetes have heart disease
34.28% of individuals with diabetes have heart disease
Percentage difference in individuals with heart disease between those with and without diabetes: 21.78%
Chi-square statistic: 8809.365236991676
P-value: 0.0000000000
There is a significant relationship between diabetes and heart disease.


In [18]:
#smoke
filtered_df = df1_temp[(df1_temp['Smoking'] == 0) | (df1_temp['Smoking'] == 1)]

proportion_heart_disease = filtered_df.groupby('Smoking')['HeartDisease'].mean()
count_heart_disease = filtered_df.groupby('Smoking')['HeartDisease'].sum()
total_count = count_heart_disease.sum()
output_sentence = f"{count_heart_disease[0] / total_count * 100:.2f}% of individuals who doesn't smoke have heart disease\n{count_heart_disease[1] / total_count * 100:.2f}% of individuals who smoke have heart disease"

contingency_table = pd.crosstab([df1_temp['Smoking']],[ df1_temp['HeartDisease']])
chi2, p, _, _ = chi2_contingency(contingency_table)

proportion_heart_disease = contingency_table.iloc[1, :] / contingency_table.sum(axis=0)
percentage_difference = (proportion_heart_disease[1] - proportion_heart_disease[0]) * 100
count_heart_disease = contingency_table.iloc[1, :]
print("Analysis of Smoking\n")
print(f"Count of individuals with heart disease who doesn't smoke: {count_heart_disease[0]}")
print(f"Count of individuals with heart disease who smoke: {count_heart_disease[1]}")
print(output_sentence)
print(f"Percentage difference in individuals with heart disease between who does and doesn't smoke: {percentage_difference:.2f}%")
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p:.5f}")


alpha = 0.05
if p < alpha:
    print("There is a significant relationship between Smoking and heart disease.")
else:
    print("There is no significant relationship between Smoking and heart disease.")


Analysis of Smoking

Count of individuals with heart disease who doesn't smoke: 103698
Count of individuals with heart disease who smoke: 14632
41.75% of individuals who doesn't smoke have heart disease
58.25% of individuals who smoke have heart disease
Percentage difference in individuals with heart disease between who does and doesn't smoke: 18.48%
Chi-square statistic: 3222.5669038707038
P-value: 0.00000
There is a significant relationship between Smoking and heart disease.


In [19]:
#BMI
print("Analysis of BMI\n")
df_bmi_heart = df1_temp[['BMI', 'HeartDisease']].copy()

X_bmi_heart = sm.add_constant(df_bmi_heart['BMI'])
y_bmi_heart = df_bmi_heart['HeartDisease']

model_bmi_heart = sm.Logit(y_bmi_heart, X_bmi_heart)
result_bmi_heart = model_bmi_heart.fit()

print(result_bmi_heart.summary())
contingency_table = pd.crosstab(df1_temp['HeartDisease'], df1_temp['BMI'])

# Perform chi-squared test
chi2, p, _, _ = chi2_contingency(contingency_table)

print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

if (p < 0.5):
    print("There is significant relation between BMI and Heart Disease")
else:
    print("There is no significant relation between BMI and Heart Disease")


Analysis of BMI

Optimization terminated successfully.
         Current function value: 0.296658
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:               285810
Model:                          Logit   Df Residuals:                   285808
Method:                           MLE   Df Model:                            1
Date:                Mon, 04 Dec 2023   Pseudo R-squ.:                0.003212
Time:                        20:32:38   Log-Likelihood:                -84788.
converged:                       True   LL-Null:                       -85061.
Covariance Type:            nonrobust   LLR p-value:                7.467e-121
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.0646      0.031    -98.186      0.000      -3.126      -3.003
BMI         

## Chi Test

### Smoking and Heart disease

In [20]:
heart_data = df_temp

heart_data['Diabetic'] = heart_data['Diabetic'].replace({'Yes': 1, 'Yes (during pregnancy)': 1, 'No': 0, 'No, borderline diabetes': 0})
heart_data['HeartDisease'] = heart_data['HeartDisease'].replace({'Yes': 1, 'No': 0})
heart_data['Smoking'] = heart_data['Smoking'].replace({'Yes': 1, 'No': 0})

unique_age_categories = sorted(heart_data['AgeCategory'].unique())
age_group_results = {}

for age_group in unique_age_categories:
    age_group_data = heart_data[heart_data['AgeCategory'] == age_group]
    contingency_table_age_group = pd.crosstab(age_group_data['Smoking'], age_group_data['HeartDisease'])
    chi2, p, _, _ = chi2_contingency(contingency_table_age_group)

    # Storing results
    age_group_results[age_group] = {
        'Chi-square statistic': chi2,
        'P-value': p
    }

alpha = 0.05

for age_group in unique_age_categories:
    print(f"Age Group: {age_group}")
    print(f"Chi-square statistic: {age_group_results[age_group]['Chi-square statistic']}")
    print(f"P-value: {age_group_results[age_group]['P-value']:.5f}")

    if age_group_results[age_group]['P-value'] < alpha:
        print("There is a significant relationship between Smoking and heart disease in this age group.")
    else:
        print("There is no significant relationship between Smoking and heart disease in this age group.")
    print()


Age Group: 18-24
Chi-square statistic: 4.023934528281783
P-value: 0.04486
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 25-29
Chi-square statistic: 5.6510777502426865
P-value: 0.01744
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 30-34
Chi-square statistic: 26.640443996383116
P-value: 0.00000
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 35-39
Chi-square statistic: 33.12892301213306
P-value: 0.00000
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 40-44
Chi-square statistic: 84.89057922747426
P-value: 0.00000
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 45-49
Chi-square statistic: 136.81254517779274
P-value: 0.00000
There is a significant relationship between Smoking and heart disease in this age group.

Age Group: 50

### Diabetes and Heart Disease

In [21]:
heart_data['Diabetic'] = heart_data['Diabetic'].replace(
    {'Yes': 1, 'Yes (during pregnancy)': 1, 'No': 0, 'No, borderline diabetes': 0})
heart_data['HeartDisease'] = heart_data['HeartDisease'].replace({'Yes': 1, 'No': 0})

unique_age_categories = sorted(heart_data['AgeCategory'].unique())
age_group_results_diabetes = {}

for age_group in unique_age_categories:
    age_group_data = heart_data[heart_data['AgeCategory'] == age_group]
    contingency_table_age_group = pd.crosstab(age_group_data['Diabetic'], age_group_data['HeartDisease'])
    chi2, p, _, _ = chi2_contingency(contingency_table_age_group)

    age_group_results_diabetes[age_group] = {
        'Chi-square statistic': chi2,
        'P-value': p
    }
    
alpha = 0.05

for age_group in unique_age_categories:
    print(f"Age Group: {age_group}")
    print(f"Chi-square statistic: {age_group_results_diabetes[age_group]['Chi-square statistic']}")
    print(f"P-value: {age_group_results_diabetes[age_group]['P-value']:.5f}")

    if age_group_results_diabetes[age_group]['P-value'] < alpha:
        print("There is a significant relationship between Diabetes and heart disease in this age group.")
    else:
        print("There is no significant relationship between Diabetes and heart disease in this age group.")
    print()


Age Group: 18-24
Chi-square statistic: 18.729755505557282
P-value: 0.00002
There is a significant relationship between Diabetes and heart disease in this age group.

Age Group: 25-29
Chi-square statistic: 0.0
P-value: 1.00000
There is no significant relationship between Diabetes and heart disease in this age group.

Age Group: 30-34
Chi-square statistic: 4.255973069286275
P-value: 0.03911
There is a significant relationship between Diabetes and heart disease in this age group.

Age Group: 35-39
Chi-square statistic: 45.26211861050117
P-value: 0.00000
There is a significant relationship between Diabetes and heart disease in this age group.

Age Group: 40-44
Chi-square statistic: 145.34614133192682
P-value: 0.00000
There is a significant relationship between Diabetes and heart disease in this age group.

Age Group: 45-49
Chi-square statistic: 368.3505126849182
P-value: 0.00000
There is a significant relationship between Diabetes and heart disease in this age group.

Age Group: 50-54
Chi-