In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("data\heart_attack_prediction_dataset.csv")

In [None]:
df.columns

Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk'],
      dtype='object')

In [None]:
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Systolic_BP,Diastolic_BP
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,0,4.168189,Average,0,0,9,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0,158,88
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,1,1.813242,Unhealthy,1,0,1,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0,165,93
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,0,2.078353,Healthy,1,1,9,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0,174,99
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,1,9.82813,Average,1,0,9,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0,163,100
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,0,5.804299,Unhealthy,1,0,6,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0,91,88


In [53]:
import plotly.express as px
import pandas as pd
import plotly.io as pio
import os

def generate_and_save_plots(df, save_dir="plots"):
    # Ensure the save directory exists
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    data = df.copy()

    # Group the data by Hemisphere and Heart Attack Risk
    hemisphere_risk_grouped = data.groupby(['Hemisphere', 'Heart Attack Risk']).size().reset_index(name='Count')
    fig_hemisphere_stacked = px.bar(hemisphere_risk_grouped, x='Hemisphere', y='Count', color='Heart Attack Risk',
                                    title='Heart Attack Risk by Hemisphere',
                                    labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                                    barmode='stack', color_discrete_sequence=px.colors.qualitative.Set2)
    fig_hemisphere_stacked.write_image(f"{save_dir}/hemisphere_heart_attack_risk.png")
    fig_hemisphere_stacked.show()

    # Group by Continent and Heart Attack Risk
    continent_risk_grouped = data.groupby(['Continent', 'Heart Attack Risk']).size().reset_index(name='Count')
    fig_continent_stacked = px.bar(continent_risk_grouped, x='Continent', y='Count', color='Heart Attack Risk',
                                   title='Heart Attack Risk by Continent',
                                   labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                                   barmode='stack', color_discrete_sequence=px.colors.qualitative.Set1)
    fig_continent_stacked.write_image(f"{save_dir}/continent_heart_attack_risk.png")
    fig_continent_stacked.show()

    # Group by Country and Heart Attack Risk
    country_risk_grouped = data.groupby(['Country', 'Heart Attack Risk']).size().reset_index(name='Count')
    fig_stacked_bar = px.bar(country_risk_grouped, x='Country', y='Count', color='Heart Attack Risk',
                             title='Heart Attack Risk by Country',
                             labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                             barmode='stack', color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_stacked_bar.write_image(f"{save_dir}/country_heart_attack_risk.png")
    fig_stacked_bar.show()

    # Heart Attack Risk by Smoking Status and Gender
    heart_risk_data = data[data['Heart Attack Risk'] == 1]
    smoking_gender_group = heart_risk_data.groupby(['Sex', 'Smoking']).size().reset_index(name='Count')
    fig_smoking_gender_grouped = px.bar(smoking_gender_group, x='Sex', y='Count', color='Smoking',
                                        barmode='group', title='Heart Attack Risk by Smoking Status and Gender',
                                        labels={'Smoking': 'Smoking Status (0 = Non-smoker, 1 = Smoker)', 'Count': 'Number of People'},
                                        color_discrete_sequence=px.colors.qualitative.Bold)
    fig_smoking_gender_grouped.write_image(f"{save_dir}/smoking_gender_heart_attack_risk.png")
    fig_smoking_gender_grouped.show()

    # Pie Chart for Average Heart Attack Risk by Gender
    gender_risk = data.groupby('Sex')['Heart Attack Risk'].mean().reset_index()
    fig_pie = px.pie(gender_risk, values='Heart Attack Risk', names='Sex', 
                     title='Average Heart Attack Risk by Gender', color_discrete_sequence=px.colors.qualitative.Set3)
    fig_pie.write_image(f"{save_dir}/gender_heart_attack_risk_pie.png")
    fig_pie.show()

    # Sunburst Chart for Heart Attack Risk by Gender
    fig_sunburst = px.sunburst(heart_risk_data, path=['Sex'], values='Heart Attack Risk', 
                               title='Heart Attack Risk by Gender', color='Sex', color_discrete_sequence=px.colors.qualitative.Pastel1)
    fig_sunburst.write_image(f"{save_dir}/gender_heart_attack_risk_sunburst.png")
    fig_sunburst.show()

    # Violin plot for Cholesterol by Heart Attack Risk
    fig_cholesterol_violin = px.violin(data, y='Cholesterol', x='Heart Attack Risk', color='Heart Attack Risk',
                                       title='Cholesterol Levels by Heart Attack Risk',
                                       box=True, points="all", color_discrete_sequence=px.colors.qualitative.Set1)
    fig_cholesterol_violin.write_image(f"{save_dir}/cholesterol_heart_attack_risk_violin.png")
    fig_cholesterol_violin.show()

    # Age distribution histogram for Heart Attack Risk = 1
    fig_age_dist = px.histogram(heart_risk_data, x='Age', nbins=20, title='Age Distribution for Heart Attack Risk = 1',
                                color_discrete_sequence=px.colors.qualitative.Vivid)
    fig_age_dist.write_image(f"{save_dir}/age_distribution_heart_attack_risk.png")
    fig_age_dist.show()

    # Systolic and Diastolic Blood Pressure by Age Group and Heart Attack Risk
    bp_split = data['Blood Pressure'].str.split('/', expand=True)
    bp_split.columns = ['Systolic_BP', 'Diastolic_BP']
    data['Systolic_BP'] = pd.to_numeric(bp_split['Systolic_BP'], errors='coerce')
    data['Diastolic_BP'] = pd.to_numeric(bp_split['Diastolic_BP'], errors='coerce')
    data['Age Group'] = pd.cut(data['Age'], bins=[0, 30, 45, 60, 100], labels=['<30', '30-45', '45-60', '60+'])

    bp_grouped = data.groupby(['Age Group', 'Heart Attack Risk']).agg({
        'Systolic_BP': 'mean',
        'Diastolic_BP': 'mean'
    }).reset_index()

    # Systolic Blood Pressure
    fig_systolic_stacked = px.bar(bp_grouped, x='Age Group', y='Systolic_BP', color='Heart Attack Risk',
                                  title='Systolic Blood Pressure by Age Group and Heart Attack Risk',
                                  labels={'Systolic_BP': 'Mean Systolic BP', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                                  barmode='stack', color_discrete_sequence=px.colors.qualitative.Set1)
    fig_systolic_stacked.write_image(f"{save_dir}/systolic_bp_heart_attack_risk.png")
    fig_systolic_stacked.show()

    # Diastolic Blood Pressure
    fig_diastolic_stacked = px.bar(bp_grouped, x='Age Group', y='Diastolic_BP', color='Heart Attack Risk',
                                   title='Diastolic Blood Pressure by Age Group and Heart Attack Risk',
                                   labels={'Diastolic_BP': 'Mean Diastolic BP', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                                   barmode='stack', color_discrete_sequence=px.colors.qualitative.Set2)
    fig_diastolic_stacked.write_image(f"{save_dir}/diastolic_bp_heart_attack_risk.png")
    fig_diastolic_stacked.show()

# Call the function by passing the dataframe `df`
# Example usage:
# generate_and_save_plots(df)


In [21]:
generate_and_save_plots(df)

In [13]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# 1. Summary statistics with additional metrics
numerical_cols = ['Age', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 
                  'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 'Previous Heart Problems', 
                  'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI', 
                  'Triglycerides', 'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Heart Attack Risk']
data =df.copy()

# Group the data by Hemisphere and Heart Attack Risk, and count occurrences
hemisphere_risk_grouped = data.groupby(['Hemisphere', 'Heart Attack Risk']).size().reset_index(name='Count')

# Create a stacked bar chart for heart attack risk by hemisphere
fig_hemisphere_stacked = px.bar(hemisphere_risk_grouped, x='Hemisphere', y='Count', color='Heart Attack Risk',
                                title='Heart Attack Risk by Hemisphere (Stacked)',
                                labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                                barmode='stack',
                                color_discrete_sequence=px.colors.qualitative.Set2)

# Show the stacked bar chart
fig_hemisphere_stacked.show()

# Group the data by Continent and Heart Attack Risk, and count occurrences
continent_risk_grouped = data.groupby(['Continent', 'Heart Attack Risk']).size().reset_index(name='Count')

# Create a stacked bar chart for heart attack risk by continent
fig_continent_stacked = px.bar(continent_risk_grouped, x='Continent', y='Count', color='Heart Attack Risk',
                               title='Heart Attack Risk by Continent (Stacked)',
                               labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                               barmode='stack',
                               color_discrete_sequence=px.colors.qualitative.Set1)

# Show the stacked bar chart
fig_continent_stacked.show()




In [None]:
import plotly.express as px

# Group the data by Country and Heart Attack Risk, and calculate the count of each risk level
country_risk_grouped = data.groupby(['Country', 'Heart Attack Risk']).size().reset_index(name='Count')

# Create a stacked bar chart for heart attack risk by country
fig_stacked_bar = px.bar(country_risk_grouped, x='Country', y='Count', color='Heart Attack Risk',
                         title='Heart Attack Risk by Country',
                         labels={'Count': 'Number of People', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                         barmode='stack',
                         color_discrete_sequence=px.colors.qualitative.Pastel)

# Update layout for better visibility
fig_stacked_bar.update_layout(
    xaxis_title="Country",
    yaxis_title="Count of People with Heart Attack Risk",
    legend_title="Heart Attack Risk",
    xaxis={'categoryorder': 'total descending'},  
)

# Show the stacked bar chart
fig_stacked_bar.show()


In [73]:
# Group the data by Country and Heart Attack Risk, and calculate the count for each risk level
country_risk_grouped = data.groupby(['Country', 'Heart Attack Risk']).size().reset_index(name='Count')

# Calculate the total number of people in each country
total_by_country = data.groupby('Country').size().reset_index(name='Total')

# Merge the count data with the total data
country_risk_grouped = country_risk_grouped.merge(total_by_country, on='Country')

# Calculate the percentage of heart attack risk for each country
country_risk_grouped['Percentage'] = (country_risk_grouped['Count'] / country_risk_grouped['Total']) * 100

country_risk_grouped = country_risk_grouped.sort_values(by='Percentage', ascending=False)

# Display the country-wise percentage of heart attack risk
print(country_risk_grouped[country_risk_grouped['Heart Attack Risk']==1])


       Country      Heart Attack Risk  Count  Total  Percentage
29     South Korea          1           163    409    39.853301
25         Nigeria          1           178    448    39.732143
37   United States          1           166    420    39.523810
11        Colombia          1           162    429    37.762238
33        Thailand          1           161    428    37.616822
3        Australia          1           168    449    37.416481
1        Argentina          1           174    471    36.942675
15         Germany          1           172    477    36.058700
7           Canada          1           158    440    35.909091
9            China          1           155    436    35.550459
5           Brazil          1           163    462    35.281385
13          France          1           157    446    35.201794
35  United Kingdom          1           160    457    35.010941
31           Spain          1           150    430    34.883721
39         Vietnam          1           

In [74]:
# Group the data by Continent and Heart Attack Risk, and calculate the count for each risk level
continent_risk_grouped = data.groupby(['Continent', 'Heart Attack Risk']).size().reset_index(name='Count')

# Calculate the total number of people in each continent
total_by_continent = data.groupby('Continent').size().reset_index(name='Total')

# Merge the count data with the total data
continent_risk_grouped = continent_risk_grouped.merge(total_by_continent, on='Continent')

# Calculate the percentage of heart attack risk for each continent
continent_risk_grouped['Percentage'] = (continent_risk_grouped['Count'] / continent_risk_grouped['Total']) * 100

continent_risk_grouped = continent_risk_grouped.sort_values(by='Percentage', ascending=False)

# Display the continent-wise percentage of heart attack risk
print(continent_risk_grouped[continent_risk_grouped['Heart Attack Risk']==1])


     Continent     Heart Attack Risk  Count  Total  Percentage
9   North America          1           324    860    37.674419
1          Africa          1           322    873    36.884307
11  South America          1           499   1362    36.637298
5       Australia          1           319    884    36.085973
3            Asia          1           900   2543    35.391270
7          Europe          1           775   2241    34.582776


In [61]:
pd.DataFrame(df['Heart Attack Risk'].value_counts())

Unnamed: 0_level_0,count
Heart Attack Risk,Unnamed: 1_level_1
0,5624
1,3139


In [62]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['BMI'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,BMI
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,1708
Female,1,944
Male,0,3916
Male,1,2195


In [55]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['BMI'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,BMI
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,28.792706
Female,1,29.151495
Male,0,28.934375
Male,1,28.779852


In [60]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Systolic_BP'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Systolic_BP
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,134.295667
Female,1,137.165254
Male,0,134.890449
Male,1,135.114351


In [63]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Cholesterol'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Cholesterol
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,256.488876
Female,1,263.382415
Male,0,259.677222
Male,1,261.363098


In [64]:
# 90th percentile of Cholesterol
pd.DataFrame(data.groupby(['Sex', 'Heart Attack Risk'])['Cholesterol'].quantile(0.60))

Unnamed: 0_level_0,Unnamed: 1_level_0,Cholesterol
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,282.0
Female,1,289.0
Male,0,288.0
Male,1,289.0


In [65]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Sleep Hours Per Day'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Sleep Hours Per Day
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,7.059719
Female,1,7.003178
Male,0,7.047242
Male,1,6.961731


In [66]:

pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Physical Activity Days Per Week'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Physical Activity Days Per Week
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,3.556206
Female,1,3.443856
Male,0,3.472932
Male,1,3.487472


In [67]:

pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Sedentary Hours Per Day'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Sedentary Hours Per Day
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,6.001238
Female,1,5.935768
Male,0,6.011273
Male,1,5.981359


In [68]:
pd.DataFrame(data.groupby(['Sex','Heart Attack Risk'])['Income'].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,Income
Sex,Heart Attack Risk,Unnamed: 2_level_1
Female,0,156029.522248
Female,1,161390.688559
Male,0,158405.131767
Male,1,158402.97631


In [69]:
import plotly.express as px
import pandas as pd

# Filter data for individuals with Heart Attack Risk = 1
heart_risk_data = data[data['Heart Attack Risk'] == 1]

# Create a new DataFrame to group by Sex and Smoking, then calculate the weighted count
smoking_gender_group = heart_risk_data.groupby(['Sex', 'Smoking']).size().reset_index(name='Count')

# Create a grouped bar chart with a more distinct color palette
fig_smoking_gender_grouped = px.bar(smoking_gender_group, x='Sex', y='Count', color='Smoking',
                                    barmode='group', 
                                    title='Heart Attack Risk by Smoking Status and Gender',
                                    labels={'Smoking': 'Smoking Status (0 = Non-smoker, 1 = Smoker)', 
                                            'Count': 'Number of People'},
                                    color_discrete_sequence=px.colors.qualitative.Bold)

# Update layout for better visualization
fig_smoking_gender_grouped.update_layout(
    xaxis_title="Gender",
    yaxis_title="Count of People with Heart Attack Risk",
    legend_title="Smoking Status"
)

fig_smoking_gender_grouped.show()


In [59]:
    bp_split = data['Blood Pressure'].str.split('/', expand=True)
    bp_split.columns = ['Systolic_BP', 'Diastolic_BP']
    data['Systolic_BP'] = pd.to_numeric(bp_split['Systolic_BP'], errors='coerce')
    data['Diastolic_BP'] = pd.to_numeric(bp_split['Diastolic_BP'], errors='coerce')

In [71]:
import plotly.express as px

# Group by Sex and Heart Attack Risk to get average heart attack risk by gender
gender_risk = data.groupby('Sex')['Heart Attack Risk'].mean().reset_index()

# Pie Chart
fig_pie = px.pie(gender_risk, values='Heart Attack Risk', names='Sex', 
                 title='Average Heart Attack Risk by Gender (Pie Chart)',
                 color_discrete_sequence=px.colors.qualitative.Set3)  # Interesting colors
fig_pie.show()

heart_risk_1 = data[data['Heart Attack Risk'] == 1]
# Alternative Sunburst Chart
fig_sunburst = px.sunburst(heart_risk_1, path=['Sex'], values='Heart Attack Risk', 
                           title='Heart Attack Risk by Gender (Sunburst)',
                           color='Sex', color_discrete_sequence=px.colors.qualitative.Pastel1)
fig_sunburst.show()


In [52]:
# Violin plot for Cholesterol by Heart Attack Risk
fig_cholesterol_violin = px.violin(data, y='Cholesterol', x='Heart Attack Risk', color='Heart Attack Risk',
                                   title='Cholesterol Levels by Heart Attack Risk',
                                   box=True, points="all", color_discrete_sequence=px.colors.qualitative.Set1)
fig_cholesterol_violin.show()



In [53]:
# Filter data for Heart Attack Risk = 1
heart_risk_1 = data[data['Heart Attack Risk'] == 1]

# Plot histogram of age distribution
fig_age_dist = px.histogram(heart_risk_1, x='Age', nbins=20, title='Age Distribution for Heart Attack Risk = 1',
                            color_discrete_sequence=px.colors.qualitative.Vivid)
fig_age_dist.show()


In [89]:
import plotly.express as px
import pandas as pd

# Recreate the age group bins
data['Age Group'] = pd.cut(data['Age'], bins=[0, 30, 45, 60, 100], labels=['<30', '30-45', '45-60', '60+'])

# Group the data by Age Group, Heart Attack Risk, and calculate the mean of systolic and diastolic BP
bp_grouped = data.groupby(['Age Group', 'Heart Attack Risk']).agg({
    'Systolic_BP': 'mean',
    'Diastolic_BP': 'mean'
}).reset_index()

# 1. Stacked Bar Chart for Systolic Blood Pressure by Age Group and Heart Attack Risk
fig_systolic_stacked = px.bar(bp_grouped, x='Age Group', y='Systolic_BP', color='Heart Attack Risk',
                              title='Systolic Blood Pressure by Age Group and Heart Attack Risk',
                              labels={'Systolic_BP': 'Mean Systolic BP', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                              barmode='stack',
                              color_discrete_sequence=px.colors.qualitative.Set1)

# Show the systolic BP stacked chart
fig_systolic_stacked.show()

# 2. Stacked Bar Chart for Diastolic Blood Pressure by Age Group and Heart Attack Risk
fig_diastolic_stacked = px.bar(bp_grouped, x='Age Group', y='Diastolic_BP', color='Heart Attack Risk',
                               title='Diastolic Blood Pressure by Age Group and Heart Attack Risk',
                               labels={'Diastolic_BP': 'Mean Diastolic BP', 'Heart Attack Risk': 'Heart Attack Risk (0 = No, 1 = Yes)'},
                               barmode='stack',
                               color_discrete_sequence=px.colors.qualitative.Set2)

# Show the diastolic BP stacked chart
fig_diastolic_stacked.show()






In [44]:
# eda.py

import pandas as pd
import plotly.express as px
import plotly.io as pio

def create_summary_table(df):
    """
    Create a detailed summary table with additional statistical metrics.
    """
    df = df.select_dtypes(include=['float64', 'int64', 'float32', 'int32'])
    summary = df.describe(include='all').transpose()
    # Additional statistical metrics with rounding to 1 decimal place
    summary['skewness'] = df.skew().round(1)
    summary['kurtosis'] = df.kurtosis().round(1)
    summary['missing_values'] = df.isnull().sum()
    summary['unique_values'] = df.nunique()

    cols_to_int = summary.columns.difference(['skewness', 'kurtosis'])
    summary[cols_to_int] = summary[cols_to_int].astype(int)
    
    return summary



In [50]:
# main.py

import os
import numpy as np

# Define file paths
data_filepath = 'data/heart_attack_prediction_dataset.csv'
output_folder = 'output'

    # Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

    # Load the dataset
df = pd.read_csv(data_filepath)

    # Create summary table
summary = create_summary_table(df)
summary.rename(columns={"skewness": "skew","missing_values": "missing", "unique_values": "unique" }, inplace=True)
summary.reset_index(names="Columns", inplace=True)
summary.Columns = summary.Columns.str.replace(" Per ", "/")
summary.Columns = summary.Columns.str.replace("Physical Activity", "Excercise-")
summary.to_csv(f"{output_folder}/summary_table.csv")
print("Summary table saved.")


Summary table saved.


In [51]:
summary

Unnamed: 0,Columns,count,mean,std,min,25%,50%,75%,max,skew,kurtosis,missing,unique
0,Age,8763,53,21,18,35,54,72,90,0.0,-1.2,0,73
1,Cholesterol,8763,259,80,120,192,259,330,400,-0.0,-1.2,0,281
2,Heart Rate,8763,75,20,40,57,75,93,110,-0.0,-1.2,0,71
3,Diabetes,8763,0,0,0,0,1,1,1,-0.6,-1.6,0,2
4,Family History,8763,0,0,0,0,0,1,1,0.0,-2.0,0,2
5,Smoking,8763,0,0,0,1,1,1,1,-2.6,4.8,0,2
6,Obesity,8763,0,0,0,0,1,1,1,-0.0,-2.0,0,2
7,Alcohol Consumption,8763,0,0,0,0,1,1,1,-0.4,-1.8,0,2
8,Exercise Hours/Week,8763,10,5,0,4,10,15,19,-0.0,-1.2,0,8763
9,Previous Heart Problems,8763,0,0,0,0,0,1,1,0.0,-2.0,0,2


In [None]:
df

In [86]:
df_an['Entity'].value_counts()

Entity
Canada                              73
Netherlands                         73
Australia                           72
Japan                               72
Iceland                             72
United States                       72
Sweden                              72
France                              71
United Kingdom                      71
Spain                               71
Ireland                             71
Switzerland                         71
Denmark                             71
Italy                               70
Finland                             70
Hungary                             68
New Zealand                         67
Mexico                              67
Belgium                             67
Austria                             67
Chile                               67
Norway                              66
Hong Kong                           63
Portugal                            62
Mauritius                           62
Poland            