In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv


In [2]:
#Load the dataset
data = pd.read_csv('/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv', sep=';')

In [3]:
data.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
data = data.loc[:, ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']]



In [5]:
# Convert age from days to years
data['age'] = data['age'] // 365.25  # Assuming a year is 365.25 days for leap years


In [6]:
data.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.0,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.0,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51.0,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48.0,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47.0,1,156,56.0,100,60,1,1,0,0,0,0


In [7]:

# Assuming the column representing age is 'age' and cholesterol is 'cholesterol'
age_threshold = 50

# Create a new column indicating whether the person is over 50
data['over_50'] = data['age'] > age_threshold

# Calculate the average cholesterol for people over 50 and under 50
average_cholesterol_over_50 = data[data['over_50']]['cholesterol'].mean()
average_cholesterol_under_50 = data[~data['over_50']]['cholesterol'].mean()

print(f"The average cholesterol for people over 50 is {average_cholesterol_over_50:.2f}")
print(f"The average cholesterol for people under 50 is {average_cholesterol_under_50:.2f}")

# Perform a t-test or another statistical test if needed


The average cholesterol for people over 50 is 1.43
The average cholesterol for people under 50 is 1.25


In [8]:
# Calculate smoking rates for men and women
smoking_rates = data.groupby('gender')['smoke'].mean()

# Print the smoking rates for men and women
print(f"Smoking rate for men: {smoking_rates[1]:.2%}")
print(f"Smoking rate for women: {smoking_rates[2]:.2%}")

# Compare smoking rates
if smoking_rates[1] > smoking_rates[2]:
    print("Men are more likely to be smokers than women.")
elif smoking_rates[1] < smoking_rates[2]:
    print("Women are more likely to be smokers than men.")
else:
    print("The smoking rates for men and women are the same.")

Smoking rate for men: 1.79%
Smoking rate for women: 21.89%
Women are more likely to be smokers than men.


In [9]:
# Calculate the height of the tallest 1% of people
tallest_1_percent_height = data['height'].quantile(0.99)

print(f"The height of the tallest 1% of people is {tallest_1_percent_height:.2f} cm.")

The height of the tallest 1% of people is 184.00 cm.


In [10]:
#we can remove the person's Id from given index
data.drop(columns='id',inplace=True)

In [11]:
data.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,over_50
0,50.0,2,168,62.0,110,80,1,1,0,0,1,0,False
1,55.0,1,156,85.0,140,90,3,1,0,0,1,1,True
2,51.0,1,165,64.0,130,70,3,1,0,0,0,1,True
3,48.0,2,169,82.0,150,100,1,1,0,0,1,1,False
4,47.0,1,156,56.0,100,60,1,1,0,0,0,0,False


In [12]:
# Calculate the Spearman rank correlation matrix
spearman_corr_matrix = data.corr(method='spearman')

# Find the pair of features with the highest Spearman rank correlation
max_corr = spearman_corr_matrix.abs().stack().idxmax()
feature1, feature2 = max_corr

# Get the correlation coefficient
correlation_coefficient = spearman_corr_matrix.loc[feature1, feature2]

print(f"The two features with the highest Spearman rank correlation are: {feature1} and {feature2}")
print(f"The correlation coefficient is: {correlation_coefficient:.4f}")

The two features with the highest Spearman rank correlation are: age and age
The correlation coefficient is: 1.0000


In [13]:
# Calculate the z-scores for height
data['height_zscore'] = (data['height'] - data['height'].mean()) / data['height'].std()

# Identify ages more than 2 standard deviations away from the average height
outliers_percentage = (data['height_zscore'].abs() > 2).mean() * 100

print(f"The percentage of ages more than 2 standard deviations away from the average height is: {outliers_percentage:.2f}%")

The percentage of ages more than 2 standard deviations away from the average height is: 3.34%


In [14]:
# Filter individuals over 50 years old
over_50 = data[data['age'] > 50]

# Calculate the percentage of individuals over 50 who consume alcohol
alcohol_percentage_over_50 = (over_50['alco'] == 1).mean() * 100

print(f"The percentage of individuals over 50 years old who consume alcohol is: {alcohol_percentage_over_50:.2f}%")


The percentage of individuals over 50 years old who consume alcohol is: 4.95%
