# Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor  # Example model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Load dataset

In [2]:
df = pd.read_csv("heart_attack_risk_dataset.csv")

# Display basic info

In [3]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      50000 non-null  int64  
 1   Gender                   50000 non-null  object 
 2   Smoking                  49997 non-null  float64
 3   Alcohol_Consumption      50000 non-null  int64  
 4   Physical_Activity_Level  49997 non-null  object 
 5   BMI                      50000 non-null  float64
 6   Diabetes                 50000 non-null  int64  
 7   Hypertension             50000 non-null  int64  
 8   Cholesterol_Level        49999 non-null  float64
 9   Resting_BP               50000 non-null  int64  
 10  Heart_Rate               49996 non-null  float64
 11  Family_History           50000 non-null  int64  
 12  Stress_Level             50000 non-null  object 
 13  Chest_Pain_Type          50000 non-null  object 
 14  Thalassemia           

# Check for missing values

In [4]:
print(df.isnull().sum())

Age                        0
Gender                     0
Smoking                    3
Alcohol_Consumption        0
Physical_Activity_Level    3
BMI                        0
Diabetes                   0
Hypertension               0
Cholesterol_Level          1
Resting_BP                 0
Heart_Rate                 4
Family_History             0
Stress_Level               0
Chest_Pain_Type            0
Thalassemia                0
Fasting_Blood_Sugar        2
ECG_Results                0
Exercise_Induced_Angina    2
Max_Heart_Rate_Achieved    0
Heart_Attack_Risk          0
dtype: int64


# Data Cleaning

In [8]:
# for loop orqali 4 line coding bn osongina tushib qolgan qiymatlarni filling qilamiz
for col in df.columns:
    if df[col].dtype==object:
     df[col].fillna(df[col].mode()[0],inplace=True)
    else:   
      df[col].fillna(df[col].mean(),inplace=True)    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [None]:
print(df.isnull().sum()) # ham filling qildik tushib qolgan qiymatlarni

Age                        0
Gender                     0
Smoking                    0
Alcohol_Consumption        0
Physical_Activity_Level    0
BMI                        0
Diabetes                   0
Hypertension               0
Cholesterol_Level          0
Resting_BP                 0
Heart_Rate                 0
Family_History             0
Stress_Level               0
Chest_Pain_Type            0
Thalassemia                0
Fasting_Blood_Sugar        0
ECG_Results                0
Exercise_Induced_Angina    0
Max_Heart_Rate_Achieved    0
Heart_Attack_Risk          0
dtype: int64


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      50000 non-null  int64  
 1   Gender                   50000 non-null  object 
 2   Smoking                  50000 non-null  float64
 3   Alcohol_Consumption      50000 non-null  int64  
 4   Physical_Activity_Level  50000 non-null  object 
 5   BMI                      50000 non-null  float64
 6   Diabetes                 50000 non-null  int64  
 7   Hypertension             50000 non-null  int64  
 8   Cholesterol_Level        50000 non-null  float64
 9   Resting_BP               50000 non-null  int64  
 10  Heart_Rate               50000 non-null  float64
 11  Family_History           50000 non-null  int64  
 12  Stress_Level             50000 non-null  object 
 13  Chest_Pain_Type          50000 non-null  object 
 14  Thalassemia           

# Encoding

In [11]:
df_encoded=df.copy()

In [22]:
cardinality=df.nunique()

In [24]:
cardinality

Age                          72
Gender                        2
Smoking                       3
Alcohol_Consumption           2
Physical_Activity_Level       3
BMI                        2501
Diabetes                      2
Hypertension                  2
Cholesterol_Level          1502
Resting_BP                   90
Heart_Rate                   71
Family_History                2
Stress_Level                  3
Chest_Pain_Type               4
Thalassemia                   3
Fasting_Blood_Sugar           3
ECG_Results                   3
Exercise_Induced_Angina       3
Max_Heart_Rate_Achieved     100
Heart_Attack_Risk             3
dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder  
for col in df_encoded.select_dtypes(include=['object']).columns:
    cardinality=df_encoded[col].nunique()
    if cardinality<=5:
        le=LabelEncoder()
        df_encoded[col]=le.fit_transform(df_encoded[col])
    else:
        df_encoded=pd.get_dummies(df_encoded,columns=[col],dtype=int, drop_first=True)
df=df_encoded

In [None]:
df.info()   # hamma data type int va float ga o'zgartirib oldik

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      50000 non-null  int64  
 1   Gender                   50000 non-null  int64  
 2   Smoking                  50000 non-null  float64
 3   Alcohol_Consumption      50000 non-null  int64  
 4   Physical_Activity_Level  50000 non-null  int64  
 5   BMI                      50000 non-null  float64
 6   Diabetes                 50000 non-null  int64  
 7   Hypertension             50000 non-null  int64  
 8   Cholesterol_Level        50000 non-null  float64
 9   Resting_BP               50000 non-null  int64  
 10  Heart_Rate               50000 non-null  float64
 11  Family_History           50000 non-null  int64  
 12  Stress_Level             50000 non-null  int64  
 13  Chest_Pain_Type          50000 non-null  int64  
 14  Thalassemia           

# Scaling

In [14]:
num_col=df.select_dtypes(include=['int64','float64']).columns

In [15]:
num_col

Index(['Age', 'Gender', 'Smoking', 'Alcohol_Consumption',
       'Physical_Activity_Level', 'BMI', 'Diabetes', 'Hypertension',
       'Cholesterol_Level', 'Resting_BP', 'Heart_Rate', 'Family_History',
       'Stress_Level', 'Chest_Pain_Type', 'Thalassemia', 'Fasting_Blood_Sugar',
       'ECG_Results', 'Exercise_Induced_Angina', 'Max_Heart_Rate_Achieved',
       'Heart_Attack_Risk'],
      dtype='object')

In [25]:
scaler=StandardScaler()
df[num_col]=scaler.fit_transform(df[num_col])

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      50000 non-null  int64  
 1   Gender                   50000 non-null  int64  
 2   Smoking                  50000 non-null  float64
 3   Alcohol_Consumption      50000 non-null  int64  
 4   Physical_Activity_Level  50000 non-null  int64  
 5   BMI                      50000 non-null  float64
 6   Diabetes                 50000 non-null  int64  
 7   Hypertension             50000 non-null  int64  
 8   Cholesterol_Level        50000 non-null  float64
 9   Resting_BP               50000 non-null  int64  
 10  Heart_Rate               50000 non-null  float64
 11  Family_History           50000 non-null  int64  
 12  Stress_Level             50000 non-null  int64  
 13  Chest_Pain_Type          50000 non-null  int64  
 14  Thalassemia           

In [26]:
df.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,0.750106,-0.996566,1.533418,-0.819869,1.069353,0.98139,2.007279,-0.650572,-1.683829,1.411391,-0.472052,-0.655184,0.89697,0.439608,1.223253,-0.419047,-0.001792,-0.502553,-1.224884,-0.138032
1,-1.028843,1.003446,-0.652177,-0.819869,1.069353,-0.660109,-0.498187,-0.650572,-1.344465,-0.325544,0.419666,-0.655184,-0.386984,-1.34858,-0.003902,-0.419047,1.225394,-0.502553,0.822063,1.290276
2,1.7117,1.003446,-0.652177,1.219707,1.069353,1.079658,-0.498187,-0.650572,1.091101,-0.44134,1.608622,-0.655184,-0.386984,1.333703,1.223253,-0.419047,1.225394,-0.502553,-1.398354,-0.138032
3,1.182823,1.003446,-0.652177,1.219707,1.069353,-1.285706,2.007279,-0.650572,0.292328,0.36923,1.509543,-0.655184,-0.386984,1.333703,-1.231058,2.38646,-1.228977,-0.502553,-0.704474,-0.138032
4,-0.740365,-0.996566,1.533418,-0.819869,1.069353,-1.06564,-0.498187,-0.650572,-0.40025,-0.44134,0.617825,-0.655184,-1.670938,-1.34858,1.223253,-0.419047,1.225394,-0.502553,-0.877944,1.290276


In [29]:
x=df.drop(columns=['Heart_Attack_Risk'])   

In [30]:
y=df['Heart_Attack_Risk']

In [32]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)    # Datani qismlarga ajratish
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [33]:
model=LinearRegression()
linear_model=model.fit(x_train,y_train)
y_pred=linear_model.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [None]:
mse # qancha past chiqsa model yaxshi ishlaydi

0.9718172533922849

In [None]:
r2  # qancha yuqori chiqsa modelni past baxolaydi

-0.001446422977203321

In [36]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)  # 5-Fold CV

# Perform Cross-Validation
cv_scores = cross_val_score(linear_model, x, y, cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.98849912 0.99780385 1.00215941 1.00051529 1.01550324 0.99670645
 0.99624858 1.01315905 0.98648877 1.00431179]
Mean CV Score: 1.0001395549471854


In [37]:
print(np.mean(cv_scores)-mse)

0.028322301554900564
