In [170]:
import pandas as pd
data = pd.read_csv('/content/train.csv')
data.columns

Index(['ID', 'History of HeartDisease or Attack', 'High Blood Pressure',
       'Told High Cholesterol', 'Cholesterol Checked', 'Body Mass Index',
       'Smoked 100+ Cigarettes', 'Diagnosed Stroke', 'Diagnosed Diabetes',
       'Leisure Physical Activity', 'Heavy Alcohol Consumption',
       'Health Care Coverage', 'Doctor Visit Cost Barrier', 'General Health',
       'Difficulty Walking', 'Sex', 'Education Level', 'Income Level', 'Age',
       'Vegetable or Fruit Intake (1+ per Day)'],
      dtype='object')

In [171]:
columns_to_drop = ['Cholesterol Checked','Education Level','Health Care Coverage','Doctor Visit Cost Barrier','Income Level']
data = data.drop(columns = columns_to_drop)
data = data.dropna(subset=['History of HeartDisease or Attack'])
data = data.dropna(subset=['Smoked 100+ Cigarettes'])
data = data.dropna(subset=['Diagnosed Diabetes'])
data = data.dropna(subset=['General Health'])
data['Told High Cholesterol'].fillna('No', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Told High Cholesterol'].fillna('No', inplace=True)


In [172]:
from sklearn.preprocessing import LabelEncoder

history_Encoder = LabelEncoder()
data['History of HeartDisease or Attack']=history_Encoder.fit_transform(data['History of HeartDisease or Attack'])

blood_Encoder = LabelEncoder()
data['High Blood Pressure']=blood_Encoder.fit_transform(data['High Blood Pressure'])

cholesterol_Encoder = LabelEncoder()
data['Told High Cholesterol']=cholesterol_Encoder.fit_transform(data['Told High Cholesterol'])

smoked_Encoder = LabelEncoder()
data['Smoked 100+ Cigarettes']=smoked_Encoder.fit_transform(data['Smoked 100+ Cigarettes'])

stroke_Encoder = LabelEncoder()
data['Diagnosed Stroke']=stroke_Encoder.fit_transform(data['Diagnosed Stroke'])

diabetes_Encoder = LabelEncoder()
data['Diagnosed Diabetes']=diabetes_Encoder.fit_transform(data['Diagnosed Diabetes'])

leisure_Encoder = LabelEncoder()
data['Leisure Physical Activity']=leisure_Encoder.fit_transform(data['Leisure Physical Activity'])

alcohol_Encoder = LabelEncoder()
data['Heavy Alcohol Consumption']=alcohol_Encoder.fit_transform(data['Heavy Alcohol Consumption'])

general_Encoder = LabelEncoder()
data['General Health']=general_Encoder.fit_transform(data['General Health'])

walking_Encoder = LabelEncoder()
data['Difficulty Walking']=walking_Encoder.fit_transform(data['Difficulty Walking'])

sex_Encoder = LabelEncoder()
data['Sex']=sex_Encoder.fit_transform(data['Sex'])

vegetable_Encoder = LabelEncoder()
data['Vegetable or Fruit Intake (1+ per Day)']=vegetable_Encoder.fit_transform(data['Vegetable or Fruit Intake (1+ per Day)'])

In [173]:
from sklearn.preprocessing import StandardScaler

body = StandardScaler()
data['Body Mass Index'] = body.fit_transform(data[['Body Mass Index']])


data['Body Mass Index'].fillna(data['Body Mass Index'].mean(), inplace=True)


age = StandardScaler()
data['Age'] = age.fit_transform(data[['Age']])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Body Mass Index'].fillna(data['Body Mass Index'].mean(), inplace=True)


In [187]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score


X = data.drop(['ID','History of HeartDisease or Attack'], axis=1)
y = data['History of HeartDisease or Attack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = LogisticRegression(random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


model.fit(X_train_resampled, y_train_resampled)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))


Accuracy on test set: 75.17%
              precision    recall  f1-score   support

           0       0.98      0.75      0.85     40652
           1       0.22      0.80      0.35      3625

    accuracy                           0.75     44277
   macro avg       0.60      0.78      0.60     44277
weighted avg       0.92      0.75      0.81     44277



In [192]:
test_data = pd.read_csv('/content/test.csv')
test_data.columns

Index(['ID', 'High Blood Pressure', 'Told High Cholesterol',
       'Cholesterol Checked', 'Body Mass Index', 'Smoked 100+ Cigarettes',
       'Diagnosed Stroke', 'Diagnosed Diabetes', 'Leisure Physical Activity',
       'Heavy Alcohol Consumption', 'Health Care Coverage',
       'Doctor Visit Cost Barrier', 'General Health', 'Difficulty Walking',
       'Sex', 'Education Level', 'Income Level', 'Age',
       'Vegetable or Fruit Intake (1+ per Day)'],
      dtype='object')

In [193]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

columns_to_drop = ['Cholesterol Checked','Education Level','Health Care Coverage','Doctor Visit Cost Barrier','Income Level']
test_data = test_data.drop(columns = columns_to_drop)


blood_Encoder = LabelEncoder()
test_data['High Blood Pressure']=blood_Encoder.fit_transform(test_data['High Blood Pressure'])

cholesterol_Encoder = LabelEncoder()
test_data['Told High Cholesterol']=cholesterol_Encoder.fit_transform(test_data['Told High Cholesterol'])

smoked_Encoder = LabelEncoder()
test_data['Smoked 100+ Cigarettes']=smoked_Encoder.fit_transform(test_data['Smoked 100+ Cigarettes'])

stroke_Encoder = LabelEncoder()
test_data['Diagnosed Stroke']=stroke_Encoder.fit_transform(test_data['Diagnosed Stroke'])

diabetes_Encoder = LabelEncoder()
test_data['Diagnosed Diabetes']=diabetes_Encoder.fit_transform(test_data['Diagnosed Diabetes'])

leisure_Encoder = LabelEncoder()
test_data['Leisure Physical Activity']=leisure_Encoder.fit_transform(test_data['Leisure Physical Activity'])

alcohol_Encoder = LabelEncoder()
test_data['Heavy Alcohol Consumption']=alcohol_Encoder.fit_transform(test_data['Heavy Alcohol Consumption'])

general_Encoder = LabelEncoder()
test_data['General Health']=general_Encoder.fit_transform(test_data['General Health'])

walking_Encoder = LabelEncoder()
test_data['Difficulty Walking']=walking_Encoder.fit_transform(test_data['Difficulty Walking'])

sex_Encoder = LabelEncoder()
test_data['Sex']=sex_Encoder.fit_transform(test_data['Sex'])

vegetable_Encoder = LabelEncoder()
test_data['Vegetable or Fruit Intake (1+ per Day)']=vegetable_Encoder.fit_transform(test_data['Vegetable or Fruit Intake (1+ per Day)'])


body = StandardScaler()
test_data['Body Mass Index'] = body.fit_transform(test_data[['Body Mass Index']])


test_data['Body Mass Index'].fillna(test_data['Body Mass Index'].mean(), inplace=True)


age = StandardScaler()
test_data['Age'] = age.fit_transform(test_data[['Age']])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Body Mass Index'].fillna(test_data['Body Mass Index'].mean(), inplace=True)


In [195]:
output = test_data[['ID']].copy()
X = test_data.drop(['ID'], axis=1)
pred = model.predict(X)  # ทำนายผลลัพธ์
output['History of HeartDisease or Attack'] = pred
output.to_csv('submission.csv', index=False)

In [196]:
output['History of HeartDisease or Attack'].value_counts()

Unnamed: 0_level_0,count
History of HeartDisease or Attack,Unnamed: 1_level_1
0,50865
1,23496


In [197]:
output['History of HeartDisease or Attack'] = output['History of HeartDisease or Attack'].replace({0: 'No', 1: 'Yes'})

print(output.head())


            ID History of HeartDisease or Attack
0  test_000001                               Yes
1  test_000002                                No
2  test_000003                               Yes
3  test_000004                                No
4  test_000005                                No


In [199]:
output.to_csv('submission_2.csv', index=False)