In [200]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [201]:
col_names = ['id', 'Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG',
            'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
data = pd.read_csv("heart_train.csv", header=0, skiprows=1, names=col_names, low_memory=False)

In [202]:
target_column = 'HeartDisease'

# Count the occurrences of each class
class_counts = data[target_column].value_counts()

for target_value, count in class_counts.items():
    print(f"Target value {target_value}: {count} data points")

# Find the minority and majority classes
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()

print(f"Minority class: {minority_class}")
print(f"Majority class: {majority_class}")

Target value 1: 359 data points
Target value 0: 283 data points
Minority class: 0
Majority class: 1


In [203]:
data.head()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
1,1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,4,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
5,5,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0


In [204]:
mapping_sex = {'M': 1, 'F': 0}
mapping_chest_pain = {'TA': 1, 'ATA': 2, 'NAP': 3, 'ASY': 4}
mapping_resting_ecg = {'Normal': 0, 'ST': 1, 'LVH': 2}
mapping_exercise_angina = {'Y': 1, 'N': 0}
mapping_st_slope = {'Up': 1, 'Flat': 2, 'Down': 3}

In [205]:
data['Sex'] = data['Sex'].map(mapping_sex)
data['ChestPainType'] = data['ChestPainType'].map(mapping_chest_pain)
data['RestingECG'] = data['RestingECG'].map(mapping_resting_ecg)
data['ExerciseAngina'] = data['ExerciseAngina'].map(mapping_exercise_angina)
data['ST_Slope'] = data['ST_Slope'].map(mapping_st_slope)

In [206]:
data.head()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
1,1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,4,39,1,3,120,339,0,0,170,0,0.0,1,0
5,5,54,1,2,110,208,0,0,142,0,0.0,1,0


In [207]:
independent_vars = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG',
                    'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
X = data[independent_vars]
y = data['HeartDisease']

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=16)

In [209]:
ros = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.5, random_state=16)

In [211]:
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [212]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8623188405797102


In [213]:
data2 = pd.read_csv("heart_test.csv", header=0, names=col_names[:-1], low_memory=False)

In [214]:
data2['Sex'] = data2['Sex'].map(mapping_sex)
data2['ChestPainType'] = data2['ChestPainType'].map(mapping_chest_pain)
data2['RestingECG'] = data2['RestingECG'].map(mapping_resting_ecg)
data2['ExerciseAngina'] = data2['ExerciseAngina'].map(mapping_exercise_angina)
data2['ST_Slope'] = data2['ST_Slope'].map(mapping_st_slope)

In [215]:
X_test_2 = data2[independent_vars]
X_id = data2['id']

In [216]:
logreg2 = LogisticRegression(max_iter=10000)
logreg2 = logreg.fit(X_train, y_train)
y_pred_2 = logreg.predict(X_test_2)

In [217]:
predictions_df = pd.DataFrame({'id': X_id, 'output': y_pred_2})

In [218]:
predictions_df.to_csv('predictions.csv', index=False)