In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
train_data = pd.read_csv("heart_train.csv")
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,1,1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,2,2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,3,3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,4,4,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0


In [3]:
test_data = pd.read_csv("heart_test.csv")
test_data.head(5)

Unnamed: 0.1,Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,0,643,69,M,ASY,122,216,1,LVH,84,Y,0.0,Flat
1,1,644,74,M,TA,145,216,1,Normal,116,Y,1.8,Flat
2,2,645,66,F,NAP,146,278,0,LVH,152,N,0.0,Flat
3,3,646,53,M,ASY,144,300,1,ST,128,Y,1.5,Flat
4,4,647,41,F,ATA,105,198,0,Normal,168,N,0.0,Up


In [4]:
train_data.drop(columns=train_data.columns[0], axis=1, inplace=True)
test_data.drop(columns=test_data.columns[0], axis=1, inplace=True)

Convert the following data types which are 'object' to 'int64':

In [5]:
train_data.dtypes

id                  int64
Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [10]:
X = train_data[['Age', 'Sex', 'ChestPainType', 'RestingBP','FastingBS','RestingECG','MaxHR','ExerciseAngina','Oldpeak','ST_Slope']].values
X[0:9]

array([[40, 'M', 'ATA', 140, 0, 'Normal', 172, 'N', 0.0, 'Up'],
       [49, 'F', 'NAP', 160, 0, 'Normal', 156, 'N', 1.0, 'Flat'],
       [37, 'M', 'ATA', 130, 0, 'ST', 98, 'N', 0.0, 'Up'],
       [48, 'F', 'ASY', 138, 0, 'Normal', 108, 'Y', 1.5, 'Flat'],
       [39, 'M', 'NAP', 120, 0, 'Normal', 170, 'N', 0.0, 'Up'],
       [54, 'M', 'ATA', 110, 0, 'Normal', 142, 'N', 0.0, 'Up'],
       [37, 'M', 'ASY', 140, 0, 'Normal', 130, 'Y', 1.5, 'Flat'],
       [48, 'F', 'ATA', 120, 0, 'Normal', 120, 'N', 0.0, 'Up'],
       [37, 'F', 'NAP', 130, 0, 'Normal', 142, 'N', 0.0, 'Up']],
      dtype=object)

In [11]:
# convert data types which are object to int64 such that logistic regression could identify the attribute.
from sklearn import preprocessing
Sex = preprocessing.LabelEncoder()
Sex.fit(['F','M'])
X[:,1] = Sex.transform(X[:,1]) 


ChestPainType = preprocessing.LabelEncoder()
ChestPainType.fit([ 'TA', 'ATA', 'NAP','ASY'])
X[:,2] = ChestPainType.transform(X[:,2])

RestingECG = preprocessing.LabelEncoder()
RestingECG.fit(['Normal','ST','LVH'])
X[:,5] = RestingECG.transform(X[:,5])

ExerciseAngina = preprocessing.LabelEncoder()
ExerciseAngina.fit(['Y','N'])
X[:,7] = ExerciseAngina.transform(X[:,7])

ST_Slope = preprocessing.LabelEncoder()
ST_Slope.fit(['Up','Flat','Down'])
X[:,9] = ST_Slope.transform(X[:,9])

X[0:9]

array([[40, 1, 1, 140, 0, 1, 172, 0, 0.0, 2],
       [49, 0, 2, 160, 0, 1, 156, 0, 1.0, 1],
       [37, 1, 1, 130, 0, 2, 98, 0, 0.0, 2],
       [48, 0, 0, 138, 0, 1, 108, 1, 1.5, 1],
       [39, 1, 2, 120, 0, 1, 170, 0, 0.0, 2],
       [54, 1, 1, 110, 0, 1, 142, 0, 0.0, 2],
       [37, 1, 0, 140, 0, 1, 130, 1, 1.5, 1],
       [48, 0, 1, 120, 0, 1, 120, 0, 0.0, 2],
       [37, 0, 2, 130, 0, 1, 142, 0, 0.0, 2]], dtype=object)

In [12]:
y = train_data['HeartDisease']
X = X.astype(np.int64)

Since the dependent variable has already been converted from categorical to numerical data, we do not need to convert it again.

In [13]:
# Assume X and y are already defined
model = LogisticRegression()

k = 10
scores = cross_val_score(model, X, y, cv=k, scoring='accuracy')

print("Accuracy scores for each fold:", scores)
print("Mean accuracy:", np.mean(scores))
print("Standard deviation of accuracy:", np.std(scores))

Accuracy scores for each fold: [0.81538462 0.90769231 0.83076923 0.890625   0.84375    0.8125
 0.875      0.796875   0.8125     0.78125   ]
Mean accuracy: 0.8366346153846154
Standard deviation of accuracy: 0.03973146850415979


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt