In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables) 

data_features = pd.DataFrame(heart_disease.data.features, columns=heart_disease.data.feature_names)
data_targets = pd.Series(heart_disease.data.targets.squeeze(), name='Outcome')

data_features.head()
columns_to_encode = ['sex','cp','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']
heart_df = pd.get_dummies(data_features[columns_to_encode])

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [15]:
heart_df = heart_df[['sex','cp','chol','fbs','restecg', 'exang','oldpeak','slope','ca']].dropna()
X = heart_df.drop(['sex'], axis = 1)
y = heart_df['sex']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.fit_transform(X_test)

knn = KNeighborsClassifier(n_neighbors=15) 
knn.fit(X_train, y_train) 

y_pred = knn.predict(X_test)
print(y_pred)
print(type(y_pred))
print(knn.score(X_test, y_test))

[1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0
 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1]
<class 'numpy.ndarray'>
0.7333333333333333


In [17]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
svd = TruncatedSVD(n_components=8)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.fit_transform(X_test)

model = LogisticRegression(random_state=42).fit(X_train_svd, y_train)

model.score(X_test_svd, y_test)

knn = KNeighborsClassifier(n_neighbors=15) 
knn.fit(X_train_svd, y_train) 

y_pred = knn.predict(X_test_svd)
print(y_pred)
print(type(y_pred))

print(knn.score(X_test_svd, y_test))

[1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
<class 'numpy.ndarray'>
0.7111111111111111


The KNN model performs slightly worse when the Truncated SVD is used to factorize the values. However as the number of components is increased (with a max value of 8), the KNN score gets slightly better, with a score of 0.6555 for n_components=6 and 0.7111 for n_components=8. This can pro