In [15]:
# Feature Engineering
# Select the top 10 features based on the chi-squared test
import pandas as pd

# Load your dataset into a pandas DataFrame
data = pd.read_csv('heart.csv.xls')

# Now you can use the 'data' variable
X = data.drop('target', axis=1)
y = data['target']

from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(chi2, k=10)
x_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

print('Selected Features: ', selected_features)


Selected Features:  Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak',
       'slope', 'ca'],
      dtype='object')


In [16]:
# Notes on the datasets
# target column is the heart-disease-status, where, 0 = no heart - disease
# 1, presence of the heart-disease

In [17]:
# Feature scaling 
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalize the features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)


In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('heart.csv.xls')

# Display the first few rows of the dataset
print(df.head())
print(df.info())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5 

Feature Engineering

In [2]:
from sklearn.feature_selection import SelectKBest, chi2

# Separate features and target variable
X = df.drop('target', axis=1)
y = df['target']

# Apply SelectKBest with chi2
chi2_selector = SelectKBest(chi2, k='all')
X_kbest = chi2_selector.fit_transform(X, y)

# Get the scores for each feature
chi2_scores = chi2_selector.scores_
print(chi2_scores)

[ 23.28662399   7.57683451  62.59809791  14.8239245   23.93639448
   0.20293368   2.97827075 188.32047169  38.91437697  72.64425301
   9.8040952   66.44076512   5.79185297]


In [3]:
from sklearn.feature_selection import f_classif

# Apply SelectKBest with ANOVA
anova_selector = SelectKBest(f_classif, k='all')
X_anova = anova_selector.fit_transform(X, y)

# Get the scores for each feature
anova_scores = anova_selector.scores_
print(anova_scores)

[16.11669982 25.79219115 69.77227149  6.45816867  2.20298345  0.23694234
  5.77720891 65.1201044  70.95243822 68.55143941 40.90207063 54.5598338
 40.40769615]


In [4]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(max_iter=1000)

# Apply RFE
rfe_selector = RFE(model, n_features_to_select=5)
X_rfe = rfe_selector.fit_transform(X, y)

# Get the ranking of features
rfe_ranking = rfe_selector.ranking_
print(rfe_ranking)

[9 1 2 6 8 7 4 5 1 3 1 1 1]


In [5]:
from sklearn.linear_model import LassoCV

# Apply LassoCV
lasso = LassoCV(cv=5)
lasso.fit(X, y)

# Get the coefficients
lasso_coefficients = lasso.coef_
print(lasso_coefficients)

[-0.00062785 -0.17883011  0.11255077 -0.00202927 -0.0003682   0.
  0.03472925  0.00336866 -0.12461461 -0.06254206  0.06357207 -0.09679258
 -0.11236982]


Data Splitting

In [6]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Training set size:', X_train.shape)
print('Testing set size:', X_test.shape)

Training set size: (242, 13)
Testing set size: (61, 13)


In [7]:
from sklearn.model_selection import cross_val_score, KFold

# Initialize k-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X, y, cv=kf)
print('k-Fold CV scores:', cv_scores)
print('Mean CV score:', cv_scores.mean())

k-Fold CV scores: [0.8852459  0.83606557 0.81967213 0.71666667 0.86666667]
Mean CV score: 0.8248633879781421


Supervised Learning Algorithms

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train the model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred)
print('Logistic Regression Accuracy:', log_reg_accuracy)

Logistic Regression Accuracy: 0.8852459016393442


In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print('KNN Accuracy:', knn_accuracy)

KNN Accuracy: 0.6885245901639344


In [11]:
from sklearn.svm import SVC

# Initialize and train the model
svm = SVC()
svm.fit(X_train, y_train)

# Predict and evaluate
y_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print('SVM Accuracy:', svm_accuracy)

SVM Accuracy: 0.7049180327868853


In [12]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Predict and evaluate
y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred)
print('Decision Tree Accuracy:', dt_accuracy)

Decision Tree Accuracy: 0.819672131147541


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print('Random Forest Accuracy:', rf_accuracy)

Random Forest Accuracy: 0.8360655737704918


Unsupervised Learning

In [14]:
from sklearn.cluster import KMeans

# Initialize and fit the model
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Get the cluster labels
cluster_labels = kmeans.labels_
print('Cluster labels:', cluster_labels)

Cluster labels: [1 1 0 1 2 0 2 1 0 0 1 1 1 0 1 0 2 1 1 1 1 1 0 1 0 2 0 0 2 0 0 0 0 1 0 0 2
 1 1 2 2 1 0 1 2 2 1 1 0 1 1 2 1 0 1 0 0 1 0 2 1 2 0 0 0 0 0 1 0 0 1 0 0 1
 0 1 1 0 0 1 1 2 2 2 1 2 1 0 0 1 1 0 0 1 0 1 2 1 2 1 1 1 0 1 0 0 1 1 1 1 2
 0 2 0 1 0 0 0 0 1 2 1 1 1 0 0 0 1 0 1 0 1 2 1 2 1 0 0 0 1 2 2 0 0 0 1 1 1
 0 0 1 0 1 1 0 0 1 0 0 0 1 2 0 0 0 1 1 1 1 0 1 0 1 0 0 0 1 2 0 1 2 1 2 1 1
 1 1 1 1 0 2 0 0 1 0 2 1 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 2 1 2 1 2 1 1 2 0
 1 1 1 0 1 0 1 2 1 1 1 1 2 2 2 2 2 1 1 1 0 1 0 1 2 1 1 1 2 1 2 2 1 2 1 0 1
 1 1 1 1 1 0 0 2 0 1 1 1 1 1 1 1 0 0 1 2 0 2 0 0 0 0 2 0 1 2 0 0 2 1 0 0 0
 0 0 1 1 0 0 1]


In [15]:
from sklearn.decomposition import PCA

# Initialize and fit the model
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print('PCA components:', X_pca)

PCA components: [[-1.22673448e+01 -2.87383781e+00]
 [ 2.69013712e+00  3.98713736e+01]
 [-4.29502141e+01  2.36368199e+01]
 [-1.09447564e+01  2.84380358e+01]
 [ 1.06979053e+02  1.58744678e+01]
 [-5.36472038e+01 -4.00392377e+00]
 [ 4.80821551e+01  3.03896086e+00]
 [ 1.55660590e+01  2.62745637e+01]
 [-4.54388962e+01  7.42977739e+00]
 [-7.73163161e+01  1.99100759e+01]
 [-6.94323738e+00  9.17536374e+00]
 [ 2.84259379e+01 -8.47126467e+00]
 [ 1.92477377e+01  2.24212575e+01]
 [-3.58055558e+01 -5.70343964e+00]
 [ 3.75886896e+01  1.02579437e+01]
 [-2.79998518e+01  9.64570379e+00]
 [ 9.29750393e+01  2.42584272e+01]
 [-1.86307973e+01 -3.92597411e+01]
 [ 1.00245407e+00  2.10404091e+01]
 [-6.28419048e+00 -2.34469473e+00]
 [-1.19811457e+01  9.66308502e+00]
 [-1.39453649e+01  3.05172542e+01]
 [-2.05255102e+01  2.87200024e+01]
 [-2.03216541e+00 -1.55254511e+01]
 [-4.75528071e+01  2.85622656e+01]
 [ 5.75441203e+01  7.18762436e+00]
 [-3.31978406e+01  3.78773342e+00]
 [-7.20962852e+01 -2.45271249e+01]
 [ 1

In [19]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

df_apriori = df.gt(0).astype(bool)  # More efficient conversion to boolean

# Apply Apriori algorithm with placeholder values (replace with your desired settings)
frequent_itemsets = apriori(df_apriori, min_support=0.1, use_colnames=True)

# Generate association rules with placeholder values (replace with your desired settings)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)
print('Association rules:', rules)


Association rules:        antecedents                                        consequents  \
0            (age)                                              (sex)   
1            (sex)                                              (age)   
2             (cp)                                              (age)   
3            (age)                                               (cp)   
4            (age)                                         (trestbps)   
...            ...                                                ...   
148457    (target)  (chol, thalach, oldpeak, restecg, thal, cp, ag...   
148458      (thal)  (chol, thalach, oldpeak, restecg, target, cp, ...   
148459        (cp)  (chol, thalach, oldpeak, restecg, target, thal...   
148460       (age)  (chol, thalach, oldpeak, restecg, target, thal...   
148461  (trestbps)  (chol, thalach, oldpeak, restecg, target, thal...   

        antecedent support  consequent support   support  confidence  \
0                 1.000000      