In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first rows.
csv_path = '/content/heart.csv'
data = pd.read_csv(csv_path)
data.head()

Checking for missing values

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

In [None]:
# Non-missing entries per column, before any cleaning.
data.count(axis='index')

Drop NA values

In [89]:
# Per-row lists of the non-missing values (name kept for any later use).
transactions = [row.dropna().tolist() for _, row in data.iterrows()]

# Actually remove the rows that contain missing values, as this section's
# heading states. Building `transactions` alone leaves `data` untouched, so
# the following `data.count()` check could never show any change.
data = data.dropna().reset_index(drop=True)

In [None]:
# Re-check the per-column non-missing counts after the NA-handling step.
data.count(axis='index')

Label Encoding
Categorical --> Numerical

In [None]:
# Replace each categorical column with integer codes.
# 'Oldpeak' is intentionally NOT in this list: the rest of the notebook treats
# it as a numeric measurement (scatter plots, correlation/covariance matrices,
# t-test). Label-encoding it would swap the real values for ordinal codes of
# the sorted unique values and distort every one of those analyses.
columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# One fitted encoder per column, so each mapping stays available
# (e.g. for `inverse_transform`) instead of being overwritten by the next fit.
label_encoders = {}
for column in columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
print(data.head())

Binning

In [None]:
# Five-year age bands whose edges agree with their labels.
# With right=False every interval is [lo, lo + 5), so '25-29' covers ages
# 25..29 inclusive, '30-34' covers 30..34, and so on up to '75-79'.
# The earlier edges (28, 33, ...) did not correspond to these labels, and the
# open left edge of the first interval turned age 28 into NaN.
bins = [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]
labels = ['25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79']
data['age_bin'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)
print(data.head())

In [None]:
# Preview the first five rows, now including the `age_bin` column.
data.head(n=5)

## **Plotting Graphs**

### **Scatterplot**

In [None]:
# Age against cholesterol, one point per row of `data`.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='Age', y='Cholesterol', data=data)
ax.set_title('Scatter Plot: Age vs Cholesterol')
ax.set_xlabel('Age')
ax.set_ylabel('Cholesterol')
plt.show()

In [None]:
# Age against Oldpeak, one point per row of `data`.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=data, x='Age', y='Oldpeak')
plt.title('Scatter Plot: Age vs Oldpeak')
plt.xlabel('Age')
# The y-axis shows Oldpeak; it was previously mislabelled 'Cholesterol'.
plt.ylabel('Oldpeak')
plt.show()

In [None]:
# MaxHR against Oldpeak, one point per row of `data`.
plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='MaxHR', y='Oldpeak', data=data)
ax.set_title('Scatter Plot: MaxHR vs Oldpeak')
ax.set_xlabel('MaxHR')
ax.set_ylabel('Oldpeak')
plt.show()

### **Boxplot**

In [None]:
# Distribution of Cholesterol; points beyond the whiskers are outlier candidates.
sns.boxplot(y='Cholesterol', data=data)
plt.show()

In [None]:
# Distribution of Oldpeak.
sns.boxplot(y='Oldpeak', data=data)
plt.show()

In [None]:
# Distribution of RestingBP.
sns.boxplot(y='RestingBP', data=data)
plt.show()

### **Identifying and dropping outliers**

In [None]:
# Keep only rows whose Cholesterol lies in the closed range [85, 380],
# then renumber the index from 0.
cholesterol = data['Cholesterol']
within_limits = (cholesterol >= 85) & (cholesterol <= 380)
data = data[within_limits].reset_index(drop=True)
print(data.head())

In [None]:
# Per-column non-missing counts after the Cholesterol filter.
data.count(axis='index')

In [None]:
# Cholesterol distribution again, now on the filtered data.
sns.boxplot(y='Cholesterol', data=data)
plt.show()

### **Correlation matrix**

In [None]:

# Pairwise Pearson correlations (the pandas default) between the numeric columns.
numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'HeartDisease']
numeric_data = data.loc[:, numeric_columns]
correlation_matrix = numeric_data.corr()

### **HeatMap**

In [None]:
# Annotated heatmap of the correlation matrix, colour scale centred on zero.
plt.figure(figsize=(10, 8))
ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Preview the first five rows of the current frame.
data.head(n=5)

Covariance

In [None]:
# Pairwise covariances between the numeric columns.
numeric_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak','HeartDisease']
numeric_data = data.loc[:, numeric_columns]
covariance_matrix = numeric_data.cov()
print("Covariance Matrix:", covariance_matrix, sep="\n")

## **Statistical Techniques**

### **CHI-SQUARE** **TEST**

Testing the association between ChestPainType and HeartDisease

In [None]:
# Chi-square test of independence: ChestPainType vs HeartDisease.
contingency_table = pd.crosstab(index=data['ChestPainType'], columns=data['HeartDisease'])
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
for caption, value in (
    ("Chi-Square Statistic:", chi2_stat),
    ("P-Value:", p_val),
    ("Degrees of Freedom:", dof),
    ("Expected Frequencies:", expected),
):
    print(caption, value)

In [None]:
# Chi-square test of independence: RestingECG vs HeartDisease.
contingency_table = pd.crosstab(index=data['RestingECG'], columns=data['HeartDisease'])
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
for caption, value in (
    ("Chi-Square Statistic:", chi2_stat),
    ("P-Value:", p_val),
    ("Degrees of Freedom:", dof),
    ("Expected Frequencies:", expected),
):
    print(caption, value)

Testing the association between multiple columns and HeartDisease

In [None]:
# Chi-square test of the combined (ChestPainType, RestingECG, ExerciseAngina)
# levels against HeartDisease; each observed combination is one table row.
row_factors = [data['ChestPainType'], data['RestingECG'], data['ExerciseAngina']]
contingency_table = pd.crosstab(row_factors, data['HeartDisease'])
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
for caption, value in (
    ("Chi-Square Statistic:", chi2_stat),
    ("P-Value:", p_val),
    ("Degrees of Freedom:", dof),
):
    print(caption, value)
print("Expected Frequencies:")
print(pd.DataFrame(expected))

## **T-Test**

From the heatmap, we identified that Oldpeak and HeartDisease are strongly correlated. Age and HeartDisease are also correlated.

In [120]:
from scipy.stats import ttest_ind

In [None]:
# Two-sample t-test on Age: rows with HeartDisease == 0 vs HeartDisease == 1.
# (The `_chol` variable names are kept as-is even though they hold Age values.)
group_0_chol = data.loc[data['HeartDisease'] == 0, 'Age']
group_1_chol = data.loc[data['HeartDisease'] == 1, 'Age']
t_stat, p_value = ttest_ind(group_0_chol, group_1_chol)
for caption, value in (("T-Statistic:", t_stat), ("P-Value:", p_value)):
    print(caption, value)

The results of the t-test provide strong evidence that there is a significant difference in mean age between individuals with heart disease and those without.

In [None]:
# Two-sample t-test on Oldpeak: rows with HeartDisease == 0 vs HeartDisease == 1.
# (The `_chol` variable names are kept as-is even though they hold Oldpeak values.)
group_0_chol = data.loc[data['HeartDisease'] == 0, 'Oldpeak']
group_1_chol = data.loc[data['HeartDisease'] == 1, 'Oldpeak']
t_stat, p_value = ttest_ind(group_0_chol, group_1_chol)
for caption, value in (("T-Statistic:", t_stat), ("P-Value:", p_value)):
    print(caption, value)

The negative t-statistic indicates that the group without heart disease tends to have lower average 'Oldpeak' values than the group with heart disease. The small p-value suggests that this difference is unlikely to be due to random chance alone.