<a href="https://colab.research.google.com/github/tanuku-srivalli/Heart-Disease-Prediction-UCI-/blob/main/AIML_TASK_1_%3B_HEART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
from google.colab import files
import io

# 1. Prompt the user to upload the file
print("Please upload the 'heart.csv' file now.")
uploaded = files.upload()

# 2. Check if the file was uploaded
if 'heart.csv' in uploaded:
    # 3. Read the file into a Pandas DataFrame using a StringIO buffer
    # The uploaded file content is in bytes, which is decoded and read by io.StringIO
    df = pd.read_csv(io.StringIO(uploaded['heart.csv'].decode('utf-8')))

    print("\n✅ heart.csv loaded successfully!")
    print("\nInitial Data Inspection (First 5 rows):")

    # Display the first few rows to confirm loading
    print(df.head())

    # Run a quick check on the data types and nulls
    print("\nData Information:")
    df.info()

else:
    print("\n❌ Upload failed or the file name was incorrect. Please ensure you upload 'heart.csv'.")

Please upload the 'heart.csv' file now.


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Assuming 'df' is the DataFrame loaded in the previous step

# Define feature lists based on typical heart.csv structure
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
target_col = 'target'

print("Starting Exploratory Data Analysis (EDA)...")

In [None]:
# ----------------------------------------------------------------------
# FIX: Import the plotting libraries if they haven't been imported yet
# ----------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set a style for the plots (optional, but good for aesthetics)
sns.set_style("whitegrid")

# Define feature lists (assuming 'df' from the successful upload is available)
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
target_col = 'target'

print("Starting Exploratory Data Analysis (EDA)...")
# ----------------------------------------------------------------------


# --- 2A. Numerical Feature Distributions ---
print("\n--- 2A. Numerical Feature Distributions (Histograms and Box Plots) ---")

# 1. Histograms for Distributions
df[numerical_cols].hist(figsize=(15, 10), bins=20, edgecolor='black', grid=False)
plt.suptitle('Distribution of Numerical Features (Histograms)', y=1.02, fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()

# 2. Box Plots for Outlier Detection and Target Comparison
plt.figure(figsize=(15, 12))
for i, col in enumerate(numerical_cols):
    plt.subplot(3, 2, i + 1)
    # Check for outliers and how the distribution changes based on the target
    sns.boxplot(x=target_col, y=col, data=df, palette='viridis')
    plt.title(f'Box Plot of {col} by Target')
    plt.xlabel('Heart Disease (0=No, 1=Yes)')
plt.tight_layout()
plt.show()

# ----------------------------------------------------------------------

# --- 2B. Categorical Feature Counts ---
print("\n--- 2B. Categorical Feature Counts ---")
plt.figure(figsize=(18, 10))
for i, col in enumerate(categorical_cols):
    plt.subplot(3, 3, i + 1)
    sns.countplot(x=col, data=df, palette='Set2', edgecolor='black')
    plt.title(f'Count Plot of {col}')
    plt.xlabel(col)
plt.tight_layout()
plt.show()

# ----------------------------------------------------------------------

# --- 3. Bivariate Analysis (Feature vs. Target) ---
print("\n--- 3. Categorical Features vs. Target (Heart Disease) ---")
plt.figure(figsize=(18, 15))
for i, col in enumerate(categorical_cols):
    plt.subplot(3, 3, i + 1)

    # Plot as a grouped bar chart
    pd.crosstab(df[col], df[target_col]).plot(kind='bar', stacked=False, ax=plt.gca(), cmap='coolwarm')

    plt.title(f'{col} vs. Target', fontsize=14)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.legend(title='Target (0=No, 1=Yes)')

plt.tight_layout()
plt.show()

# ----------------------------------------------------------------------

# --- 4. Correlation Analysis (Heatmap) ---
print("\n--- 4. Correlation Analysis (Heatmap) ---")

# Calculate the correlation matrix for all columns
corr_matrix = df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            linewidths=0.5,
            linecolor='black')
plt.title('Correlation Matrix of All Features', fontsize=16)
plt.show()

# Print correlations with the target variable
print("\nCorrelation with Target Variable:")
print(corr_matrix[target_col].sort_values(ascending=False))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- 1. Separate Features (X) and Target (y) ---
X = df.drop('target', axis=1) # Features (all columns except 'target')
y = df['target']             # Target variable

# --- 2. Handle Categorical Variables (One-Hot Encoding) ---
# We use pd.get_dummies() to convert categorical variables into dummy/indicator variables.
# This avoids incorrectly treating the categorical codes (like 0, 1, 2, 3) as numerical values.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Shape of data after One-Hot Encoding:")
print(X_encoded.shape)
print("\nFirst 5 rows of encoded data:")
print(X_encoded.head())

# --- 3. Feature Scaling (Standardization) ---
# Scaling is applied only to the numerical columns.

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical columns
X_encoded[numerical_cols] = scaler.fit_transform(X_encoded[numerical_cols])

print("\nFirst 5 rows of scaled data (Numerical columns are standardized):")
print(X_encoded.head())

# --- 4. Split the Data into Training and Testing Sets ---
# This is the final step before model training. It ensures the model is evaluated on data it hasn't seen.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,        # Use 20% of the data for testing
    random_state=42,      # Ensures reproducible results
    stratify=y            # Ensures a balanced distribution of 'target' in both train/test sets
)

print(f"\nTraining set size (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set size (X_test, y_test): {X_test.shape}, {y_test.shape}")

# The data is now ready for model training!

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize the models
log_reg = LogisticRegression(random_state=42)
knn = KNeighborsClassifier() # Default n_neighbors=5
svm = SVC(random_state=42)

# Create a list of models to iterate over
models = {
    "Logistic Regression": log_reg,
    "K-Nearest Neighbors": knn,
    "Support Vector Machine (SVM)": svm
}

results = {}

print("--- 5. Training and Evaluating Machine Learning Models ---")

# Assuming X_train, X_test, y_train, y_test are available from Phase 4
for name, model in models.items():
    print(f"\nTraining {name}...")

    # 1. Train the model
    model.fit(X_train, y_train)

    # 2. Make predictions on the test set
    y_pred = model.predict(X_test)

    # 3. Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)'], output_dict=True)

    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Report": report
    }

    # Print summary
    print(f"✅ {name} Accuracy: {accuracy:.4f}")
    print(f"Classification Report for {name}:")
    # Convert report dictionary to a string for clean printing
    print(classification_report(y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)']))

print("\n--- Model Comparison Complete ---")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# 1. Define the model
tuned_model = LogisticRegression(random_state=42)

# 2. Define the hyperparameter grid to search
# C: Inverse of regularization strength (smaller values specify stronger regularization)
# solver: Algorithm to use in the optimization problem
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2']
}
# Note: 'l1' penalty only works with 'liblinear' solver. GridSearchCV handles this.

# 3. Initialize GridSearchCV
# cv=5 means 5-fold cross-validation
grid_search = GridSearchCV(tuned_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

print("--- 6A. Starting Hyperparameter Tuning (Grid Search) ---")

# 4. Fit the grid search to the training data
# This step trains the model 5 times for every combination of parameters in the grid.
grid_search.fit(X_train, y_train)

# 5. Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_estimator = grid_search.best_estimator_

print(f"\n✅ Best Cross-Validation Accuracy: {best_score:.4f}")
print(f"✅ Best Parameters Found: {best_params}")