# Pima Indians Diabetes Dataset - EDA and k-NN Classification

This notebook performs an exploratory data analysis (EDA) of the Pima Indians Diabetes dataset and then trains a k-Nearest Neighbors (k-NN) classifier on it. The goals are to investigate data quality, understand the relationships between features, and build a predictive model, using cross-validation to select the best value of k.

In [None]:
# Import essential libraries for data analysis, visualization, and modeling
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the Pima Indians Diabetes CSV (path is relative to the notebook's
# working directory) into a DataFrame, then preview its first five rows.
data_path = '../data/Pima.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

In [None]:
# Inspect the dataset structure: column names, non-null counts, dtypes and
# memory usage. A complete non-null count does not by itself rule out missing
# values that are encoded as placeholder numbers (e.g. 0) -- TODO confirm
# against the summary statistics and the data dictionary.
df.info()

In [None]:
# View summary statistics (count, mean, std, min, quartiles, max) for the
# numerical features. Useful for spotting implausible minima/maxima and for
# seeing how much the feature scales differ from one another.
df.describe()

In [None]:
# Visualize pairwise relationships between features, colored by outcome.
# 'Outcome' is used only for the hue, so it is not drawn as a plotted variable.
sns.pairplot(df, hue='Outcome')
# End with plt.show() so the cell renders just the figure instead of also
# echoing the PairGrid object's repr as the cell's return value.
plt.show()

In [None]:
# Display the correlation heatmap to understand relationships between features.
# The color scale is pinned to the full correlation range [-1, 1] and centered
# on 0 so that the neutral color of the diverging 'coolwarm' map always means
# "no linear correlation"; left to its defaults, the scale is fitted to the
# observed values and the neutral color drifts away from zero.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Separate the label column (y) from the predictor columns (X).
# Neither statement modifies df: drop() returns a new frame.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions of the full dataset in both
# partitions, so the held-out accuracy is not distorted by an unlucky draw
# that over- or under-represents one outcome. random_state fixes the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Perform cross-validation to find the optimal number of neighbors (k)
k_range = range(1, 16)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy').mean()
    scores.append(score)

In [None]:
# Plot cross-validated accuracy scores for different values of k
plt.plot(k_range, scores)
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Cross-Validated Accuracy')
plt.title('KNN Hyperparameter Tuning')
plt.show()

In [None]:
# Determine the value of k with the highest cross-validated accuracy
best_k = k_range[np.argmax(scores)]
best_k

In [None]:
# Train the final k-NN model using the selected optimal k
final_model = KNeighborsClassifier(n_neighbors=best_k)
final_model.fit(X_train, y_train)

In [None]:
# Evaluate the final model on the test set.
# score() reports mean accuracy on (X_test, y_test); this partition was not
# used when choosing k (cross-validation ran on X_train only), so the value is
# an estimate of performance on unseen data.
test_score = final_model.score(X_test, y_test)
test_score