In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report


In [None]:
# Read the dataset from a CSV file; the path is relative to the working directory.
DATA_PATH = "pima_indian_diabetes.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows, then summarise column dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

In [None]:
# Replace 0 with NaN in the listed columns so that later steps treat those
# entries as missing values.
# NOTE(review): presumably a zero in these columns marks a missing
# measurement rather than a real reading - confirm against the dataset docs.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Fill every missing value with the median of its own column.
column_medians = df.median()
df.fillna(value=column_medians, inplace=True)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Drop outlier rows: a row is kept only if every numeric feature lies within
# 3 standard deviations of its column mean.
# The 'Outcome' label is excluded from the z-score computation: it is the
# classification target, not a measurement, so it must not decide which rows
# count as outliers (for a positive class rarer than roughly 10%, every
# positive row would exceed the threshold and be discarded).
feature_values = df.select_dtypes(include=[np.number]).drop(columns=['Outcome'], errors='ignore')
z_scores = np.abs(stats.zscore(feature_values))
df = df[(z_scores < 3).all(axis=1)]

In [None]:
# Summary statistics of the current frame, followed by a count of rows that
# are still exact duplicates of an earlier row.
summary_table = df.describe()
print(summary_table)
remaining_duplicates = df.duplicated().sum()
print("Remaining duplicates:", remaining_duplicates)

In [None]:
# Scalar statistics reported for each column, as (printed label, function of
# the column) pairs. They are evaluated one at a time, in this order, while
# printing.
scalar_statistics = (
    ("  Mean:", lambda s: s.mean()),
    ("  Median:", lambda s: s.median()),
    ("  Mode:", lambda s: s.mode()[0]),  # first entry when several values tie
    ("  Variance:", lambda s: s.var()),
    ("  Standard Deviation:", lambda s: s.std()),
    ("  Skewness:", lambda s: s.skew()),
    ("  Kurtosis:", lambda s: s.kurtosis()),
)

# Every column except the last one is described; the last column is assumed
# to be the categorical 'Outcome' label.
for col in df.columns[:-1]:
    column = df[col]
    print(f"Statistics for {col}:")
    print("  Frequency:\n", column.value_counts())
    for label, compute in scalar_statistics:
        print(label, compute(column))
    print("\n")


In [None]:

# Draw a 15-bin histogram for each column of the frame on one 15x10 figure.
df.hist(figsize=(15, 10), bins=15)
plt.show()


In [None]:
# Simple linear regression setup: 'Glucose' is the single predictor and 'Age'
# is the target. X stays a one-column DataFrame (2-D) and y is a Series.
X = df.loc[:, ['Glucose']]
y = df.loc[:, 'Age']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit ordinary least squares on the training split (fit() returns the
# estimator itself), then predict the target for the held-out rows.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

In [None]:



# Report test-set error (MSE) and explained variance (R^2) for the
# single-predictor linear model.
lin_mse = mean_squared_error(y_test, y_pred)
print("Linear Regression Mean Squared Error:", lin_mse)
lin_r2 = r2_score(y_test, y_pred)
print("Linear Regression R^2 Score:", lin_r2)


In [None]:

# Classification setup: every column except 'Outcome' is a feature and
# 'Outcome' is the label to predict.
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Hold out 20% of the rows for testing with a fixed seed. stratify=y keeps the
# class ratio of 'Outcome' the same in the training and test parts, so the
# accuracy and per-class report are not skewed by an unlucky random split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations, then predict labels for the held-out rows.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)


In [None]:
# Report overall accuracy, then the per-class precision/recall/F1 table for
# the classifier's test-set predictions.
log_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", log_accuracy)
log_report = classification_report(y_test, y_pred)
print("Logistic Regression Classification Report:\n", log_report)

In [None]:
# Multiple linear regression setup: predict 'Age' from the seven listed
# columns ('Outcome' is not used as a predictor).
age_predictors = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]
X = df[age_predictors]
y = df['Age']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit ordinary least squares on the multi-predictor training split (fit()
# returns the estimator itself), then predict the target for the held-out rows.
multi_reg = LinearRegression().fit(X_train, y_train)
y_pred = multi_reg.predict(X_test)

In [None]:

# Report test-set error (MSE) and explained variance (R^2) for the
# multi-predictor linear model.
multi_mse = mean_squared_error(y_test, y_pred)
print("Multiple Regression Mean Squared Error:", multi_mse)
multi_r2 = r2_score(y_test, y_pred)
print("Multiple Regression R^2 Score:", multi_r2)
