In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Step 1: fetch the CSV with the gdown command-line tool and load it.
# The leading "!" is an IPython/Jupyter shell escape, so the next line only
# works inside a notebook, not in a plain Python script.
# NOTE(review): the file ID presumably resolves to 'diabetest.csv', the name
# read below -- confirm. Newer gdown releases reportedly deprecate the --id
# flag; verify against the installed version.
!gdown --id 1Db3cWDkOvKiWQGqwIIBM5fRvkG0PegZ6
df = pd.read_csv('diabetest.csv')

# Step 2: explore the data -- (rows, columns) of the table.
# Bare expression: a notebook can display it, a plain script prints nothing.
df.shape

# Step 2 (continued): summary statistics of the columns.
df.describe()


# Step 3: clean the dataset.
# Missing values: number of null entries in each column.
df.isnull().sum()


# Step 3 (continued): outliers.
# Box plot of the columns on one wide figure to spot outliers visually.
plt.figure(figsize=(14,5))
df.boxplot()

# Summary statistics of the 'Insulin' column before any outlier handling;
# note that describe() does not count outliers itself.
print(df['Insulin'].describe())

def Discover_outliers(df, column, factor=1.5):
    """Replace IQR outliers in one column with that column's median.

    A value is treated as an outlier when it lies below
    ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    The median is computed from the column as passed in, i.e. with the
    outliers still included.

    Args:
        df: DataFrame holding the data. It is modified in place.
        column: Label of the numeric column to clean.
        factor: Width of the fences in IQR units. Defaults to 1.5, the value
            that used to be hard-coded.

    Returns:
        The same ``df`` object, so ``df = Discover_outliers(df, col)`` and a
        bare ``Discover_outliers(df, col)`` both work.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Comparisons with NaN are False, so missing values are never flagged.
    is_outlier = (values < lower_bound) | (values > upper_bound)

    # Build the cleaned column and assign it as a whole rather than writing
    # the median into selected rows with .loc:
    #  * a fractional median (e.g. 30.5) cannot be stored in an integer
    #    column; the row-wise write is an incompatible-dtype setitem that
    #    recent pandas versions deprecate, whereas mask() simply upcasts;
    #  * a boolean mask touches exactly the flagged rows, even when the index
    #    contains duplicate labels.
    df[column] = values.mask(is_outlier, values.median())
    return df

# Run the median replacement on every column except the "Outcome" target,
# whose values must stay untouched.
for column in df.columns:
    if column == "Outcome":
        continue
    df = Discover_outliers(df, column)



# Step 3 (continued): outliers after cleaning.
# Redraw the box plot so it can be compared with the one drawn before.
plt.figure(figsize=(14, 5))
df.boxplot()

# Summary statistics of 'Insulin' after the replacement.
print(df["Insulin"].describe())


# Separate the target ("Outcome") from the feature columns.
y = df["Outcome"]
X = df.drop(columns="Outcome")


# Split the data:
#   80% training
#   20% testing
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Sanity check: shapes of the resulting splits.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


# Apply scaling. The scaler learns its statistics from the training split
# only; the same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Check the scaling: per-feature mean and standard deviation of the scaled
# training data (should be close to 0 and 1 respectively). These are bare
# expressions, so only a notebook displays them.
X_train_scaled.mean(axis=0)
X_train_scaled.std(axis=0)


# Check the per-feature mean once more.
X_train_scaled.mean(axis=0)


# Import the model class.
from sklearn.linear_model import LogisticRegression


# Create the model, allowing the solver up to 1000 iterations.
log_model = LogisticRegression(max_iter=1000)


# Train the model on the scaled training split.
log_model.fit(X_train_scaled, y_train)


# Predict class labels for the scaled test split.
y_pred = log_model.predict(X_test_scaled)


# Evaluate the predictions against the held-out labels.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


# Confusion matrix drawn as an annotated heat map.
# (matplotlib.pyplot is already imported as plt at the top of the file.)
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(5, 4))
# heatmap() draws on the current axes and returns them; fmt='d' formats the
# cell annotations as integers.
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix")
plt.show()


# ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Keep column 1 of the predicted probabilities.
# NOTE(review): presumably the probability of the positive class (label 1);
# confirm against log_model.classes_.
y_proba = log_model.predict_proba(X_test_scaled)[:, 1]

# The third return value (the thresholds) is not needed here.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Draw on the current axes, exactly where the pyplot shortcuts would draw.
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal as a visual reference line.
roc_ax.plot([0, 1], [0, 1], linestyle='--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()


# Precision-Recall curve
from sklearn.metrics import precision_recall_curve

# The third return value (the thresholds) is not needed here.
precision, recall, _ = precision_recall_curve(y_test, y_proba)

# Recall on the x-axis, precision on the y-axis, drawn on the current axes.
pr_ax = plt.gca()
pr_ax.plot(recall, precision)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
plt.show()


# Feature importance
import pandas as pd
import matplotlib.pyplot as plt

# Feature names (the columns of X).
features = X.columns

# Model coefficients (weights): first row of coef_.
# NOTE(review): assumes a binary problem where coef_ has a single row.
importance = log_model.coef_[0]

# Pair each feature with its coefficient and sort ascending by the signed
# value, so the most negative coefficient comes first.
feature_importance = pd.DataFrame(
    {"Feature": features, "Importance": importance}
).sort_values("Importance")

# Horizontal bar chart of the sorted coefficients.
plt.figure(figsize=(6, 4))
bar_ax = plt.gca()
bar_ax.barh(feature_importance["Feature"], feature_importance["Importance"])
bar_ax.set_xlabel("Importance")
bar_ax.set_ylabel("Feature")
bar_ax.set_title("Feature Importance (Logistic Regression)")
plt.show()
