In [1]:
# https://drive.google.com/file/d/1vViA8Zi--LANScogg_Ezu7BHli3NQfzV/view?usp=sharing
!gdown '1vViA8Zi--LANScogg_Ezu7BHli3NQfzV'
!mkdir dataset
!unzip -q '/content/archive (14).zip' -d '/content/dataset'

Downloading...
From: https://drive.google.com/uc?id=1vViA8Zi--LANScogg_Ezu7BHli3NQfzV
To: /content/archive (14).zip
  0% 0.00/9.13k [00:00<?, ?B/s]100% 9.13k/9.13k [00:00<00:00, 7.59MB/s]


In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.metrics import accuracy_score

# Load dữ liệu
file_path = "/content/dataset/diabetes.csv"
df = pd.read_csv(file_path)

# Xác định các cột cần chuyển 0 thành NaN
zero_to_null_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
df[zero_to_null_cols] = df[zero_to_null_cols].replace(0, np.nan)

# Xử lý dữ liệu thiếu bằng mean/mode
imputer = SimpleImputer(strategy='mean')
df[zero_to_null_cols] = imputer.fit_transform(df[zero_to_null_cols])

# Xử lý outlier bằng Z-score
zscore_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
z_scores = np.abs(stats.zscore(df[zscore_cols]))
df[zscore_cols] = np.where(z_scores > 3, df[zscore_cols].median(), df[zscore_cols])

# Xử lý outlier bằng IQR
iqr_cols = ['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']
Q1 = df[iqr_cols].quantile(0.25)
Q3 = df[iqr_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df[iqr_cols] = df[iqr_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)

# Chuẩn hóa dữ liệu Min/Max
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

# Cân bằng dữ liệu bằng OverSampling (SMOTE)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Chia dữ liệu train/test (85% train, 15% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.15, random_state=42)

# Huấn luyện MLP model
mlp = MLPClassifier(hidden_layer_sizes=(32, 32, 16, 16, 8, 8, 4), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Dự đoán và đánh giá độ chính xác
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred) * 100
test_accuracy = accuracy_score(y_test, y_test_pred) * 100

train_accuracy, test_accuracy


(89.52941176470588, 78.0)