# Normalisasi Data Training dan Testing Metode Min-Max Scaling

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read both splits from disk. The paths are resolved against the current
# working directory; replace them with your own file locations if needed.
data_testing = pd.read_csv('../../data_testing.csv')
data_training = pd.read_csv('../../data_training.csv')

# Peel the 'Outcome' label off each split; every remaining column is a feature.
X_train, y_train = data_training.drop(columns='Outcome'), data_training['Outcome']
X_test, y_test = data_testing.drop(columns='Outcome'), data_testing['Outcome']

# Learn the per-feature min/max from the training split only, then apply those
# same statistics to both splits so the test split never influences the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Glue the label back on as the last column. The scaled frames are rebuilt from
# plain arrays, so the label index is reset to line up with them row-for-row.
data_training_scaled = pd.concat(
    [X_train_scaled, y_train.reset_index(drop=True)],
    axis="columns",
)
data_testing_scaled = pd.concat(
    [X_test_scaled, y_test.reset_index(drop=True)],
    axis="columns",
)

print("Normalization completed. Scaled data stored in 'data_training_scaled' and 'data_testing_scaled'.")


Normalization completed. Scaled data stored in 'data_training_scaled' and 'data_testing_scaled'.


# Tampilkan Data Sebelum dan Sesudah Normalisasi

## Data Training

In [2]:
# Preview the training split before and after min-max scaling.
# Each call prints a heading and the first five rows on separate lines.
print("Original Training Data (First 5 Rows):", data_training.head(), sep="\n")
print("\nNormalized Training Data (First 5 Rows):", data_training_scaled.head(), sep="\n")


Original Training Data (First 5 Rows):
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            2       84              0              0        0   0.0   
1            9      112             82             24        0  28.2   
2            1      139             46             19       83  28.7   
3            0      161             50              0        0  21.9   
4            6      134             80             37      370  46.2   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.304   21        0  
1                     1.282   50        1  
2                     0.654   22        0  
3                     0.254   65        0  
4                     0.238   46        1  

Normalized Training Data (First 5 Rows):
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.117647  0.422111       0.000000       0.000000  0.000000  0.000000   
1     0.529412  0.562814       0.672131       0.380952  0.000000  

## Data Testing

In [3]:
# Preview the testing split before and after min-max scaling.
# Each call prints a heading and the first five rows on separate lines.
print("\nOriginal Testing Data (First 5 Rows):", data_testing.head(), sep="\n")
print("\nNormalized Testing Data (First 5 Rows):", data_testing_scaled.head(), sep="\n")



Original Testing Data (First 5 Rows):
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6       98             58             33      190  34.0   
1            2      112             75             32        0  35.7   
2            2      108             64              0        0  30.8   
3            8      107             80              0        0  24.6   
4            7      136             90              0        0  29.9   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.430   43        0  
1                     0.148   21        0  
2                     0.158   21        0  
3                     0.856   34        0  
4                     0.210   50        0  

Normalized Testing Data (First 5 Rows):
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.352941  0.492462       0.475410       0.523810  0.224586  0.506706   
1     0.117647  0.562814       0.614754       0.507937  0.000000  0

# Latih Model KNN

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Split the scaled frames back into predictors and the 'Outcome' label.
# Note: this rebinds X_train / X_test (and the y_* names) to the scaled data.
X_train = data_training_scaled.drop('Outcome', axis=1)
y_train = data_training_scaled.loc[:, 'Outcome']
X_test = data_testing_scaled.drop('Outcome', axis=1)
y_test = data_testing_scaled.loc[:, 'Outcome']

# Neighbour counts to try
k_values = [1, 3, 5, 7, 9, 17]

# Containers keyed by K: fitted classifiers, and their classification reports
# (knn_reports is only created here; nothing in this cell fills it).
knn_reports = {}
knn_models = {}

# Fit one classifier per K on the same scaled training split and keep it
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn_models[k] = knn.fit(X_train, y_train)  # fit() hands back the estimator itself


# Evaluasi Model

In [5]:
# Score every fitted classifier on the test split, keep its report, and print it
for k, knn in knn_models.items():
    # Predicted labels for the test features
    y_pred = knn.predict(X_test)

    # Build the text report once; bind it locally and store it under its K
    report = knn_reports[k] = classification_report(
        y_test,
        y_pred,
        target_names=["0", "1"],
    )

    # Heading (followed by a blank line), the report, then a 150-character rule
    print(f"Classification Report for K={k}:\n", report, "=" * 150, sep="\n")


Classification Report for K=1:

              precision    recall  f1-score   support

           0       0.73      0.75      0.74        99
           1       0.53      0.51      0.52        55

    accuracy                           0.66       154
   macro avg       0.63      0.63      0.63       154
weighted avg       0.66      0.66      0.66       154

Classification Report for K=3:

              precision    recall  f1-score   support

           0       0.77      0.80      0.79        99
           1       0.62      0.58      0.60        55

    accuracy                           0.72       154
   macro avg       0.69      0.69      0.69       154
weighted avg       0.72      0.72      0.72       154

Classification Report for K=5:

              precision    recall  f1-score   support

           0       0.75      0.77      0.76        99
           1       0.56      0.53      0.54        55

    accuracy                           0.68       154
   macro avg       0.65      0.6