In [153]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [154]:
# Baca dataset
df = pd.read_csv('anemia.csv')
df.head(5)

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [155]:
# Cek tipe data
df.dtypes

Gender          int64
Hemoglobin    float64
MCH           float64
MCHC          float64
MCV           float64
Result          int64
dtype: object

Data Cleaning

In [156]:
# Cek nilai duplikat dan nilai kosong
print("There is {} duplicated values in data frame".format(df.duplicated().sum()))
print("Data columns with null value: \n{} ".format(df.isnull().sum()))

There is 887 duplicated values in data frame
Data columns with null value: 
Gender        0
Hemoglobin    0
MCH           0
MCHC          0
MCV           0
Result        0
dtype: int64 


In [157]:
duplicated = df[df.duplicated(keep=False)]
duplicated = duplicated.sort_values(by=['Gender', "Hemoglobin", "MCH",'MCHC','MCV',"Result"], ascending= False) 
duplicated.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
115,1,16.9,24.2,32.1,92.5,0
352,1,16.9,24.2,32.1,92.5,0
589,1,16.9,24.2,32.1,92.5,0
138,1,16.8,24.3,30.5,90.7,0
375,1,16.8,24.3,30.5,90.7,0


In [158]:
# Hapus duplikat
df.drop_duplicates(inplace=True)
print("There is {} duplicated values in data frame".format(df.duplicated().sum()))
print("There is {} missing values in data frame".format(df.isnull().sum().sum()))

There is 0 duplicated values in data frame
There is 0 missing values in data frame


In [159]:
# Pisahkan fitur dan target
X = df.drop('Result', axis=1)
y = df['Result']

In [160]:
# SMOTE untuk mengatasi ketidak samaan data
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)
print(smote)

SMOTE(random_state=42)


In [161]:
# Pisahkan data train dan data test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [162]:
# Scalling numerical 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [163]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Model 1 = Random Forest
model_randomm = RandomForestClassifier(n_estimators=2, class_weight='balanced', random_state=42)
model_randomm.fit(X_train_scaled, y_train)
y_pred_random = model_randomm.predict(X_test_scaled)
random_accuracy = accuracy_score(y_test, y_pred_random)
print("Random Forest Classifier Accuracy:", (random_accuracy))
print("Random Forest Classifier:")
print(classification_report(y_test, y_pred_random))

Random Forest Classifier Accuracy: 0.9826086956521739
Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        62
           1       1.00      0.96      0.98        53

    accuracy                           0.98       115
   macro avg       0.98      0.98      0.98       115
weighted avg       0.98      0.98      0.98       115



In [164]:
from sklearn.tree import DecisionTreeClassifier
# Model 2 = Decision Tree Classifier
model_decision = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model_decision.fit(X_train_scaled, y_train)
y_pred_decision = model_decision.predict(X_test_scaled)
decision_accuracy = accuracy_score(y_test, y_pred_decision)
print("Decisison Tree Accuracy:", (decision_accuracy))
print("Decision Tree Classifier:")
print(classification_report(y_test, y_pred_decision))

Decisison Tree Accuracy: 0.991304347826087
Decision Tree Classifier:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        62
           1       0.98      1.00      0.99        53

    accuracy                           0.99       115
   macro avg       0.99      0.99      0.99       115
weighted avg       0.99      0.99      0.99       115



In [165]:
from sklearn.neighbors import KNeighborsClassifier
# Model 3 = KNN
model_Kn = KNeighborsClassifier()
model_Kn.fit(X_train_scaled, y_train)
y_pred_Kn = model_Kn.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, y_pred_Kn)
print("KNN Accuracy:", (knn_accuracy))
print("KNN:")
print(classification_report(y_test, y_pred_Kn))

KNN Accuracy: 0.9391304347826087
KNN:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94        62
           1       0.90      0.98      0.94        53

    accuracy                           0.94       115
   macro avg       0.94      0.94      0.94       115
weighted avg       0.94      0.94      0.94       115



In [166]:
from sklearn.linear_model import LogisticRegression

# Model 4 = Logistic Regression
model_logistic = LogisticRegression(random_state=42)
model_logistic.fit(X_train_scaled, y_train)
y_pred_logistic = model_logistic.predict(X_test_scaled)
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Accuracy:", logistic_accuracy)
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logistic))

Logistic Regression Accuracy: 0.9565217391304348
Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        62
           1       0.91      1.00      0.95        53

    accuracy                           0.96       115
   macro avg       0.96      0.96      0.96       115
weighted avg       0.96      0.96      0.96       115



In [167]:
from sklearn.svm import SVC
# Model 5 = SVC
model_svc = SVC()
model_svc.fit(X_train_scaled, y_train)
y_pred_svc = model_svc.predict(X_test_scaled)
svc_accuracy = accuracy_score(y_test, y_pred_svc)
print("SVC Accuracy:", (svc_accuracy))
print("SVC:")
print(classification_report(y_test, y_pred_svc))

SVC Accuracy: 0.9478260869565217
SVC:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        62
           1       0.90      1.00      0.95        53

    accuracy                           0.95       115
   macro avg       0.95      0.95      0.95       115
weighted avg       0.95      0.95      0.95       115



In [168]:
# Semua Accuracy
print("Random Forest Accuracy:", (random_accuracy))
print("Decision Tree Accuracy:", decision_accuracy)
print("KNN Accuracy:", knn_accuracy)
print("Gradient Boosting Accuracy:", logistic_accuracy)
print("SVC Accuracy:", svc_accuracy)

Random Forest Accuracy: 0.9826086956521739
Decision Tree Accuracy: 0.991304347826087
KNN Accuracy: 0.9391304347826087
Gradient Boosting Accuracy: 0.9565217391304348
SVC Accuracy: 0.9478260869565217


In [181]:
# Data baru
new_data = {
        'Gender': 'male',  
        'Hemoglobin': 13,
        'MCH': 48.3,
        'MCHC':  30,
        'MCV': 73 

        # Gender	Hemoglobin	MCH	MCHC	MCV
}

# Convert new data ke DataFrame
new_data_df = pd.DataFrame([new_data])

# Mapping untuk Gender 
gender_map = {'male': 1, 'female': 0} 
new_data_df['Gender'] = new_data_df['Gender'].map(gender_map)

# SCalling dta numerik
new_data_scaled = scaler.transform(new_data_df)

# Prediksi
predicsi = model_decision.predict(new_data_scaled)
predict_hasil = predicsi[0]

label_map = {
        0: 'Tidak Anemia',
        1: 'Anemia'  
}
hasil_prediksi = [label_map[predict_hasil]]

print(f"Predict class (Encoded): {predict_hasil}")
print(f"Predict class (Decoded): {hasil_prediksi[0]}")

Predict class (Encoded): 1
Predict class (Decoded): Anemia


In [178]:
# import joblib

# joblib.dump(scaler, 'Scaler.pkl')
# joblib.dump(model_decision, 'Decision_model.pkl')

['Decision_model.pkl']