<a href="https://colab.research.google.com/github/zainoor/M8-DataMining/blob/main/TugasM8_KDM_Ramadhan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report, confusion_matrix
import time

In [None]:
# ✅ Load dataset (replace with your actual path or method)
df = pd.read_csv('/content/fraudTrain.csv')  # Update path as needed

In [None]:
# ✅ Drop irrelevant or high-cardinality columns
cols_to_drop = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street',
                'city', 'state', 'job', 'dob', 'trans_num', 'unix_time']
df = df.drop(columns=cols_to_drop)

In [None]:
# ✅ Drop rows with missing values (especially in target)
df = df.dropna(subset=['is_fraud'])
df = df.dropna()  # Optional: drop rows with any missing data

# ✅ Convert categorical columns to dummy variables
df = pd.get_dummies(df, drop_first=True)

In [None]:
# ✅ Sample the data to reduce memory (keep all fraud rows)
df_fraud = df[df['is_fraud'] == 1]
df_nonfraud = df[df['is_fraud'] == 0].sample(n=50000, random_state=42)  # balance
df_sampled = pd.concat([df_fraud, df_nonfraud]).sample(frac=1, random_state=42)

In [None]:
# ✅ Split into X/y
X = df_sampled.drop('is_fraud', axis=1)
y = df_sampled['is_fraud']

# ✅ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Free memory
del df, df_fraud, df_nonfraud
gc.collect()

64

In [None]:
# Free memory
del df_sampled
gc.collect()

12

In [None]:
# ✅ Train Decision Tree with class_weight to handle imbalance
start_dt = time.time()

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=6, class_weight='balanced', random_state=42)
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)

end_dt = time.time()
waktu_dt = end_dt - start_dt


In [None]:
# ✅ Evaluation metrics
print("=== EVALUASI ===")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
# ✅ Output prediction results

print("\n=== PREDIKSI - Decision Tree ===")
print("Contoh 20 Prediksi:", y_pred_dt[:20])
print(f"\nWaktu Eksekusi Decision Tree: {waktu_dt:.4f} detik")

=== EVALUASI ===
[[9224  762]
 [  15   99]]
              precision    recall  f1-score   support

         0.0       1.00      0.92      0.96      9986
         1.0       0.11      0.87      0.20       114

    accuracy                           0.92     10100
   macro avg       0.56      0.90      0.58     10100
weighted avg       0.99      0.92      0.95     10100


=== PREDIKSI - Decision Tree ===
Contoh 20 Prediksi: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]

Waktu Eksekusi Decision Tree: 33.5228 detik


In [None]:
# ✅ Human-readable rules from decision tree
print("=== RULES - Decision Tree ===")
tree_rules = export_text(clf_dt, feature_names=list(X.columns))
print(tree_rules)

=== RULES - Decision Tree ===
|--- amt <= 259.69
|   |--- amt <= 24.19
|   |   |--- category_gas_transport <= 0.50
|   |   |   |--- amt <= 6.01
|   |   |   |   |--- Unnamed: 0_4693 <= 0.50
|   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- Unnamed: 0_4693 >  0.50
|   |   |   |   |   |--- class: 1.0
|   |   |   |--- amt >  6.01
|   |   |   |   |--- amt <= 15.05
|   |   |   |   |   |--- category_misc_pos <= 0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- category_misc_pos >  0.50
|   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |--- amt >  15.05
|   |   |   |   |   |--- category_home <= 0.50
|   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |--- category_home >  0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |--- category_gas_transport >  0.50
|   |   |   |--- Unnamed: 0_42499 <= 0.50
|   |   |   |   |--- class: 1.0
|   |   |   |--- Unnamed: 0_42499 >  0.50
|   |   |   |   |--- class: 0.0
|   |--- amt >  24.19
|   |   |--- category_food_

In [None]:
# ✅ Train Decision Tree with class_weight to handle imbalance
start_dt = time.time()

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=6, class_weight='balanced', random_state=42)
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)

end_dt = time.time()
waktu_dt = end_dt - start_dt


In [None]:
# ✅ Evaluation metrics
print("=== EVALUASI - Decision Tree ===")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# ✅ Output prediction results
print("\n=== PREDIKSI - Decision Tree ===")
print("Contoh 20 Prediksi:", y_pred_dt[:20])
print(f"\nWaktu Eksekusi Decision Tree: {waktu_dt:.4f} detik")

In [None]:
# # ✅ Custom IF...THEN... rules in the requested format
# def print_tree_rules(tree, feature_names):
#     tree_ = tree.tree_
#     feature_name = [
#         feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
#         for i in tree_.feature
#     ]

#     def recurse(node, depth, conditions):
#         if tree_.feature[node] != _tree.TREE_UNDEFINED:
#             name = feature_name[node]
#             threshold = tree_.threshold[node]
#             # Left branch (less than or equal to)
#             left_conditions = conditions + [f"{name} <= {threshold:.2f}"]
#             recurse(tree_.children_left[node], depth + 1, left_conditions)
#             # Right branch (greater than)
#             right_conditions = conditions + [f"{name} > {threshold:.2f}"]
#             recurse(tree_.children_right[node], depth + 1, right_conditions)
#         else:
#             classes = tree.classes_
#             pred = np.argmax(tree_.value[node])
#             # Construct the rule in "IF...THEN..." format
#             if conditions:
#                 rule = "IF " + " AND ".join(conditions) + f" THEN is_fraud = {classes[pred]}"
#                 print(rule)

#     # Start recursion from the root node (node 0) with empty conditions
#     recurse(0, 1, [])

# print("\n=== IF...THEN... RULES ===")
# print_tree_rules(clf_dt, list(X.columns))

In [None]:
# ✅ Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, _tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
import time
import gc

In [None]:
# ✅ Load dataset and optimize memory usage
df = pd.read_csv('/content/fraudTrain.csv')

# ✅ Drop irrelevant/high-cardinality columns immediately
cols_to_drop = ['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street',
                'city', 'state', 'job', 'dob', 'trans_num', 'unix_time']
df = df.drop(columns=cols_to_drop)

In [None]:
# ✅ Optimize data types to reduce memory usage
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype('float32')
for col in df.select_dtypes(include=['int64']).columns:
    df[col] = df[col].astype('int32')

In [None]:
# ✅ Drop rows with missing values
df = df.dropna(subset=['is_fraud'])
df = df.dropna()

# ✅ Select only relevant categorical columns for dummy variables
categorical_cols = ['category', 'gender']  # Limit to most relevant columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# ✅ Sample data aggressively to reduce memory (keep all fraud rows)
df_fraud = df[df['is_fraud'] == 1]
df_nonfraud = df[df['is_fraud'] == 0].sample(n=50000, random_state=42)  # Reduced from 50,000
df_sampled = pd.concat([df_fraud, df_nonfraud]).sample(frac=1, random_state=42)

In [None]:
# Free memory
del df, df_fraud, df_nonfraud
gc.collect()

8

In [None]:
# ✅ Split into X/y
X = df_sampled.drop('is_fraud', axis=1)
y = df_sampled['is_fraud']

# ✅ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Free memory
del df_sampled
gc.collect()

0

In [None]:
# ✅ Train Decision Tree with class_weight to handle imbalance
start_dt = time.time()

clf_dt = DecisionTreeClassifier(criterion='entropy', max_depth=6, class_weight='balanced', random_state=42)
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)

end_dt = time.time()
waktu_dt = end_dt - start_dt


In [None]:
# ✅ Human-readable rules from decision tree
print("\n=== RULES - Decision Tree ===")
tree_rules = export_text(clf_dt, feature_names=list(X.columns))
print(tree_rules)


=== RULES - Decision Tree ===
|--- amt <= 259.69
|   |--- amt <= 24.19
|   |   |--- category_gas_transport <= 0.50
|   |   |   |--- amt <= 6.01
|   |   |   |   |--- Unnamed: 0 <= 4695.00
|   |   |   |   |   |--- Unnamed: 0 <= 4692.00
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- Unnamed: 0 >  4692.00
|   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |--- Unnamed: 0 >  4695.00
|   |   |   |   |   |--- class: 0.0
|   |   |   |--- amt >  6.01
|   |   |   |   |--- Unnamed: 0 <= 15503.50
|   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- Unnamed: 0 >  15503.50
|   |   |   |   |   |--- amt <= 15.05
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- amt >  15.05
|   |   |   |   |   |   |--- class: 1.0
|   |   |--- category_gas_transport >  0.50
|   |   |   |--- amt <= 17.91
|   |   |   |   |--- class: 1.0
|   |   |   |--- amt >  17.91
|   |   |   |   |--- amt <= 18.78
|   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- amt >  18.78
|   |   | 

In [None]:
# ✅ Evaluation metrics
print("\n=== EVALUASI ===")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

# ✅ Output prediction results
print("=== PREDIKSI - Decision Tree ===")
print("Contoh 20 Prediksi:", y_pred_dt[:20])
print(f"\nWaktu Eksekusi Decision Tree: {waktu_dt:.4f} detik")


=== EVALUASI ===
[[9369  617]
 [  17   97]]
              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      9986
         1.0       0.14      0.85      0.23       114

    accuracy                           0.94     10100
   macro avg       0.57      0.89      0.60     10100
weighted avg       0.99      0.94      0.96     10100

=== PREDIKSI - Decision Tree ===
Contoh 20 Prediksi: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]

Waktu Eksekusi Decision Tree: 0.2927 detik


In [None]:
# ✅ Train Naïve Bayes
start_nb = time.time()

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

end_nb = time.time()
waktu_nb = end_nb - start_nb

In [None]:
# ✅ Output prediction
print("\n=== PREDIKSI - Naïve Bayes ===")
print("Contoh 20 Prediksi:", y_pred_nb[:20])
print(f"\nWaktu Eksekusi Naïve Bayes: {waktu_nb:.4f} detik")


=== PREDIKSI - Naïve Bayes ===
Contoh 20 Prediksi: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Waktu Eksekusi Naïve Bayes: 0.0352 detik


In [None]:
# ✅ Evaluation
print("\n=== EVALUASI - Naïve Bayes ===")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


=== EVALUASI - Naïve Bayes ===
[[9829  157]
 [  53   61]]
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      9986
         1.0       0.28      0.54      0.37       114

    accuracy                           0.98     10100
   macro avg       0.64      0.76      0.68     10100
weighted avg       0.99      0.98      0.98     10100

