In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import os

# Make runs reproducible. `set_random_seed` seeds Python's `random` module,
# NumPy and TensorFlow in a single call; seeding only NumPy and TensorFlow
# leaves Python's `random` unseeded, so any component drawing from it would
# still differ between runs.
tf.keras.utils.set_random_seed(42)

In [2]:
# Step 1: load the raw dataset from the notebook's working directory.
print("Langkah 1: Memuat Data...")
try:
    df = pd.read_csv('dataset_investasi_large2.csv')
except FileNotFoundError:
    print("\nERROR: File 'dataset_investasi_large2.csv' tidak ditemukan.")
    print("Pastikan file CSV berada di folder yang sama dengan script ini.")
    # Re-raise rather than calling exit(): the cell (and any "Run All") stops
    # with a normal traceback while the kernel session stays usable.
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(f"Data berhasil dimuat. Jumlah baris awal: {len(df)}")

Langkah 1: Memuat Data...
Data berhasil dimuat. Jumlah baris awal: 79776


In [3]:
# Preview the first rows to sanity-check the column names and value formats.
df.head()

Unnamed: 0,user_id,usia,profil_risiko,pendapatan_bulanan_juta,tingkat_pengetahuan,status_pernikahan,jumlah_tanggungan,tujuan_keuangan,jangka_waktu_thn,target_dana_juta,produk_id,nama_produk,jenis_produk,tingkat_risiko_skor,potensi_return_tahunan_persen,likuiditas,minimum_investasi_rp,rekomendasi
0,user_00001,50,Moderat,11,Menengah,Lajang,0,Dana Beli Rumah,6.3,431,P01,Reksa Dana Pasar Uang Amanah,Reksa Dana Pasar Uang,1,4.5,Sangat Tinggi,10000,1
1,user_00001,50,Moderat,11,Menengah,Lajang,0,Dana Beli Rumah,6.3,431,P02,Tabungan Emas Digital,Emas Digital,2,5.0,Tinggi,10000,1
2,user_00001,50,Moderat,11,Menengah,Lajang,0,Dana Beli Rumah,6.3,431,P03,SBN Ritel ORI025,SBN Ritel,2,6.25,Rendah,1000000,1
3,user_00001,50,Moderat,11,Menengah,Lajang,0,Dana Beli Rumah,6.3,431,P04,Reksa Dana Pendapatan Tetap Stabil,Reksa Dana Pendapatan Tetap,3,7.0,Sedang,100000,1
4,user_00001,50,Moderat,11,Menengah,Lajang,0,Dana Beli Rumah,6.3,431,P05,Reksa Dana Campuran Seimbang,Reksa Dana Campuran,5,10.0,Sedang,100000,0


In [4]:
# Missing-value audit: report the per-column null counts and, if any exist,
# drop the affected rows.
print("[INFO] Mengecek missing values...")
null_counts = df.isnull().sum()
if null_counts.sum() == 0:
    print("-> Tidak ada missing values ditemukan.")
else:
    print(f"Ditemukan missing values. Jumlahnya:\n{null_counts[null_counts > 0]}")
    # Dropping rows is the simple strategy chosen for this example; for real
    # data, consider imputing values instead.
    df.dropna(inplace=True)
    print("-> Baris dengan missing values telah dihapus.")

[INFO] Mengecek missing values...
-> Tidak ada missing values ditemukan.


In [5]:
# Duplicate audit: count rows flagged by `duplicated()` and remove them.
print("\n[INFO] Mengecek data duplikat...")
duplicate_count = df.duplicated().sum()
if duplicate_count == 0:
    print("-> Tidak ada data duplikat ditemukan.")
else:
    print(f"-> Ditemukan {duplicate_count} baris data duplikat. Menghapus...")
    df.drop_duplicates(inplace=True)


[INFO] Mengecek data duplikat...
-> Tidak ada data duplikat ditemukan.


In [6]:
# Row removal during cleaning can leave gaps in the index; renumber it and
# discard the old labels so positions and labels line up again.
df = df.reset_index(drop=True)
print(f"\nData Cleaning Selesai. Jumlah baris setelah cleaning: {len(df)}")


Data Cleaning Selesai. Jumlah baris setelah cleaning: 79776


In [7]:
# Step 2: separate the label column from the candidate feature columns.
print("\nLangkah 2: Pra-pemrosesan Data...")

target_column = 'rekomendasi'
y = df[target_column]
X = df.drop(columns=[target_column])


Langkah 2: Pra-pemrosesan Data...


In [8]:
# Identifier-style columns that should not be used as training features.
features_to_drop = ['user_id', 'produk_id', 'nama_produk']

# The frame preview shows a leading `Unnamed: 0` header. If that is a real
# column (a row index written into the CSV), it would be picked up as a numeric
# feature, and the inference example at the end of the notebook, which does
# not supply it, would be rejected by the fitted preprocessor. Drop any such
# column; this is a no-op when none is present.
index_artifact_columns = [col for col in X.columns if str(col).startswith('Unnamed:')]

X = X.drop(columns=features_to_drop + index_artifact_columns)

In [9]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing later.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include='object').columns)

# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same class balance.
# NOTE(review): the split is row-wise, and the data preview shows several rows
# per user_id, so one user's rows can land in both splits. Confirm whether a
# group-aware split is needed for the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Preprocessing definition: scale the numeric columns and one-hot encode the
# categorical ones; any column in neither list is passed through unchanged.
numeric_transformer = StandardScaler()
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    # The one-hot block is sparse by default, and ColumnTransformer decides
    # between a sparse and a dense result from the overall data density.
    # A threshold of 0 always yields a dense array, so the type handed to the
    # network does not depend on the data.
    sparse_threshold=0,
)

In [11]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Persist the fitted preprocessor so inference can reuse the same transformation.
preprocessor_path = 'investment_preprocessor.joblib'
joblib.dump(preprocessor, preprocessor_path)
print(f"Preprocessor berhasil disimpan sebagai '{preprocessor_path}'")

Preprocessor berhasil disimpan sebagai 'investment_preprocessor.joblib'


In [12]:
# Step 3: assemble and compile a feed-forward binary classifier.
print("\nLangkah 3: Membangun Model TensorFlow...")

# Width of the preprocessed feature matrix = size of the network's input.
input_shape = X_train_processed.shape[1]

model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(input_shape,)))
# Two hidden blocks, each a ReLU dense layer followed by 30% dropout.
for units in (128, 64):
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(32, activation='relu'))
# Single sigmoid unit: the output is a score in [0, 1] for the positive class.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


Langkah 3: Membangun Model TensorFlow...


In [None]:
# Step 4: train with early stopping on the validation loss.
print("\nLangkah 4: Pelatihan Model...")

# Stop once val_loss has not improved for 10 epochs and roll the model back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

fit_options = dict(
    epochs=100,
    batch_size=64,
    validation_split=0.2,  # fraction of the training data used for validation
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train_processed, y_train, **fit_options)


Langkah 4: Pelatihan Model...
Epoch 1/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8489 - loss: 0.3463 - val_accuracy: 0.9591 - val_loss: 0.1304
Epoch 2/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9577 - loss: 0.1403 - val_accuracy: 0.9607 - val_loss: 0.1286
Epoch 3/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9595 - loss: 0.1344 - val_accuracy: 0.9610 - val_loss: 0.1274
Epoch 4/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9604 - loss: 0.1318 - val_accuracy: 0.9621 - val_loss: 0.1273
Epoch 5/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9608 - loss: 0.1310 - val_accuracy: 0.9616 - val_loss: 0.1273
Epoch 6/100
[1m798/798[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9604 - loss: 0.1309 - val_accuracy: 0.9620 - val_loss

In [None]:
# Step 5: evaluation of the trained model begins here.
print("\nLangkah 5: Evaluasi Model...")
def plot_history(history):
    """Draw training and validation curves for loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute maps the keys ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` to per-epoch values.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    _, axes = plt.subplots(1, 2, figsize=(16, 6))

    # One entry per panel: (metric key, curve name, title, y label, legend spot).
    panels = (
        ('loss', 'Loss', 'Grafik Loss Model', 'Loss', 'upper right'),
        ('accuracy', 'Accuracy', 'Grafik Akurasi Model', 'Akurasi', 'lower right'),
    )

    for ax, (metric, curve_name, title, y_label, legend_loc) in zip(axes, panels):
        ax.plot(history.history[metric], label=f'Training {curve_name}')
        ax.plot(history.history[f'val_{metric}'], label=f'Validation {curve_name}')
        ax.set_title(title)
        ax.set_ylabel(y_label)
        ax.set_xlabel('Epoch')
        ax.legend(loc=legend_loc)

    plt.show()

# Render the learning curves recorded by the training run.
plot_history(history)


In [None]:
# Score the held-out test split; the result unpacks into the loss and the
# accuracy metric the model was compiled with.
test_scores = model.evaluate(X_test_processed, y_test, verbose=0)
loss, accuracy = test_scores

print(f"\nAkurasi pada data test: {accuracy:.4f}")
print(f"Loss pada data test: {loss:.4f}")

In [None]:
# Predicted scores for the test split, turned into hard 0/1 labels by
# thresholding at 0.5.
y_pred_proba = model.predict(X_test_processed)
y_pred = (y_pred_proba > 0.5).astype("int32")

# Display names for class 0 and class 1, in that order.
class_names = ['Tidak Direkomendasikan', 'Direkomendasikan']
report = classification_report(y_test, y_pred, target_names=class_names)

print("\nLaporan Klasifikasi:")
print(report)

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

tick_labels = ['Tidak', 'Ya']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_xlabel('Prediksi Rekomendasi')
ax.set_ylabel('Rekomendasi Sebenarnya')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Step 6: persist the trained model in three formats.
print("\nLangkah 6: Menyimpan Model dalam 3 Format...")

# Single-file formats: the native .keras archive and the legacy HDF5 .h5 file.
for model_file in (
    'investment_recommendation_model.keras',
    'investment_recommendation_model.h5',
):
    model.save(model_file)
    print(f"-> Model berhasil disimpan sebagai '{model_file}'")

# Directory-based SavedModel for deployment, written with model.export().
export_dir = 'investment_recommendation_saved_model'
model.export(export_dir)
print(f"-> Model berhasil disimpan sebagai direktori '{export_dir}'")

In [None]:
# Step 7: end-to-end inference example on one hand-written record.
print("\nLangkah 7: Contoh Inferensi pada Data Baru...")

# Reload the persisted artifacts (the .keras file is used as the example).
# joblib files are pickle-based, so only load ones produced by a trusted source.
loaded_preprocessor = joblib.load('investment_preprocessor.joblib')
loaded_model = tf.keras.models.load_model('investment_recommendation_model.keras')
print("Preprocessor dan Model (.keras) berhasil dimuat untuk inferensi.")

# One user/product pairing described with the raw (unprocessed) feature columns.
new_record = {
    'usia': 30,
    'profil_risiko': 'Agresif',
    'pendapatan_bulanan_juta': 25,
    'tingkat_pengetahuan': 'Menengah',
    'status_pernikahan': 'Menikah',
    'jumlah_tanggungan': 1,
    'tujuan_keuangan': 'Dana Pensiun',
    'jangka_waktu_thn': 25.0,
    'target_dana_juta': 1500,
    'jenis_produk': 'Reksa Dana Indeks Saham',
    'tingkat_risiko_skor': 8,
    'potensi_return_tahunan_persen': 18.0,
    'likuiditas': 'Sedang',
    'minimum_investasi_rp': 100000,
}
new_data = pd.DataFrame([new_record])

# Apply the fitted preprocessing, then score and threshold at 0.5.
new_data_processed = loaded_preprocessor.transform(new_data)
predictions_proba = loaded_model.predict(new_data_processed)
predictions = (predictions_proba > 0.5).astype("int32")

# Report one result block per input row.
print("\n--- Hasil Rekomendasi ---")
for product_name, proba_row, label_row in zip(
    new_data['jenis_produk'], predictions_proba, predictions
):
    probability = proba_row[0]
    if label_row[0] == 1:
        result = "COCOK"
    else:
        result = "TIDAK COCOK"

    print(f"Produk: {product_name}")
    print(f"  -> Probabilitas Kecocokan: {probability:.2%}")
    print(f"  -> Rekomendasi: **{result}**\n")

In [None]:
!pip freeze > requirements.txt