In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor



## Survival Rate Prediction

In [2]:
# Load the cleaned dataset.
df = pd.read_csv('clean_new.csv')

# Record the categorical (object/category dtype) columns before anything is dropped;
# the full list is printed for reference.
categorical_columns = df.select_dtypes(include=["object", "category"]).columns.tolist()
print("Categorical Columns:", categorical_columns)

# Columns whose name contains '_at' (e.g. created_at, logged_at_feed_tray) are treated
# as timestamp columns and removed, together with identifier / bookkeeping columns.
date_columns = [col for col in df.columns if '_at' in col.lower()]
additional_columns_to_remove = [
    "id", "start_month", "start_year", "month", "z_score", "pond_id_measurements",
    "id_feed_tray", "id_mortalities", "farm_id", "id_farm", "pond_id",
    "species_id", "record_id", "id_pond", "id_harvests", "cycle_id", "hatchery_id", "logged_date"
]

# De-duplicate the names (a column can appear in both lists).
columns_to_remove = list(set(date_columns + additional_columns_to_remove))

# errors='ignore' skips names that are not present in the frame.
# Rebinding instead of inplace=True keeps the step a plain expression.
df = df.drop(columns=columns_to_remove, errors='ignore')
print("Dropped Columns:", columns_to_remove)

# Keep only the categorical columns that survived the drop.
categorical_columns = [col for col in categorical_columns if col in df.columns]

# Ordinal-encode the remaining categorical columns on a copy of the frame.
df_encoded = df.copy()
label_encoders = {}  # retained for backward compatibility; nothing in this notebook fills it

# Fit ONE encoder over all categorical columns in a single pass. The previous loop
# re-created and re-fitted the encoder once per column on the already-encoded frame,
# which was redundant and left `encoder` fitted on numeric codes rather than raw values.
if categorical_columns:
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    df_encoded[categorical_columns] = encoder.fit_transform(df_encoded[categorical_columns])

print("Encoding completed!")


Categorical Columns: ['province', 'regency', 'timezone', 'created_at_pond', 'updated_at_pond', 'record_id', 'extracted_at_pond', 'started_at', 'finished_at', 'remark', 'created_at_cycle', 'updated_at_cycle', 'extracted_at_cycle', 'subscription_type', 'ordered_at', 'total_seed_type', 'hatchery_name', 'pond_name', 'logged_at', 'logged_date', 'updated_at', 'sampled_at', 'created_at', 'remark_samplings', 'measured_date', 'recorded_at', 'created_at_mortalities', 'updated_at_mortalities', 'logged_at_feed_tray', 'feed_logged_at', 'remark_feed_tray', 'created_at_feed_tray', 'updated_at_feed_tray', 'local_feed_logged_at', 'updated_at_harvests', 'created_at_harvests', 'harvested_at', 'status']
Dropped Columns: ['start_month', 'updated_at_cycle', 'sampled_at', 'pond_id_measurements', 'month', 'updated_at', 'start_year', 'id_farm', 'pond_id', 'updated_at_pond', 'id_pond', 'local_feed_logged_at', 'id', 'feed_logged_at', 'record_id', 'updated_at_mortalities', 'id_harvests', 'cycle_id', 'farm_id', 'c

In [3]:
# Remove exact duplicate rows first, then every row that still has a missing value.
df_select = df_encoded.drop_duplicates().dropna()

# Target vector is the survival rate; the feature matrix is every other column.
y = df_select["survival_rate"]
X = df_select.drop(columns=["survival_rate"])

# Hold out 20% of the rows for testing (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [4]:
# Baseline random forest on all features; it is only used to rank features.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Permutation importance with neg-MAE scoring: the mean increase in MAE when a
# feature's values are shuffled. random_state pins the shuffles so the selected
# feature list is identical on every run (it was unseeded before).
# NOTE(review): importance is measured on the test split, which is later also used to
# score and pick the final model -- consider a separate validation split.
perm_importance = permutation_importance(
    rf,
    X_test,
    y_test,
    scoring="neg_mean_absolute_error",
    n_repeats=10,
    random_state=42,
)
perm_df = pd.DataFrame({"Feature": X_train.columns, "Importance": perm_importance.importances_mean})
perm_df = perm_df.sort_values(by="Importance", ascending=False)

# Keep features whose mean importance exceeds 0.07
selected_perm_features = perm_df[perm_df["Importance"] > 0.07]["Feature"].tolist()
print("Selected Permutation Features:", selected_perm_features)


Selected Permutation Features: ['weight', 'total_seed', 'adg', 'selling_price', 'morning_do']


In [5]:
# Restrict both splits to the features chosen by permutation importance.
X_train_selected = X_train[selected_perm_features]
X_test_selected = X_test[selected_perm_features]

# Fit the scaler on the training split only, then apply the same transform to the
# test split so no test statistics leak into the scaling.
# (StandardScaler is imported in the notebook's top import cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# SVR on the scaled data
svr = SVR(kernel="rbf", C=100, gamma=0.1, epsilon=0.1)
svr.fit(X_train_scaled, y_train)
svr_preds = svr.predict(X_test_scaled)

In [6]:
# Candidate regressors; every one is fitted on the same scaled, feature-selected data.
models = {}
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["XGBoost"] = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
models["LightGBM"] = LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
models["SVR"] = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)

# Test-set metrics per model name.
results = {}

for name in models:
    model = models[name]
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    # RMSE is the square root of the mean squared error.
    results[name] = {
        "MAE": mean_absolute_error(y_test, preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "R²": r2_score(y_test, preds),
    }
    mae, rmse, r2 = (results[name][metric] for metric in ("MAE", "RMSE", "R²"))

    print(f"{name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

Random Forest - MAE: 5.5643, RMSE: 15.6976, R²: -0.1375
XGBoost - MAE: 3.1636, RMSE: 6.0960, R²: 0.8285
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 162
[LightGBM] [Info] Number of data points in the train set: 176, number of used features: 5
[LightGBM] [Info] Start training from score 41.332669
LightGBM - MAE: 40.0737, RMSE: 69.9732, R²: -21.6021
SVR - MAE: 3.5331, RMSE: 7.1269, R²: 0.7655




In [7]:
# The winner is the model with the lowest test RMSE (first one wins on a tie).
best_model_name, best_scores = min(results.items(), key=lambda item: item[1]["RMSE"])
best_model = models[best_model_name]

print(f"\nBest Model: {best_model_name} with RMSE: {best_scores['RMSE']:.4f}")


Best Model: XGBoost with RMSE: 6.0960


In [8]:
# Save the best model. The file name is the lower-cased model name with spaces
# replaced by underscores, followed by the survival-rate suffix.
model_slug = "_".join(best_model_name.lower().split(" "))
model_filename = f"{model_slug}_model_sr.pkl"
joblib.dump(best_model, model_filename)

# Save the scaler as well so it can be reused in the API.
joblib.dump(scaler, "scaler_sr.pkl")

print(f"Model disimpan sebagai {model_filename}")

Model disimpan sebagai xgboost_model_sr.pkl


## Average Body Weight (ABW) Prediction

In [9]:
# For the ABW task the target is average_weight; every other column is a candidate feature.
y_2 = df_select["average_weight"]
X_2 = df_select.drop(columns=["average_weight"])

# Same 80/20 shuffled split and seed as the survival-rate task.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [10]:
# Baseline random forest on all ABW features; it is only used to rank features.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_2, y_train_2)

# Permutation importance with neg-MAE scoring: the mean increase in MAE when a
# feature's values are shuffled. random_state pins the shuffles so the selected
# feature list is identical on every run (it was unseeded before).
# NOTE(review): importance is measured on the test split, which is later also used to
# score and pick the final model -- consider a separate validation split.
perm_importance_2 = permutation_importance(
    rf,
    X_test_2,
    y_test_2,
    scoring="neg_mean_absolute_error",
    n_repeats=10,
    random_state=42,
)
perm_df_2 = pd.DataFrame({"Feature": X_train_2.columns, "Importance": perm_importance_2.importances_mean})
perm_df_2 = perm_df_2.sort_values(by="Importance", ascending=False)

# Keep features whose mean importance exceeds 0.05
selected_perm_features_2 = perm_df_2[perm_df_2["Importance"] > 0.05]["Feature"].tolist()
print("Selected Permutation Features:", selected_perm_features_2)

Selected Permutation Features: ['fcr', 'quantity', 'adg']


In [11]:
# Restrict the ABW splits to the features chosen for the ABW task.
# These must come from X_train_2 / X_test_2: the survival-rate frames (X_train / X_test)
# have a different column set (no survival_rate, but they do contain average_weight).
X_train_selected_2 = X_train_2[selected_perm_features_2]
X_test_selected_2 = X_test_2[selected_perm_features_2]

# Re-fit the shared `scaler` object on the ABW training features, then transform the test split.
# After this cell `scaler` no longer holds the survival-rate scaling, so the survival-rate
# save cell must not be re-run after this point.
X_train_scaled_2 = scaler.fit_transform(X_train_selected_2)
X_test_scaled_2 = scaler.transform(X_test_selected_2)

In [12]:
# Fresh, unfitted candidates for the ABW task (same hyper-parameters as before).
# This rebinds `models`, replacing the survival-rate estimators under that name.
models = dict(
    [
        ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
        ("XGBoost", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)),
        ("LightGBM", LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)),
        ("SVR", SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)),
    ]
)

# Test-set metrics per model name for the ABW target.
results_2 = {}

for name, model in models.items():
    model.fit(X_train_scaled_2, y_train_2)
    preds_2 = model.predict(X_test_scaled_2)

    # MAE, RMSE (square root of MSE) and R² on the held-out split.
    mae_2, rmse_2, r2_2 = (
        mean_absolute_error(y_test_2, preds_2),
        np.sqrt(mean_squared_error(y_test_2, preds_2)),
        r2_score(y_test_2, preds_2),
    )
    results_2[name] = dict(zip(("MAE", "RMSE", "R²"), (mae_2, rmse_2, r2_2)))

    print(f"{name} - MAE: {mae_2:.4f}, RMSE: {rmse_2:.4f}, R²: {r2_2:.4f}")

Random Forest - MAE: 1.7816, RMSE: 3.2006, R²: 0.8126
XGBoost - MAE: 1.6948, RMSE: 2.7959, R²: 0.8570
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142
[LightGBM] [Info] Number of data points in the train set: 176, number of used features: 3
[LightGBM] [Info] Start training from score 13.608295
LightGBM - MAE: 2.7238, RMSE: 4.4273, R²: 0.6414
SVR - MAE: 2.6019, RMSE: 4.0981, R²: 0.6928




In [13]:
# Collect each model's test RMSE, then take the name with the smallest value
# (first one wins on a tie).
rmse_by_model_2 = {name: scores["RMSE"] for name, scores in results_2.items()}
best_model_name_2 = min(rmse_by_model_2, key=rmse_by_model_2.get)
best_model_2 = models[best_model_name_2]

print(f"\nBest Model: {best_model_name_2} with RMSE: {rmse_by_model_2[best_model_name_2]:.4f}")


Best Model: XGBoost with RMSE: 2.7959


In [14]:
# Save the best ABW model. The file name is the lower-cased model name with spaces
# replaced by underscores, followed by the "_model_awb.pkl" suffix.
# NOTE(review): the suffix reads "awb" while the section heading says "ABW" -- left
# unchanged because other code may load these exact file names.
model_slug_2 = best_model_name_2.lower().replace(' ', '_')
model_filename_2 = "{}_model_awb.pkl".format(model_slug_2)
joblib.dump(best_model_2, model_filename_2)

# Save the scaler as well so it can be reused in the API.
joblib.dump(scaler, "scaler_awb.pkl")

print(f"Model disimpan sebagai {model_filename_2}")

Model disimpan sebagai xgboost_model_awb.pkl
