# Modelling (Perbandingan Metode & Learning Curve)

In [None]:
import json

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import cross_validate

In [None]:
# Load the combined, stage-2 preprocessed dataset (3 professions); the path is relative to the notebook.
data_oke = pd.read_csv("Data Gabungan Hasil Preprocessing Tahap 2 (3 Profesi).csv")
# Show the last rows as a quick check that the file was read.
data_oke.tail()

# K-Fold Cross Validation

In [None]:
# With k-fold cross validation there is no separate hold-out split:
# all rows are used. `median_gaji` is the target, every other column is a feature.
y = data_oke["median_gaji"]
X = data_oke.drop(columns = "median_gaji")

In [None]:
# Number of target values (one per row of `data_oke`).
len(y)

# Modelling

### A. Pakai Data Asli (Tanpa Resampling) -> Untuk Perbandingan 3 Metode

### 1. Decision Tree

In [None]:
# A cell only renders its last expression, so a bare `len(X)` line above
# `len(y)` is never shown. Display both lengths together instead.
len(X), len(y)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Baseline: decision tree with default hyper-parameters, evaluated with 5-fold CV.
# No `scoring` is passed, so the estimator's own default score is used
# (R2 for scikit-learn regressors).
tree = DecisionTreeRegressor(random_state = 0)
r2_tree = cross_val_score(tree, X, y, cv = 5)

In [None]:
# Mean of the 5 fold scores of the untuned tree.
r2_tree.mean() # value noted from an earlier run: -0.23937415266659418

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search for the decision tree (5-fold CV, default scorer).
tree_x = DecisionTreeRegressor(random_state = 0)
param_grid = {
    # max_depth must be >= 1: a depth of 0 is rejected by scikit-learn and only
    # produced failed fits, so the range starts at 1.
    "max_depth": list(range(1, 11)),
    # Same criterion names as the random-forest grid further down; "mse"/"mae"
    # are the legacy aliases that newer scikit-learn releases no longer accept.
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    # None means "use all features", which is what the removed "auto" option
    # meant for regressors.
    "max_features": [None, "sqrt", "log2"],
    # An integer min_samples_split must be >= 2; 0 and 1 are invalid.
    "min_samples_split": list(range(2, 6)),
}

grid_search_x = GridSearchCV(tree_x, param_grid, n_jobs = 2, verbose = 1, cv = 5)

grid_search_x.fit(X, y)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search_x.best_params_

In [None]:
# This is the test score (R2)
grid_search_x.best_score_ # Mean cross-validated score of the best_estimator

In [None]:
# Build a model from the values reported by "best_params_".
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# search run. Re-check them whenever the grid search is re-run.
tree_x1 = DecisionTreeRegressor(random_state = 0,
                                # 'absolute_error' is the current name of the legacy 'mae'
                                # criterion and matches the spelling used for the random forest.
                                criterion = 'absolute_error',
                                max_depth = 2,
                                max_features = 'sqrt',
                                min_samples_split = 2)

In [None]:
from numpy import absolute
from numpy import mean
from sklearn.model_selection import cross_val_score

# 5-fold cross validation of the tuned tree on three metrics.
# The "neg_*" scorers return negated errors, hence the absolute value before averaging.
r2_tree_x1 = mean(cross_val_score(tree_x1, X, y, cv = 5, scoring = 'r2'))

nrmse_tree_x1 = cross_val_score(tree_x1, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')
rmse_tree_x1 = mean(absolute(nrmse_tree_x1))

nmape_tree_x1 = cross_val_score(tree_x1, X, y, cv = 5, scoring = 'neg_mean_absolute_percentage_error')
mape_tree_x1 = mean(absolute(nmape_tree_x1))

In [None]:
# Report the mean cross-validated metrics of the tuned decision tree.
print("R2 TREE:", r2_tree_x1)
print("RMSE TREE:", rmse_tree_x1)
print("MAPE TREE:", mape_tree_x1)

### 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline: random forest with default hyper-parameters, evaluated with 5-fold CV
# using the estimator's default score (R2 for scikit-learn regressors).
random_forest = RandomForestRegressor(random_state = 0)
r2_rf = cross_val_score(random_forest, X, y, cv = 5)

In [None]:
# Mean of the 5 fold scores of the untuned random forest.
r2_rf.mean()

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized hyper-parameter search for the random forest (5-fold CV, default scorer).
random_forest_x = RandomForestRegressor(random_state = 0)
param_grid = {"n_estimators": list(range(100, 131)),
              "criterion": ["squared_error", "absolute_error"],
              # max_depth must be >= 1. A depth of 0 is rejected by scikit-learn, so every
              # sampled candidate with max_depth = 0 was a wasted, failed fit.
              "max_depth": list(range(1, 11)),
              # None means "use all features", which is what the removed "auto" option
              # meant for regressors.
              "max_features": [None, "sqrt", "log2"],
              "min_samples_split": list(range(2, 16))}

# Exhaustive alternative, kept for reference:
# grid_search_x1 = GridSearchCV(random_forest_x, param_grid, n_jobs = 2, verbose = 1, cv = 5)
# grid_search_x1.fit(X, y)

# The full grid has 31 * 2 * 10 * 3 * 14 = 26,040 combinations; with n_iter = 1000
# only 1000 randomly drawn candidates are evaluated.
# The candidates drawn with random_state = 0 depend on the grid, so after re-running
# this search re-check the hard-coded "best_params_" values used in the cell below.
random_search_x = RandomizedSearchCV(random_forest_x, param_grid, cv = 5, n_jobs = 2, verbose = 1,
                                     n_iter = 1000, random_state = 0)
random_search_x.fit(X, y)

In [None]:
# Best hyper-parameter combination among the sampled candidates.
random_search_x.best_params_

In [None]:
# Mean cross-validated score of that best candidate.
random_search_x.best_score_

In [None]:
# Build a model from the values reported by "best_params_".
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# search run. Re-check them whenever the randomized search is re-run.
random_forest_x1 = RandomForestRegressor(random_state = 0,
                                         n_estimators = 127,
                                         min_samples_split = 11,
                                         # None (all features) is what 'auto' meant for regressors;
                                         # 'auto' itself is no longer accepted by newer scikit-learn.
                                         max_features = None,
                                         max_depth = 3,
                                         criterion = 'absolute_error')

In [None]:
from numpy import absolute
from numpy import mean
from sklearn.model_selection import cross_val_score

# 5-fold cross validation of the tuned random forest on three metrics.
# The "neg_*" scorers return negated errors, hence the absolute value before averaging.
r2_rf_x1 = mean(cross_val_score(random_forest_x1, X, y, cv = 5, scoring = 'r2'))

nrmse_rf_x1 = cross_val_score(random_forest_x1, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')
rmse_rf_x1 = mean(absolute(nrmse_rf_x1))

nmape_rf_x1 = cross_val_score(random_forest_x1, X, y, cv = 5, scoring = 'neg_mean_absolute_percentage_error')
mape_rf_x1 = mean(absolute(nmape_rf_x1))

In [None]:
# Report the mean cross-validated metrics of the tuned random forest.
print("R2 RANDOM FOREST:", r2_rf_x1)
print("RMSE RANDOM FOREST:", rmse_rf_x1)
print("MAPE RANDOM FOREST:", mape_rf_x1)

### 3. Support Vector Regression (SVR)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

# Baseline: SVR with default hyper-parameters, evaluated with 5-fold CV
# using the estimator's default score (R2 for scikit-learn regressors).
svr = SVR()
r2_svr = cross_val_score(svr, X, y, cv = 5)

In [None]:
# Mean of the 5 fold scores of the untuned SVR.
r2_svr.mean()

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Exhaustive search over a 20 x 20 log-spaced grid (1e-2 .. 1e2) of `gamma` and `C`.
svr_x = SVR()
param_grid = dict(gamma = np.logspace(-2, 2, 20),
                  C = np.logspace(-2, 2, 20))

grid_search_x2 = GridSearchCV(estimator = svr_x, param_grid = param_grid,
                              cv = 5, n_jobs = 2, verbose = 1)
grid_search_x2.fit(X, y)

In [None]:
# Best (C, gamma) combination found by the grid search.
grid_search_x2.best_params_

In [None]:
# Mean cross-validated score of that best combination.
grid_search_x2.best_score_

In [None]:
# Build a model from the values reported by "best_params_"
# (hard-coded here; presumably copied from an earlier search run).
svr_x1 = SVR(gamma = 0.11288378916846889,
             C = 100.0)

In [None]:
from numpy import absolute
from numpy import mean
from sklearn.model_selection import cross_val_score

# 5-fold cross validation of the tuned SVR on three metrics.
# The "neg_*" scorers return negated errors, hence the absolute value before averaging.
r2_svr_x1 = mean(cross_val_score(svr_x1, X, y, cv = 5, scoring = 'r2'))

nrmse_svr_x1 = cross_val_score(svr_x1, X, y, cv = 5, scoring = 'neg_root_mean_squared_error')
rmse_svr_x1 = mean(absolute(nrmse_svr_x1))

nmape_svr_x1 = cross_val_score(svr_x1, X, y, cv = 5, scoring = 'neg_mean_absolute_percentage_error')
mape_svr_x1 = mean(absolute(nmape_svr_x1))

In [None]:
# Report the mean cross-validated metrics of the tuned SVR.
print("R2 SVR:", r2_svr_x1)
print("RMSE SVR:", rmse_svr_x1)
print("MAPE SVR:", mape_svr_x1)

### B. Random Oversampling (Learning Curve) -> Bandingkannya dengan Model Random Forest Hasil Tuning

In [None]:
# Try different per-class sample sizes for the random oversampling

def learning_curve(data, observations = range(37, 101)):
    """Cross-validate the tuned random forest on randomly oversampled data of growing size.

    For every ``n`` in ``observations`` the three job classes ("data analyst",
    "data engineer", "data scientist") are each resampled with replacement to ``n``
    rows. The categorical columns are then encoded, the target ``median_gaji`` is
    Box-Cox transformed, and the model is scored with 5-fold cross validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Un-encoded data with at least the columns ``job_name``, ``lokasi``,
        ``industri``, ``tingkat_job``, ``ukuran_company`` and ``median_gaji``.
        Every column other than the target is passed to the model as a feature.
    observations : iterable of int
        Per-class sample sizes to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``observations`` (in order) with the columns
        ``r2_rf``, ``rmse_rf`` and ``mape_rf``: the mean cross-validated R2, RMSE
        and MAPE, all measured on the Box-Cox transformed target. The frame is
        empty (columns only) when ``observations`` is empty.

    Notes
    -----
    NOTE(review): oversampling happens *before* the folds are built, so copies of
    the same row can end up in both the training and the validation part of a fold.
    The Box-Cox lambda is also estimated on all rows. Both make the scores
    optimistic; confirm this is acceptable for the comparison being made.
    """
    nominal_columns = ["job_name", "lokasi", "industri"]
    # Ordinal encodings; loop-invariant, so defined once.
    level_order = {"magang": 0, "tingkat pemula": 1, "asosiasi": 2,
                   "senior tingkat menengah": 3, "direktur": 4, "eksekutif": 5}
    size_order = {"1-50 pekerja": 0, "51-200 pekerja": 1, "201-500 pekerja": 2,
                  "501-1.000 pekerja": 3, "1.001-5.000 pekerja": 4, ">5.000 pekerja": 5}
    metrics = ("r2", "neg_root_mean_squared_error", "neg_mean_absolute_percentage_error")

    # The class subsets do not depend on n, so filter them only once.
    analysts = data[data["job_name"] == "data analyst"]
    engineers = data[data["job_name"] == "data engineer"]
    scientists = data[data["job_name"] == "data scientist"]

    scores = []
    for n in observations:
        # At n == 37 the "data analyst" rows are used as-is instead of being resampled.
        # Presumably 37 is the original size of that class -- TODO confirm.
        if n == 37:
            s1 = analysts
        else:
            s1 = analysts.sample(n, replace = True, random_state = 0)
        s2 = engineers.sample(n, replace = True, random_state = 0)
        s3 = scientists.sample(n, replace = True, random_state = 0)
        resampled = pd.concat([s1, s2, s3])

        #--------------- Handling Categorical Data

        # NOMINAL: one-hot encode, put the dummy columns in front of the remaining
        # columns (same column order as before) and drop the raw nominal columns.
        encoded_nominal = pd.get_dummies(data = resampled[nominal_columns])
        encoded = pd.concat(objs = [encoded_nominal, resampled], axis = 1)
        encoded = encoded.drop(columns = nominal_columns)

        # ORDINAL: explicit mappings (a fresh dict copy is handed to each encoder).
        # 1. "tingkat_job"
        encoder_tingkat_job = ce.OrdinalEncoder(cols = ["tingkat_job"], return_df = True,
                                                mapping = [{"col": "tingkat_job",
                                                            "mapping": dict(level_order)}])
        encoded["tingkat_job"] = encoder_tingkat_job.fit_transform(encoded["tingkat_job"])
        # 2. "ukuran_company"
        encoder_ukuran_company = ce.OrdinalEncoder(cols = ["ukuran_company"], return_df = True,
                                                   mapping = [{"col": "ukuran_company",
                                                               "mapping": dict(size_order)}])
        encoded["ukuran_company"] = encoder_ukuran_company.fit_transform(encoded["ukuran_company"])

        #--------------- Target Engineering

        # Box-Cox transform of the target; the fitted lambda is not needed afterwards.
        transformed_target, _best_lambda = stats.boxcox(encoded["median_gaji"])
        encoded["median_gaji"] = transformed_target

        #--------------- K-Fold Cross Validation

        # With k-fold cross validation all rows are used directly.
        features = encoded.drop(columns = "median_gaji")
        target = encoded["median_gaji"]

        #--------------- Modelling Random Forest

        # Tuned configuration from the Random Forest section (RandomForestRegressor is
        # imported there). max_features = None uses all features, like the former 'auto'.
        model = RandomForestRegressor(random_state = 0,
                                      n_estimators = 127,
                                      min_samples_split = 11,
                                      max_features = None,
                                      max_depth = 3,
                                      criterion = 'absolute_error')

        # One cross_validate call scores all three metrics on the same 5 fits,
        # instead of refitting the forest once per metric.
        cv_result = cross_validate(model, features, target, cv = 5, scoring = metrics)

        # The "neg_*" scorers return negated errors, hence the absolute value.
        scores.append((
            np.mean(cv_result["test_r2"]),
            np.mean(np.abs(cv_result["test_neg_root_mean_squared_error"])),
            np.mean(np.abs(cv_result["test_neg_mean_absolute_percentage_error"])),
        ))

    # Build the result once, after the loop, so it also exists for empty `observations`.
    return pd.DataFrame(scores, columns = ["r2_rf", "rmse_rf", "mape_rf"])

In [None]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook (the loaded frame
# is `data_oke`). Presumably it is the un-encoded dataset that learning_curve expects;
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
len(data)

In [None]:
from datetime import datetime
# Time the full learning-curve run, since it fits a forest for every sample size.
start_time = datetime.now()

# Evaluate per-class sample sizes 37..200; the plotting cells below use the same range.
score_lc = learning_curve(data, observations = range(37, 201))

end_time = datetime.now()
print("Durasi Learning Curve: {}".format(end_time - start_time))

In [None]:
# One row per evaluated sample size, columns r2_rf / rmse_rf / mape_rf.
score_lc

In [None]:
# R2 learning curve over all evaluated per-class sample sizes (37..200)
plt.plot(range(37, 201), score_lc["r2_rf"], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val R2", fontsize = 14)
plt.title("Over Sampling Learning Curve (R2)", fontsize = 16)
sns.despine();

In [None]:
# RMSE learning curve over all evaluated per-class sample sizes (37..200)
plt.plot(range(37, 201), score_lc["rmse_rf"], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val RMSE", fontsize = 14)
plt.title("Over Sampling Learning Curve (RMSE)", fontsize = 16)
sns.despine();

In [None]:
# MAPE learning curve over all evaluated per-class sample sizes (37..200)
plt.plot(range(37, 201), score_lc["mape_rf"], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val MAPE", fontsize = 14)
plt.title("Over Sampling Learning Curve (MAPE)", fontsize = 16)
sns.despine();

In [None]:
# Judging from the R2, RMSE and MAPE curves, resampling sizes between 37 and 105 are probably enough

In [None]:
# Zoom in on the first 69 sample sizes (37..105) of the R2 curve
plt.plot(range(37, 106), score_lc["r2_rf"].iloc[:69], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val R2", fontsize = 14)
plt.title("Over Sampling Learning Curve (R2)", fontsize = 16)
sns.despine();

# Coba pakai 100 saja

In [None]:
# Zoom in on the first 69 sample sizes (37..105) of the RMSE curve
plt.plot(range(37, 106), score_lc["rmse_rf"].iloc[:69], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val RMSE", fontsize = 14)
plt.title("Over Sampling Learning Curve (RMSE)", fontsize = 16)
sns.despine();

# Coba pakai 100 saja

In [None]:
# Zoom in on the first 69 sample sizes (37..105) of the MAPE curve
plt.plot(range(37, 106), score_lc["mape_rf"].iloc[:69], linewidth = 4)
plt.xlabel("# of Points per Class", fontsize = 14)
plt.ylabel("Cross Val MAPE", fontsize = 14)
plt.title("Over Sampling Learning Curve (MAPE)", fontsize = 16)
sns.despine();

# Coba pakai 100 saja

In [None]:
# God willing, the oversampling will use n = 100 rows per class