# Modelling (Perbandingan Metode & Learning Curve SMOTE)

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [None]:
data_oke = pd.read_csv("Data Gabungan Hasil Preprocessing Tahap 2 (3 Profesi).csv")
data_oke.tail()

# K-Fold Cross Validation

In [None]:
# K-fold cross-validation is run on the complete dataset, so no separate
# train/test split is made here: target first, then every remaining column as features.
y = data_oke["median_gaji"]
X = data_oke.drop(columns = ["median_gaji"])

In [None]:
len(y)

# Modelling

### A. Pakai Data Asli (Tanpa Resampling) -> Untuk Perbandingan 3 Metode

### 1. Decision Tree

In [None]:
# A cell only displays its last expression, so show both lengths as one tuple
# (a bare `len(X)` on its own line would be evaluated and silently discarded).
len(X), len(y)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Baseline: an untuned decision tree scored with 5-fold cross-validation.
# No `scoring` is given, so the estimator's default scorer is used
# (R^2 for scikit-learn regressors).
tree = DecisionTreeRegressor(random_state = 0)
r2_tree = cross_val_score(tree, X, y, cv = 5)

In [None]:
r2_tree.mean()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over the decision-tree hyper-parameters.
#
# Only valid values are searched:
#   * max_depth must be >= 1 and an integer min_samples_split must be >= 2;
#     values such as 0 or 1 make every fit of that candidate fail (scored as NaN).
#   * criterion uses the current names "squared_error" / "absolute_error"
#     (formerly "mse" / "mae"), matching the random-forest grid in this notebook.
#   * max_features = None means "use all features", which is what the
#     deprecated "auto" option meant for a tree regressor.
tree_x = DecisionTreeRegressor(random_state = 0)
param_grid = {"max_depth": list(range(1, 11)),
              "criterion": ["squared_error", "friedman_mse", "absolute_error"],
              "max_features": [None, "sqrt", "log2"],
              "min_samples_split": list(range(2, 6))}

grid_search_x = GridSearchCV(tree_x, param_grid, n_jobs = 2, verbose = 1, cv = 5)

grid_search_x.fit(X, y)

In [None]:
grid_search_x.best_params_

In [None]:
# Ini nilai score test (R2)-nya
grid_search_x.best_score_ # Mean cross-validated score of the best_estimator

In [None]:
# Rebuild the tuned tree from the values reported by "best_params_".
# 'absolute_error' is the current name of the criterion formerly spelled 'mae'
# (same split rule; the old spelling is rejected by newer scikit-learn releases).
tree_x1 = DecisionTreeRegressor(random_state = 0,
                                criterion = 'absolute_error',
                                max_depth = 2,
                                max_features = 'sqrt',
                                min_samples_split = 2)

In [None]:
# 5-fold cross-validated R2, RMSE and MAPE of the tuned tree.
# All three metrics are computed in ONE cross-validation pass (5 fits) instead of
# three separate cross_val_score calls (15 fits). The folds are the same
# unshuffled 5-fold split and the estimator is seeded, so the scores are unchanged.
from sklearn.model_selection import cross_val_score, cross_validate
from numpy import mean
from numpy import absolute

cv_tree_x1 = cross_validate(tree_x1, X, y, cv = 5,
                            scoring = ["r2",
                                       "neg_root_mean_squared_error",
                                       "neg_mean_absolute_percentage_error"])

# scikit-learn reports error metrics negated (higher is better) ...
nrmse_tree_x1 = cv_tree_x1["test_neg_root_mean_squared_error"]
nmape_tree_x1 = cv_tree_x1["test_neg_mean_absolute_percentage_error"]

# ... so take the absolute value before averaging across folds.
r2_tree_x1 = mean(cv_tree_x1["test_r2"])
rmse_tree_x1 = mean(absolute(nrmse_tree_x1))
mape_tree_x1 = mean(absolute(nmape_tree_x1))

In [None]:
print("R2 TREE:", r2_tree_x1)
print("RMSE TREE:", rmse_tree_x1)
print("MAPE TREE:", mape_tree_x1)

#### Menampilkan Grafik Decision Tree 

In [None]:
tree_model = grid_search_x.best_estimator_ # model ini sama dengan tree_x1

In [None]:
tree_model 

In [None]:
X.columns

In [None]:
from sklearn import tree

In [None]:
# Draw the tuned tree on an explicit figure/axes pair; `fig` is kept so the
# next cell can save it.
fig, ax = plt.subplots(figsize = (25, 20))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
_ = tree.plot_tree(tree_model, feature_names = list(X.columns), filled = True, ax = ax)

In [None]:
fig.savefig("grafik decision tree (bagian perbandingan metode).png")

### 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline: an untuned random forest scored with 5-fold cross-validation.
# No `scoring` is given, so the estimator's default scorer is used
# (R^2 for scikit-learn regressors).
random_forest = RandomForestRegressor(random_state = 0)
r2_rf = cross_val_score(random_forest, X, y, cv = 5)

In [None]:
r2_rf.mean()

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized 5-fold search over the random-forest hyper-parameters.
#
# Only valid values are searched:
#   * max_depth starts at 1 (0 is not a valid depth, so every candidate drawn
#     with it would fail and waste part of the n_iter budget);
#   * max_features = 1.0 means "use all features", which is what the deprecated
#     "auto" option meant for a forest regressor.
# NOTE(review): changing the grid changes which candidates are sampled for
# random_state = 0, so re-check any hard-coded "best params" after re-running.
random_forest_x = RandomForestRegressor(random_state = 0)
param_grid = {"n_estimators": list(range(100, 131)),
              "criterion": ["squared_error", "absolute_error"],
              "max_depth": list(range(1, 11)),
              "max_features": [1.0, "sqrt", "log2"],
              "min_samples_split": list(range(2, 16))}

# Exhaustive alternative (left disabled):
# grid_search_x1 = GridSearchCV(random_forest_x, param_grid, n_jobs = 2, verbose = 1, cv = 5)
# grid_search_x1.fit(X, y)

# The full grid has 31 * 2 * 10 * 3 * 14 = 26,040 combinations;
# n_iter = 1000 evaluates only 1000 randomly chosen ones.
random_search_x = RandomizedSearchCV(random_forest_x, param_grid, cv = 5, n_jobs = 2, verbose = 1,
                                     n_iter = 1000, random_state = 0)
random_search_x.fit(X, y)

In [None]:
random_search_x.best_params_

In [None]:
random_search_x.best_score_

In [None]:
# Rebuild the tuned forest from the values reported by "best_params_".
# max_features = 1.0 (all features) is what the deprecated 'auto' option meant
# for a forest regressor, and it is accepted by old and new scikit-learn alike.
random_forest_x1 = RandomForestRegressor(random_state = 0,
                                         n_estimators = 122,
                                         min_samples_split = 13,
                                         max_features = 1.0,
                                         max_depth = 2,
                                         criterion = 'squared_error')

In [None]:
# 5-fold cross-validated R2, RMSE and MAPE of the tuned random forest.
# All three metrics are computed in ONE cross-validation pass (5 forest fits)
# instead of three separate cross_val_score calls (15 fits). The folds are the
# same unshuffled 5-fold split and the forest is seeded, so the scores are unchanged.
from sklearn.model_selection import cross_val_score, cross_validate
from numpy import mean
from numpy import absolute

cv_rf_x1 = cross_validate(random_forest_x1, X, y, cv = 5,
                          scoring = ["r2",
                                     "neg_root_mean_squared_error",
                                     "neg_mean_absolute_percentage_error"])

# scikit-learn reports error metrics negated (higher is better) ...
nrmse_rf_x1 = cv_rf_x1["test_neg_root_mean_squared_error"]
nmape_rf_x1 = cv_rf_x1["test_neg_mean_absolute_percentage_error"]

# ... so take the absolute value before averaging across folds.
r2_rf_x1 = mean(cv_rf_x1["test_r2"])
rmse_rf_x1 = mean(absolute(nrmse_rf_x1))
mape_rf_x1 = mean(absolute(nmape_rf_x1))

In [None]:
print("R2 RANDOM FOREST:", r2_rf_x1)
print("RMSE RANDOM FOREST:", rmse_rf_x1)
print("MAPE RANDOM FOREST:", mape_rf_x1)

#### Menampilkan Grafik Random Forest

In [None]:
rf_model = random_search_x.best_estimator_ # model ini sama dengan random_forest_x1

In [None]:
rf_model

In [None]:
rf_model.estimators_ # masing-masing DT-nya

In [None]:
len(rf_model.estimators_)

In [None]:
from sklearn import tree

In [None]:
# Draw one member tree of the forest (index 1) on an explicit figure/axes pair;
# `fig` is kept so the next cell can save it.
fig, ax = plt.subplots(figsize = (25, 20))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
_ = tree.plot_tree(rf_model.estimators_[1], feature_names = list(X.columns), filled = True, ax = ax)

In [None]:
fig.savefig("grafik random forest (bagian perbandingan metode)_1.png")

### 3. Support Vector Regression (SVR)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

# Baseline: an SVR with default hyper-parameters scored with 5-fold cross-validation.
# No `scoring` is given, so the estimator's default scorer is used
# (R^2 for scikit-learn regressors).
svr = SVR()
r2_svr = cross_val_score(svr, X, y, cv = 5)

In [None]:
svr.fit(X, y)

In [None]:
r2_svr.mean()

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Exhaustive 5-fold grid search for the SVR: gamma and C each take the same
# 20 log-spaced values between 1e-2 and 1e2 (20 x 20 = 400 combinations).
log_spaced_values = np.logspace(-2, 2, 20)

svr_x = SVR()
param_grid = {"gamma": log_spaced_values,
              "C": log_spaced_values}

grid_search_x2 = GridSearchCV(estimator = svr_x, param_grid = param_grid,
                              cv = 5, n_jobs = 2, verbose = 1)
grid_search_x2.fit(X, y)

In [None]:
grid_search_x2.best_params_

In [None]:
grid_search_x2.best_score_

In [None]:
# Rebuild the SVR from hard-coded hyper-parameters taken from "best_params_".
# gamma = 0.1128... is one of the np.logspace(-2, 2, 20) grid points and
# C = 100.0 is the upper end of that grid.
svr_x1 = SVR(C = 100.0, 
             gamma = 0.11288378916846889)

In [None]:
# 5-fold cross-validated R2, RMSE and MAPE of the tuned SVR.
# All three metrics are computed in ONE cross-validation pass (5 fits) instead of
# three separate cross_val_score calls (15 fits). The folds are the same
# unshuffled 5-fold split, so the scores are unchanged.
from sklearn.model_selection import cross_val_score, cross_validate
from numpy import mean
from numpy import absolute

cv_svr_x1 = cross_validate(svr_x1, X, y, cv = 5,
                           scoring = ["r2",
                                      "neg_root_mean_squared_error",
                                      "neg_mean_absolute_percentage_error"])

# scikit-learn reports error metrics negated (higher is better) ...
nrmse_svr_x1 = cv_svr_x1["test_neg_root_mean_squared_error"]
nmape_svr_x1 = cv_svr_x1["test_neg_mean_absolute_percentage_error"]

# ... so take the absolute value before averaging across folds.
r2_svr_x1 = mean(cv_svr_x1["test_r2"])
rmse_svr_x1 = mean(absolute(nrmse_svr_x1))
mape_svr_x1 = mean(absolute(nmape_svr_x1))

In [None]:
print("R2 SVR:", r2_svr_x1)
print("RMSE SVR:", rmse_svr_x1)
print("MAPE SVR:", mape_svr_x1)

In [None]:
svr_model = grid_search_x2.best_estimator_ # model ini sama dengan svr_x1

In [None]:
svr_model

In [None]:
print("intercept:", svr_model.intercept_)
print("dual_coef:", svr_model.dual_coef_)

### B. SMOTE (Learning Curve) -> Bandingkannya dengan Model Decision Tree Hasil Tuning

In [None]:
data = pd.read_csv("Data Gabungan dari Linkedin dan Jobstreet (3 Profesi).csv")
data.head(2)

In [None]:
len(data)

In [None]:
def _fill_missing_by_job(frame, column, fill_by_job):
    """Return ``frame[column]`` with NaNs replaced via a ``job_name -> value`` lookup.

    Rows whose ``job_name`` has no entry in ``fill_by_job`` keep their NaN.
    """
    return frame[column].fillna(frame["job_name"].map(fill_by_job))


def learning_curve(data, oversample = range(37, 101), show_plots = True):
    """Score the tuned decision tree on SMOTE-resampled data for several sample sizes.

    The raw job-posting table is cleaned, imputed and encoded once. Then, for
    every ``n`` in ``oversample``, each of the three professions is oversampled
    with SMOTE to ``n`` rows, the target ``median_gaji`` is Box-Cox transformed,
    and the tuned decision tree is evaluated with 5-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. The code reads the columns "company", "gaji", "sumber",
        "jenis_job", "job_name", "ukuran_company", "industri",
        "lama_pengalaman", "lokasi", "tingkat_job" and "median_gaji".
        The frame is NOT modified; all work happens on a copy, so the
        function can be called repeatedly on the same input.
    oversample : iterable of int, default range(37, 101)
        Target number of rows per profession handed to SMOTE.
        NOTE(review): values below the current size of a class are presumably
        rejected by SMOTE - confirm against the class counts.
    show_plots : bool, default True
        Whether to show the box plot and density plot of "lama_pengalaman"
        that motivate the mean imputation.

    Returns
    -------
    pandas.DataFrame
        One row per value of ``oversample`` (in iteration order) with columns
        "r2_tree", "rmse_tree" and "mape_tree": the mean over the 5 folds.
        Empty (columns only) when ``oversample`` is empty.

    Raises
    ------
    KeyError
        If one of the columns dropped from ``data`` is missing.

    Notes
    -----
    NOTE(review): SMOTE and the Box-Cox transform are fitted on the whole
    dataset before cross-validation, so the folds are not independent of the
    resampling/transform step; the scores may be optimistic.
    """
    import statistics

    import category_encoders as ce
    import seaborn as sns
    from imblearn.over_sampling import SMOTE
    from scipy import stats
    from sklearn.model_selection import cross_validate
    from sklearn.tree import DecisionTreeRegressor

    # --------------- Cleaning (on a copy; the caller's frame stays untouched)
    # Drop unused columns ("jenis_job" is dropped because its correlation is <= 0.25),
    # then drop rows without a target value.
    frame = (
        data
        .drop(columns = ["company", "gaji", "sumber", "jenis_job"])
        .dropna(subset = ["median_gaji"])
        .reset_index(drop = True)
    )

    # --------------- Handling missing values
    # 1. "ukuran_company": categorical, so impute with the mode per "job_name".
    known_company_size = frame[frame["ukuran_company"].notna()]
    size_mode_by_job = (
        known_company_size
        .groupby("job_name")["ukuran_company"]
        .apply(statistics.mode)
        .to_dict()
    )
    frame["ukuran_company"] = _fill_missing_by_job(frame, "ukuran_company", size_mode_by_job)

    # 2. "industri": the per-job mode is the same for every profession, so a
    #    hand-picked value is used instead ("data analyst" takes its
    #    second-most-common industry, "konsultasi").
    impute_industri = {"data analyst": "konsultasi",
                       "data engineer": "teknologi informasi dan komunikasi",
                       "data scientist": "teknologi informasi dan komunikasi"}
    frame["industri"] = _fill_missing_by_job(frame, "industri", impute_industri)

    # 3. "lama_pengalaman": inspect the distribution first. It was judged roughly
    #    normal, so the per-job mean is used as replacement.
    if show_plots:
        observed_experience = frame["lama_pengalaman"].dropna()

        plt.boxplot(observed_experience, meanline = True, showmeans = True)
        plt.title("Boxplot Lama Pengalaman")
        plt.ylabel("Lama Pengalaman (Tahun)")
        plt.show()

        # kdeplot replaces the deprecated sns.distplot(hist = False, kde = True).
        density_ax = sns.kdeplot(observed_experience)
        density_ax.set_xlabel("Lama Pengalaman (Tahun)")
        plt.show()

    # mean() skips NaN, so missing values do not influence the group means.
    mean_experience_by_job = {
        job: round(average, 1)
        for job, average in frame.groupby("job_name")["lama_pengalaman"].mean().items()
    }
    frame["lama_pengalaman"] = _fill_missing_by_job(frame, "lama_pengalaman", mean_experience_by_job)

    # --------------- Handling categorical data
    # NOMINAL -> "lokasi", "industri" now, "job_name" after SMOTE (one-hot via get_dummies)
    # ORDINAL -> "tingkat_job", "ukuran_company" (OrdinalEncoder with explicit order)

    # 1. NOMINAL: the dummy columns are placed FIRST, then the remaining columns.
    #    The column order is kept exactly like this because it influences which
    #    features the tree samples for max_features = 'sqrt'.
    nominal_columns = ["lokasi", "industri"]
    nominal_dummies = pd.get_dummies(data = frame[nominal_columns])
    data_oke = pd.concat(objs = [nominal_dummies, frame], axis = 1).drop(columns = nominal_columns)

    # 2. ORDINAL
    # 1) "tingkat_job"
    encoder_tingkat_job = ce.OrdinalEncoder(cols = ["tingkat_job"], return_df = True,
                                            mapping = [{"col": "tingkat_job",
                                                        "mapping": {"magang": 0, "tingkat pemula": 1, "asosiasi": 2,
                                                                    "senior tingkat menengah": 3, "direktur": 4, "eksekutif": 5}}])
    data_oke["tingkat_job"] = encoder_tingkat_job.fit_transform(data_oke["tingkat_job"])

    # 2) "ukuran_company"
    encoder_ukuran_company = ce.OrdinalEncoder(cols = ["ukuran_company"], return_df = True,
                                               mapping = [{"col": "ukuran_company",
                                                           "mapping": {"1-50 pekerja": 0, "51-200 pekerja": 1, "201-500 pekerja": 2,
                                                                       "501-1.000 pekerja": 3, "1.001-5.000 pekerja": 4, ">5.000 pekerja": 5}}])
    data_oke["ukuran_company"] = encoder_ukuran_company.fit_transform(data_oke["ukuran_company"])

    # --------------- Loop-invariant pieces
    # For SMOTE the class label is the profession; everything else (including
    # the salary target) is treated as a feature to be interpolated.
    smote_features = data_oke.drop("job_name", axis = 1)
    smote_labels = data_oke["job_name"]

    # Tuned decision tree ('absolute_error' is the current name of 'mae').
    # cross_validate clones the estimator for every fold, so one instance can be reused.
    tree_x1 = DecisionTreeRegressor(random_state = 0,
                                    criterion = 'absolute_error',
                                    max_depth = 2,
                                    max_features = 'sqrt',
                                    min_samples_split = 2)
    metrics = ["r2", "neg_root_mean_squared_error", "neg_mean_absolute_percentage_error"]

    score_rows = []
    for n in oversample:
        # --------------- SMOTE on all data: every profession is brought to n rows
        strategy = {"data analyst": n, "data engineer": n, "data scientist": n}
        smote = SMOTE(random_state = 0, sampling_strategy = strategy)
        X_smote, y_smote = smote.fit_resample(smote_features, smote_labels)

        data_smote = pd.concat([y_smote, X_smote], axis = 1)

        # One-hot encode "job_name" (dummy columns first, as above)
        job_dummies = pd.get_dummies(data = data_smote[["job_name"]])
        data_smote = pd.concat(objs = [job_dummies, data_smote], axis = 1).drop(columns = ["job_name"])

        # --------------- Target engineering: Box-Cox transform of the salary
        transformed_salary, _ = stats.boxcox(data_smote["median_gaji"])
        data_smote["median_gaji"] = transformed_salary

        # --------------- 5-fold cross-validation on the full resampled data
        X = data_smote.drop("median_gaji", axis = 1)
        y = data_smote["median_gaji"]

        # One multi-metric pass gives the same scores as three separate
        # cross_val_score calls while fitting the tree 5 times instead of 15.
        fold_scores = cross_validate(tree_x1, X, y, cv = 5, scoring = metrics)

        # Error metrics are reported negated, hence the absolute value.
        score_rows.append({
            "r2_tree": np.mean(fold_scores["test_r2"]),
            "rmse_tree": np.mean(np.abs(fold_scores["test_neg_root_mean_squared_error"])),
            "mape_tree": np.mean(np.abs(fold_scores["test_neg_mean_absolute_percentage_error"])),
        })

    # Built once after the loop, so an empty `oversample` yields an empty frame.
    return pd.DataFrame(score_rows, columns = ["r2_tree", "rmse_tree", "mape_tree"])

In [None]:
from datetime import datetime

# Run the SMOTE learning curve for 37..100 rows per profession and report
# how long the whole sweep took.
start_time = datetime.now()
score_lc = learning_curve(data, oversample = range(37, 101))
end_time = datetime.now()

elapsed = end_time - start_time
print("Durasi Learning Curve: {}".format(elapsed))

In [None]:
score_lc

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# R2 learning curve: x = SMOTE target size per class (37..100),
# y = the first 64 rows of the "r2_tree" column of score_lc.
axes = plt.gca()
axes.plot(range(37, 101), score_lc["r2_tree"].iloc[0:64], linewidth = 4)
axes.set_title("SMOTE Learning Curve (R2)", fontsize = 16)
axes.set_xlabel("# of Points per Class", fontsize = 14)
axes.set_ylabel("Cross Val R2", fontsize = 14)
sns.despine()

In [None]:
# RMSE learning curve: x = SMOTE target size per class (37..100),
# y = the first 64 rows of the "rmse_tree" column of score_lc.
axes = plt.gca()
axes.plot(range(37, 101), score_lc["rmse_tree"].iloc[0:64], linewidth = 4)
axes.set_title("SMOTE Learning Curve (RMSE)", fontsize = 16)
axes.set_xlabel("# of Points per Class", fontsize = 14)
axes.set_ylabel("Cross Val RMSE", fontsize = 14)
sns.despine()

In [None]:
# MAPE learning curve: x = SMOTE target size per class (37..100),
# y = the first 64 rows of the "mape_tree" column of score_lc.
axes = plt.gca()
axes.plot(range(37, 101), score_lc["mape_tree"].iloc[0:64], linewidth = 4)
axes.set_title("SMOTE Learning Curve (MAPE)", fontsize = 16)
axes.set_xlabel("# of Points per Class", fontsize = 14)
axes.set_ylabel("Cross Val MAPE", fontsize = 14)
sns.despine()

In [None]:
# InsyaAlloh SMOTE-nya pakai n = 87