
# Load Data, Drop Unecessary Information

In [65]:
import pandas as pd
import os

In [66]:
def create_complete_df(num_files):
    path_to_features  = '/content/drive/MyDrive/SOC/Data/'

    list_of_dfs = []
    for i in range(num_files):
        csv_file = f'TOAFastFeatures{i + 1}.csv'
        print(csv_file)
        df = pd.read_csv(os.path.join(path_to_features, csv_file))
        list_of_dfs.append(df)

    df = pd.concat(list_of_dfs, axis=0)
    return df

In [67]:
df = create_complete_df(1)

TOAFastFeatures1.csv


In [68]:
df.reset_index(inplace = True)

In [69]:
df.drop(['index'], axis = 1, inplace = True)

# Iterative Imputer

In [70]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(verbose=2, max_iter=20)

imputed_df = pd.DataFrame(imputer.fit_transform(df), columns = df.columns)
imputed_df

[IterativeImputer] Completing matrix with shape (1726, 150)
[IterativeImputer] Ending imputation round 1/20, elapsed time 11.73
[IterativeImputer] Change: 939.7315853459478, scaled tolerance: 3545.9554688690087 
[IterativeImputer] Early stopping criterion reached.


Unnamed: 0,AFRI1600,AFRI2100,ANDWI,AVI,AWEInsh,AWEIsh,B1,B10,B11,B2,...,TriVI,UI,VARI,VI6T,VIBI,VIG,VgNIRBI,VrNIRBI,WI1,WI2
0,-0.042037,0.188964,-0.302613,0.178517,-0.062937,-0.289623,0.127853,292.999382,292.361504,0.111648,...,2.313388,0.153771,-0.058710,0.662873,0.402603,-0.029935,-0.193855,-0.165107,-0.337681,-0.287793
1,0.040983,0.240694,-0.376670,0.318068,-0.187675,-0.947703,0.172250,313.899918,312.935503,0.166961,...,2.706954,0.100678,-0.273590,0.861794,0.476970,-0.186807,-0.328115,-0.150532,-0.415051,-0.512212
2,0.256086,0.483660,-0.004223,0.201418,0.292538,0.001921,0.221778,291.975769,290.951062,0.201033,...,3.362856,-0.184076,0.096392,0.753567,1.639177,0.038192,-0.090044,-0.127691,0.096108,0.166270
3,0.116418,0.277009,-0.215824,0.240299,0.353594,-0.547265,0.239610,313.394856,313.239800,0.248634,...,-0.124035,0.062049,-0.191491,0.863645,0.416230,-0.120620,-0.183741,-0.064552,-0.243018,-0.322212
4,0.094604,0.338201,-0.242855,0.232782,-0.023527,-0.290984,0.158154,305.049629,302.127800,0.142029,...,4.205121,-0.008196,-0.046426,0.736685,0.662532,-0.021764,-0.230045,-0.209671,-0.222506,-0.170213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1721,0.231414,0.590910,-0.380825,0.319432,-0.280356,-0.404473,0.116841,298.614562,296.120704,0.096284,...,9.552927,-0.323608,0.169112,0.767804,1.075321,0.053107,-0.471480,-0.509807,-0.170280,-0.094464
1722,0.139767,0.315097,-0.265381,0.323284,0.242044,-0.882226,0.242697,288.616457,288.614634,0.254960,...,1.740761,0.020362,-0.221458,0.903888,0.614974,-0.151163,-0.253370,-0.106281,-0.272308,-0.400770
1723,0.144147,0.389114,-0.140860,0.266714,0.287478,-0.379172,0.279064,310.193457,308.096285,0.295164,...,2.552596,-0.064120,-0.096375,0.868078,0.576498,-0.055979,-0.140060,-0.084755,-0.076610,-0.134603
1724,0.649607,0.878403,0.017765,0.264676,0.104943,-0.046594,0.123984,295.756511,293.847276,0.098381,...,7.686679,-0.771560,0.847155,0.642376,6.547221,0.179263,-0.314948,-0.453854,0.581072,0.661976


# Merge HWSD Data

In [71]:
HWSD = pd.read_csv('/content/drive/MyDrive/SOCData/HWSD_DATA.csv')
HWSD = HWSD[['MU_GLOBAL', 'S_OC']]
HWSD

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,MU_GLOBAL,S_OC
0,7001,
1,7002,
2,7003,
3,7004,32.89
4,7005,
...,...,...
48143,32049,0.13
48144,32049,
48145,32050,
48146,32050,0.13


In [72]:
HWSD = HWSD.groupby('MU_GLOBAL').mean()

In [73]:
new = pd.merge(imputed_df, HWSD, on="MU_GLOBAL")
new = new.dropna()

# Split Data

In [74]:
y = new.pop('S_OC')
X = new

In [75]:
assert 'S_OC' not in X.columns

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

In [77]:
assert len(X_train) == len(y_train)

In [78]:
assert len(X_test) == len(y_test)

# Feature Standardization

In [79]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

# Decision Tree Regressor

In [115]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

DecisionTreeRegressor()

# Random Forest Regressor

In [116]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

# XGBoost

In [128]:
from xgboost import XGBRegressor

X_train_xgb, X_val, y_train_xgb, y_val = train_test_split(X_train, y_train)

xgb = XGBRegressor(n_estimators=1500, learning_rate=0.01, eval_metric="rmse", early_stopping_rounds=10)
xgb.fit(X_train, y_train, verbose =100)



XGBRegressor(early_stopping_rounds=10, eval_metric='rmse', learning_rate=0.01,
             n_estimators=1500)

# SVM

In [118]:
from sklearn.svm import SVR

svr = SVR(C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)

SVR(epsilon=0.2)

# LGBM

In [119]:
import lightgbm

hyper_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.005,
    'verbose': -1,
    'n_estimators': 1000,
    'random_state' : 0
}

lgbm = lightgbm.LGBMRegressor(**hyper_params)

lgbm.fit(X_train_xgb,
         y_train_xgb,
         eval_set = [(X_val, y_val)],
         callbacks = [lightgbm.early_stopping(stopping_rounds = 20)],
         verbose = 100
)

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:
[71]	valid_0's rmse: 0.276717


LGBMRegressor(learning_rate=0.005, metric='rmse', n_estimators=1000,
              objective='regression', random_state=0, verbose=-1)

# RMSE

In [129]:
from sklearn.metrics import mean_squared_error

models_list = {
    "Decision Tree" : dt,
    "Random Forest" : rf,
    "XGBoost" : xgb,
    "SVR" : svr,
    "LGBM" : lgbm,
}

for name, model in models_list.items():
    print(name + ": " + str(round(mean_squared_error(model.predict(X_test), y_test), 5)))

Decision Tree: 4.39103
Random Forest: 3.18231
XGBoost: 5.75293
SVR: 2.92049
LGBM: 2.87813
