In [None]:
import pandas as pd
# CSV exports to load; each path becomes one frame and the frames are stacked row-wise.
csv_paths = ['/content/drive/MyDrive/SOC/Data/AgEvidence/continuous cover/continuous cover0.csv']
df_list = [pd.read_csv(csv_path) for csv_path in csv_paths]
df = pd.concat(df_list, axis = 0)
df

In [None]:
# Normalise the response-variable text to lower case so the filter below is case-insensitive.
df["rv"] = df["rv"].str.lower()

In [None]:
# Keep only rows whose response variable mentions carbon or an organic matter fraction.
# na=False treats a missing `rv` as "no match" (a NaN in the mask would make the
# selection raise); regex=False matches the phrases literally.
rv_text = df["rv"]
keep_row = (
    rv_text.str.contains("carbon", na=False, regex=False)
    | rv_text.str.contains("organic matter fraction", na=False, regex=False)
)
# .copy() detaches the result from the unfiltered frame so that columns added to
# `df` later do not raise SettingWithCopyWarning.
df = df[keep_row].copy()

In [None]:
# Display the rows that remain after the `rv` filter.
df

In [None]:
# Count how many rows use each `rvUnits` value.
df.rvUnits.value_counts()

In [None]:
# List the column names available in the frame.
df.columns

One-hot encoding

In [None]:
# Frequency of each distinct raw `control` cell value.
# NOTE(review): later cells parse these values with ast.literal_eval, so each cell
# is presumably a string holding a list literal — confirm against the data.
df["control"].value_counts()

In [None]:
def encode(data, variable):
    """Return 1 if ``variable`` is one of the labels stored in ``data``, else 0.

    Parameters
    ----------
    data
        One cell value. Either a real container (list/tuple/set/...) or a
        string holding a Python literal such as ``"['monocrop', 'intercrop']"``.
    variable
        The label to look for.

    Returns
    -------
    int
        1 when the label is present, 0 otherwise.

    Notes
    -----
    String cells that parse to a container are checked by true membership.
    A plain ``in`` on the raw string is a substring test and would report
    ``"crop"`` as present in ``"['monocrop']"``. Strings that are not a
    container literal keep the original substring test. Cells that do not
    support ``in`` at all (``None``, ``NaN``) yield 0 instead of raising
    ``TypeError``.
    """
    # Local import keeps the helper usable regardless of which cell imported `ast`.
    import ast

    if not isinstance(data, str):
        try:
            return int(variable in data)
        except TypeError:
            # Missing cell (None / float NaN): the label cannot be present.
            return 0

    try:
        parsed = ast.literal_eval(data)
    except (ValueError, SyntaxError):
        # Not a Python literal: fall back to the original substring behaviour.
        return int(variable in data)

    if isinstance(parsed, (list, tuple, set, frozenset, dict)):
        return int(variable in parsed)
    return int(variable in data)

In [None]:
import ast

In [None]:
# Collect every distinct label that appears in the `control` column.
# Each non-missing cell is parsed with ast.literal_eval and its elements are pooled.
# `.dropna().unique()` replaces `value_counts().iteritems()`: `Series.iteritems`
# was removed in pandas 2.0 and the counts were never used.
controls = {
    label
    for raw_cell in df["control"].dropna().unique()
    for label in ast.literal_eval(raw_cell)
}
controls

In [None]:
# Collect every distinct label that appears in the `treatment` column.
# Each non-missing cell is parsed with ast.literal_eval and its elements are pooled.
# `.dropna().unique()` replaces `value_counts().iteritems()`: `Series.iteritems`
# was removed in pandas 2.0 and the counts were never used.
treatments = {
    label
    for raw_cell in df["treatment"].dropna().unique()
    for label in ast.literal_eval(raw_cell)
}
treatments

In [None]:
# Collect every distinct label that appears in the `croptype` column.
# Each non-missing cell is parsed with ast.literal_eval and its elements are pooled.
# `.dropna().unique()` replaces `value_counts().iteritems()`: `Series.iteritems`
# was removed in pandas 2.0 and the counts were never used.
croptype = {
    label
    for raw_cell in df["croptype"].dropna().unique()
    for label in ast.literal_eval(raw_cell)
}
croptype

In [None]:
# Add one 0/1 indicator column per label, named "<label>_<source column>".
# The source columns are processed in the order control, treatment, croptype.
label_sets = {
    "control": controls,
    "treatment": treatments,
    "croptype": croptype,
}
for source_col, labels in label_sets.items():
    for label in labels:
        df[label + "_" + source_col] = df[source_col].apply(encode, args = [label])

In [None]:
# Re-check the column list; the loops above add "<label>_control",
# "<label>_treatment" and "<label>_croptype" indicator columns.
df.columns

In [None]:
# Descriptor columns used as model inputs.
descriptor_cols = [
    "rvUnits",
    "startYear",
    "studyLength",
    "sampleDepth",
    "controlValue",
    "norm",
    "numspecies",
    "speciestype",
    "fertilization",
]

# Indicator columns, named "<label>_<source column>".
indicator_cols = [
    'bare soil_control',
    'monocrop_control',
    'intercrop_treatment',
    'monocrop_treatment',
    'alley crop_treatment',
    'Tree_croptype',
    'Bean_croptype',
    'Bare Soil_croptype',
    'Vegetable_croptype',
    'Cereal_croptype',
]

# Full, ordered list of model input columns.
feature_cols = descriptor_cols + indicator_cols

In [None]:
# Feature matrix and regression target.
# .copy() gives X its own data: X is modified in later cells (ordinal encoding,
# fillna), and doing that on a bare column selection of `df` raises
# SettingWithCopyWarning.
X = df[feature_cols].copy()
y = df['percentChange']

In [None]:
# Number of missing values in each feature column.
X.isnull().sum()

In [None]:
# Feature dtypes before the ordinal-encoding step.
X.dtypes

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Columns whose values are replaced by ordinal codes.
to_encode = ["rvUnits", "sampleDepth", "speciestype", "fertilization"]
encoder = OrdinalEncoder()

# NOTE(review): a later cell fills missing values with 0, which is also the code
# OrdinalEncoder gives to the first category — confirm that overlap is acceptable.
encoded_values = encoder.fit_transform(X[to_encode])
X[to_encode] = encoded_values

In [None]:
# Feature dtypes after the ordinal-encoding step.
X.dtypes

In [None]:
# Replace remaining missing feature values with 0.
# Reassigning instead of inplace=True avoids mutating a frame derived from `df`
# (which raises SettingWithCopyWarning); the result is the same.
X = X.fillna(0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed seed makes the row assignment repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Sanity check: features and target must have the same number of rows in each split.
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed seed (same value as the train/test split) so the fitted tree — and the
# reported score — is the same on every Restart & Run All.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed (same value as the train/test split) so the bootstrap samples and
# feature draws — and the reported score — are the same on every Restart & Run All.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# XGBoost

In [None]:
from xgboost import XGBRegressor

# Carve a validation set out of the training data for early stopping.
# Seeded so the split (also reused by the LGBM cell) is reproducible.
X_train_xgb, X_val, y_train_xgb, y_val = train_test_split(X_train, y_train, random_state=42)

xgb = XGBRegressor(n_estimators=1500, learning_rate=0.01, eval_metric="rmse", early_stopping_rounds=10)
# Early stopping needs a held-out eval_set; the original call passed the full
# training set and no eval_set, so the validation split was never used and
# XGBoost had nothing to monitor.
xgb.fit(X_train_xgb, y_train_xgb, eval_set=[(X_val, y_val)], verbose =100)

# SVM

In [None]:
from sklearn.svm import SVR

# Support-vector regressor settings, kept in one place for easy tuning.
svr_params = {"C": 1.0, "epsilon": 0.2}

svr = SVR(**svr_params)
svr.fit(X_train, y_train)

# LGBM

In [None]:
import lightgbm

# LightGBM settings: RMSE-monitored regression with a small learning rate.
hyper_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.005,
    'verbose': -1,
    'n_estimators': 1000,
    'random_state' : 0
}

lgbm = lightgbm.LGBMRegressor(**hyper_params)

# `fit(verbose=...)` was removed in LightGBM 4.0; periodic logging is requested
# through a callback instead. `log_evaluation` exists from 3.3 on; older
# releases expose the same callback as `print_evaluation`.
make_log_callback = getattr(lightgbm, "log_evaluation", None) or lightgbm.print_evaluation

# Train on the inner training split and stop early on the validation split
# created in the XGBoost cell.
lgbm.fit(X_train_xgb,
         y_train_xgb,
         eval_set = [(X_val, y_val)],
         callbacks = [
             lightgbm.early_stopping(stopping_rounds = 20),
             make_log_callback(period = 100),
         ],
)

# RMSE

In [None]:
from sklearn.metrics import mean_squared_error

# Fitted models to compare, keyed by the label printed in the report.
models_list = {
    "Decision Tree" : dt,
    "Random Forest" : rf,
    "XGBoost" : xgb,
    "SVR" : svr,
    "LGBM" : lgbm,
}

# Print the test-set RMSE of every model, rounded to 5 decimals.
for name, model in models_list.items():
    # Arguments are passed as (y_true, y_pred), the order scikit-learn documents.
    # The square root is taken explicitly because the `squared=False` flag was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(mean_squared_error(y_test, model.predict(X_test)) ** 0.5)
    print(name + ": " + str(round(rmse, 5)))