# Compare sklearn models




In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from matplotlib import pyplot as plt


In [2]:

# Read the train and test tables from the QC'd input folder, discarding the
# coordinate and identifier columns as each file is loaded.
data_dir = '/Documents/Python/Nutrients/match_Acolite_L2/NN_input/with_qc'

train_data = (
    pd.read_csv(os.path.join(data_dir, 'train_autogl_mean.csv'))
    .drop(['lon', 'lat', 'Id'], axis=1)
)
test_data = (
    pd.read_csv(os.path.join(data_dir, 'test_autogl_mean.csv'))
    .drop(['lon', 'lat', 'Id'], axis=1)
)


In [3]:
# Quick look at the first rows of the training table after the column drop.
train_data.head(n=5)

In [4]:

def convert_to_datetime(df):
    """Replace the 'date' column with year, month, day and day-of-week columns.

    The input frame is left untouched; a new DataFrame is returned. This keeps
    the function safe to call on a frame that other code still holds a
    reference to.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without 'date', with the columns 'year', 'month',
        'day' and 'day_of_week' (pandas ``dt.dayofweek``: Monday=0) added.

    Raises
    ------
    KeyError
        If ``df`` has no 'date' column.
    """
    # Parse once into a standalone Series instead of writing back into df.
    dates = pd.to_datetime(df['date'])

    # drop() returns a new frame and assign() adds the derived columns to it,
    # so the caller's frame is never modified.
    return df.drop(['date'], axis=1).assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        day_of_week=dates.dt.dayofweek,
    )


In [5]:
# Swap the raw date column for calendar features in both splits, then preview.
# NOTE: the 'date' column is gone afterwards, so this cell cannot be re-run
# without reloading the data first.
train_data = train_data.pipe(convert_to_datetime)
test_data = test_data.pipe(convert_to_datetime)
train_data.head(n=5)

In [6]:


# Both nutrient columns are removed from the features; only the first one
# (DIN) is modelled here, on a natural-log scale.
# NOTE(review): np.log assumes the target values are strictly positive — confirm.
label_output = ['DIN', 'DIP']

X_train = train_data.drop(label_output, axis=1)
X_test = test_data.drop(label_output, axis=1)

y_train = np.log(train_data[label_output[0]])
y_test = np.log(test_data[label_output[0]])

In [7]:


from nutrient_utils.my_utils import cal_performance

# Fixed seed so the stochastic estimators (tree, forest, boosting, MLP)
# produce the same scores on every Restart & Run All.
SEED = 42

models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor(random_state=SEED)),
    ('Random Forest', RandomForestRegressor(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=SEED)),
    ('SVR', SVR()),
    ('KNN', KNeighborsRegressor()),
    ('MLP', MLPRegressor(max_iter=1000, random_state=SEED))
]

results = []        # test-set scores, one entry per model
results_train = []  # train-set scores, same order as `results`
names = []          # model names, same order as the two lists above

# Fit each model on the log-transformed target and score it on both splits.
for name, model in models:
    model.fit(X_train, y_train)

    # The target is log-transformed, so targets and predictions are mapped
    # back with exp before scoring. Both splits are scored on the original
    # scale so the train and test columns of the summary table are comparable
    # (the test split used to be scored in log space, the train split not).
    y_pred = model.predict(X_test)
    scores = cal_performance(np.exp(y_test), np.exp(y_pred))
    results.append(scores)

    y_pred_train = model.predict(X_train)
    scores_train = cal_performance(np.exp(y_train), np.exp(y_pred_train))
    results_train.append(scores_train)

    names.append(name)
    # cal_performance is a project helper; its first two return values are
    # presumably r2 and rmse (matching the column names used when the scores
    # are tabulated) — TODO confirm.
    print(f'{name}: Average r2: {scores[0]}, rmse: {scores[1]}')



In [8]:
# Test-set scores as a table: one row per model, rounded to 2 decimal places.
df_results = pd.DataFrame(
    data=results,
    index=names,
    columns=['r2', 'rmse', 'mape', 'n'],
).round(decimals=2)
df_results

In [9]:
# Train-set scores in the same layout as the test table, rounded to 2 decimals.
df_results_train = pd.DataFrame(
    data=results_train,
    index=names,
    columns=['r2', 'rmse', 'mape', 'n'],
)
df_results_train = df_results_train.round(decimals=2)
df_results_train


In [10]:
# Put test and train scores side by side (columns suffixed by split), rank the
# models by test r2 from best to worst, and label the index as 'model'.
df_results_all = (
    pd.concat(
        [df_results.add_suffix('_test'), df_results_train.add_suffix('_train')],
        axis=1,
    )
    .sort_values(by='r2_test', ascending=False)
    .rename_axis('model')
)

# Persist the ranked table (model names are written as the first column).
df_results_all.to_csv('model_performance.csv', index=True)

df_results_all