<a href="https://colab.research.google.com/github/amoniaka-knabino/sirius_climate_2024/blob/main/compare_linreg_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [110]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression

# Column labels applied, in order, to the fixed-width data files
# (the files are read with `names=column_name`).
column_name = [
    # timestamp
    'year', 'mon', 'day', 'hour',
    # position
    'lat', 'lon',
    # measured variables -- meanings are not documented in this notebook;
    # TODO confirm against the dataset description
    'hsun', 'slp', 'ta', 'sst', 'td', 'rh',
    'icn', 'icl', 'low', 'mid', 'high',
    'iw', 'ws', 'wd',
]

# Month numbers appended after the "01" file of each year; range(2, 3) is just month 2.
numbers = range(2, 3)

In [101]:
# Load the 2017 monthly fixed-width files: month 01 first, then every month
# listed in `numbers`. All frames are collected in a list and concatenated
# once, which avoids re-copying the growing frame on every loop iteration.
frames_2017 = [pd.read_fwf("2017_01_data.fwf", names=column_name)]
frames_2017.extend(
    pd.read_fwf(f"2017_{num:02d}_data.fwf", names=column_name)
    for num in numbers
)

# ignore_index=True gives the combined frame one continuous 0..N-1 index
# instead of repeating each file's own row labels.
df_2017 = pd.concat(frames_2017, axis=0, ignore_index=True)

In [102]:
# Load the 2016 monthly fixed-width files: month 01 first, then every month
# listed in `numbers`. All frames are collected in a list and concatenated
# once, which avoids re-copying the growing frame on every loop iteration.
frames_2016 = [pd.read_fwf("2016_01_data.fwf", names=column_name)]
frames_2016.extend(
    pd.read_fwf(f"2016_{num:02d}_data.fwf", names=column_name)
    for num in numbers
)

# ignore_index=True gives the combined frame one continuous 0..N-1 index
# instead of repeating each file's own row labels.
df_2016 = pd.concat(frames_2016, axis=0, ignore_index=True)

In [103]:
# Feature frame for 2017: every column except the target 'rh' and 'td'.
# NOTE(review): 'td' is presumably excluded because it is closely tied to
# the target -- confirm.
df_2017_droped = df_2017.drop(columns=['td', 'rh'])

# Regression target for 2017.
y_2017 = df_2017.loc[:, 'rh']

df_2017_droped

Unnamed: 0,year,mon,day,hour,lat,...,mid,high,iw,ws,wd
0,2017,1,1,0,71.3,...,10,10,85,13.9,60.0
1,2017,1,1,0,69.6,...,-9,-9,-99,3.0,300.0
2,2017,1,1,0,66.0,...,10,10,65,14.4,10.0
3,2017,1,1,0,65.3,...,10,10,80,15.4,360.0
4,2017,1,1,0,64.3,...,-9,-9,-99,-9.9,-9.9
...,...,...,...,...,...,...,...,...,...,...,...
520823,2017,2,0,0,19.3,...,7,10,-99,10.3,40.0
520824,2017,2,1,0,28.0,...,10,10,3,17.5,300.0
520825,2017,2,1,0,19.7,...,7,10,-99,10.3,40.0
520826,2017,2,1,0,18.7,...,7,10,-99,10.3,40.0


In [104]:
# Feature frame for 2016: every column except the target 'rh' and 'td'.
# NOTE(review): 'td' is presumably excluded because it is closely tied to
# the target -- confirm.
df_2016_droped = df_2016.drop(columns=['td', 'rh'])

# Regression target for 2016.
y_2016 = df_2016.loc[:, 'rh']

df_2016_droped

Unnamed: 0,year,mon,day,hour,lat,...,mid,high,iw,ws,wd
0,2016,1,1,0,69.60,...,-9,-9,-99,8.0,240.0
1,2016,1,1,0,69.07,...,-9,-9,-99,-9.9,-9.9
2,2016,1,1,0,66.00,...,10,10,2,12.9,160.0
3,2016,1,1,0,65.30,...,10,10,2,12.9,180.0
4,2016,1,1,0,64.30,...,-9,-9,-99,11.3,160.0
...,...,...,...,...,...,...,...,...,...,...,...
432965,2016,2,9,8,17.70,...,-9,-9,-99,0.0,280.0
432966,2016,2,9,8,17.00,...,-9,-9,-99,4.0,320.0
432967,2016,2,9,8,11.70,...,-9,-9,-99,2.0,110.0
432968,2016,2,9,8,11.70,...,-9,-9,-99,1.0,340.0


In [108]:
# Year-based split: fit on the 2016 data, evaluate on the 2017 data.
x_train, x_test = df_2016_droped, df_2017_droped
y_train, y_test = y_2016, y_2017

# Ordinary least-squares baseline.
model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Hold-out metrics on the 2017 data; RMSE is reported as sqrt(MSE).
MSE = mean_squared_error(y_test, y_pred)
R_2 = r2_score(y_test, y_pred)

print('R**2 score:', R_2)
print('RMSE:', sqrt(MSE))

R**2 score: 0.02121383819789513
RMSE: 0.16964386118400507


In [111]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [114]:
from catboost import CatBoostRegressor, Pool

In [117]:
# Wrap the same 2016 / 2017 split used for the linear model in CatBoost
# pools; no column is declared categorical.
train_pool = Pool(data=x_train, label=y_train, cat_features=[])
test_pool = Pool(data=x_test, label=y_test, cat_features=[])

# Small gradient-boosting model: 5 iterations, depth 2, learning rate 1,
# optimising RMSE.
model_cat = CatBoostRegressor(
    iterations=5,
    depth=2,
    learning_rate=1,
    loss_function='RMSE',
)
model_cat.fit(train_pool)

y_pred_cat = model_cat.predict(test_pool)

# Same hold-out metrics as for the linear-regression baseline.
MSE_catboost = mean_squared_error(y_test, y_pred_cat)
R_2_catboost = r2_score(y_test, y_pred_cat)

print('R**2 catboost score:', R_2_catboost)
print('RMSE catboost:', sqrt(MSE_catboost))

0:	learn: 0.1387367	total: 470ms	remaining: 1.88s
1:	learn: 0.1365723	total: 808ms	remaining: 1.21s
2:	learn: 0.1353183	total: 1.13s	remaining: 755ms
3:	learn: 0.1341798	total: 1.45s	remaining: 363ms
4:	learn: 0.1334626	total: 1.79s	remaining: 0us
R**2 catboost score: 0.021543666017524332
RMSE catboost: 0.16961527578835983
