In [1]:
import requests, json
import openmeteo_requests
import requests_cache
from retry_requests import retry
from datetime import datetime, timedelta
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt


cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

In [2]:
def get_data(latitude, longitude, startdate, enddate):
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "daily": "temperature_2m_mean",
        "timezone": "Asia/Singapore",
        "start_date": startdate,
        "end_date" : enddate
    }
    data = openmeteo.weather_api(url, params=params)[0].Daily()
    daily_data = {"date": pd.date_range(
        start = pd.to_datetime(data.Time(), unit = "s"),
        end = pd.to_datetime(data.TimeEnd(), unit = "s"),
        freq = pd.Timedelta(seconds = data.Interval()),
        inclusive = "left"
    )}
    daily_data["mean_temp"] = data.Variables(0).ValuesAsNumpy()
    df = pd.DataFrame(data = daily_data)
    return df

In [3]:
data = get_data(latitude="23.7115253", longitude="90.4111451", startdate="2020-01-01", enddate="2023-11-12").dropna()
data['date'] = pd.to_datetime(data['date'])
data['dayofyear'] = data['date'].dt.dayofyear


print(data.shape)
data.head(5)

(1410, 3)


Unnamed: 0,date,mean_temp,dayofyear
0,2019-12-31 16:00:00,19.520666,365
1,2020-01-01 16:00:00,19.993582,1
2,2020-01-02 16:00:00,18.006083,2
3,2020-01-03 16:00:00,17.570665,3
4,2020-01-04 16:00:00,16.972748,4


In [4]:
def get_model_acc(model_object, X_train, y_train, X_test, y_test):
    model_object.fit(X_train, y_train)
    predictions1 = model_object.predict(X_test)
    mse = mean_squared_error(y_test, predictions1)
    return model_object, mse

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data[['dayofyear']], data['mean_temp'], test_size=0.2, random_state=42)

model1, loss1 = get_model_acc(model_object=LinearRegression(), 
                              X_train=X_train, 
                              y_train=y_train, 
                              X_test=X_test, 
                              y_test=y_test)

model2, loss2 = get_model_acc(model_object=RandomForestRegressor(n_estimators=200, random_state=42), 
                              X_train=X_train, 
                              y_train=y_train, 
                              X_test=X_test, 
                              y_test=y_test)

model3, loss3 = get_model_acc(model_object=DecisionTreeRegressor(random_state=42), 
                              X_train=X_train, 
                              y_train=y_train, 
                              X_test=X_test, 
                              y_test=y_test)

model4, loss4 = get_model_acc(model_object=XGBRegressor(random_state=42, max_depth=3, eta=0.1), 
                              X_train=X_train, 
                              y_train=y_train, 
                              X_test=X_test, 
                              y_test=y_test)

print(f"Loss for Linear Regressor is {loss1}")
print(f"Loss for RandomForest Regressor is {loss2}")
print(f"Loss for DecisionTree Regressor is {loss3}")
print(f"Loss for XGB Regressor is {loss4}")


Loss for Linear Regressor is 16.835897597973187
Loss for RandomForest Regressor is 2.5878580441617753
Loss for DecisionTree Regressor is 2.8523932197515696
Loss for XGB Regressor is 1.8677047491073608


In [6]:
## Hyper Parameter Optimization
base_score=[0.25,0.5,0.75,1]


n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'base_score':base_score
    }

# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=XGBRegressor(),
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

random_cv.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[CV 2/5] END base_score=0.5, learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=500;, score=(train=-0.909, test=-1.166) total time=   0.1s
[CV 1/5] END base_score=0.5, learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=500;, score=(train=-0.929, test=-1.147) total time=   0.1s
[CV 5/5] END base_score=0.5, learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=500;, score=(train=-0.930, test=-1.054) total time=   0.1s
[CV 4/5] END base_score=0.5, learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=500;, score=(train=-0.902, test=-1.183) total time=   0.1s
[CV 3/5] END base_score=0.5, learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=500;, score=(train=-0.908, test=-1.167) total time=   0.2s
[CV 1/5] END base_score=0.25, learning_rate=0.2, max_depth=5, min_child_weight=3, n_estimators=1500;, score=(train=-0.915, test=-1.228) total time=   0.4s
[CV 3/5] END base_score=0.25, learning_rate=0.2, max_depth=5, min_child_weight=3

In [7]:
print(random_cv.best_estimator_)

XGBRegressor(base_score=0.25, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=4, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [8]:
model = XGBRegressor(base_score=0.25, booster=None, callbacks=None,
                     colsample_bylevel=None, colsample_bynode=None,
                     colsample_bytree=None, device=None, early_stopping_rounds=None,
                     enable_categorical=False, eval_metric=None, feature_types=None,
                     gamma=None, grow_policy=None, importance_type=None,
                     interaction_constraints=None, learning_rate=0.1, max_bin=None,
                     max_cat_threshold=None, max_cat_to_onehot=None,
                     max_delta_step=None, max_depth=3, max_leaves=None,
                     min_child_weight=4, monotone_constraints=None,
                     multi_strategy=None, n_estimators=100, n_jobs=None,
                     num_parallel_tree=None, random_state=None)

get_model_acc(model_object=model, 
              X_train=X_train, 
              y_train=y_train, 
              X_test=X_test, 
              y_test=y_test)

(XGBRegressor(base_score=0.25, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 1.8749692)

In [9]:
## Prediction
new_date = pd.to_datetime('2023-11-30')
day_of_year = new_date.dayofyear
predicted_temperature = model.predict([[day_of_year]])
print(f'Predicted Avarage Temperature for {str(new_date).split(" ")[0]} : {predicted_temperature[0]}')

Predicted Avarage Temperature for 2023-11-30 : 21.576915740966797


In [10]:
## Save the model
model_filename = 'xgb_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
print(f'Model saved as {model_filename}')

Model saved as xgb_model.pkl
