In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn import set_config
set_config(display='diagram') # Visualize pipelines as diagrams

Data source (globalweather.tamu.edu, CFSR archive):\
[Request Link](https://globalweather.tamu.edu/request/view/36316)\
[Direct download](https://globalweather.tamu.edu/data/cfsr/36316_2020-12-31-02-56-09.zip)

In [None]:
# Load the raw weather records. index_col=False tells pandas not to use the
# first CSV column as the row index.
df = pd.read_csv(
    filepath_or_buffer='weatherdata-1611081.csv',
    index_col=False,
)
df

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Scatter-plot the first `display_range` rows of each raw measurement column,
# one figure per column.
display_range = 5000
for col in ['Max Temperature', 'Min Temperature', 'Precipitation', 'Wind', 'Relative Humidity', 'Solar']:
    # Slice first and size the x axis from the slice: if the file holds fewer
    # than `display_range` rows, a fixed-length x axis would not match y and
    # plt.scatter would raise.
    values = df[col].head(display_range)
    plt.figure(figsize=(20, 5))
    plt.scatter(np.arange(len(values)), values)
    plt.title(label=col)
    plt.xlabel('Row number')
    plt.ylabel(col)

In [None]:
# Mean temperature is taken as the midpoint between the max and min readings.
mean_temp = df['Max Temperature'].add(df['Min Temperature']).div(2)
# Preview the first 10000 values; reset_index exposes the row label as the
# 'index' column and the unnamed series as column 0.
mean_temp.head(10000).reset_index().plot.scatter(x='index', y=0, figsize=[20, 10])

In [None]:
# Store the target column in the frame at position 3.
# DataFrame.insert raises ValueError when the column already exists, so guard
# it to keep this cell safe to re-run.
if 'Mean Temperature' in df.columns:
    df['Mean Temperature'] = mean_temp
else:
    df.insert(loc=3, column='Mean Temperature', value=mean_temp)
df

---
# Pipeline step: derive features from the n<sup>th</sup> previous day (and drop unneeded columns)

In [None]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Replace every kept column by lagged copies of itself.

    After dropping the location/date/target columns, each remaining column
    ``c`` yields ``nth_day_features`` new columns named ``"c 1"``, ``"c 2"``,
    ... where ``"c N"`` holds the value of ``c`` from N rows earlier. The
    original (un-lagged) columns are then removed, so the output contains only
    the lagged columns.

    Parameters
    ----------
    nth_day_features : int, default=0
        Number of lags to derive per column. With 0 the result has no columns.
    """

    def __init__(self, nth_day_features=0):
        self.nth_day_features = nth_day_features

    def fit(self, X_df, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def derive_nth_day_feature(self, X, feature, N):
        """Add column ``"<feature> <N>"`` to ``X`` (in place).

        The new column is ``X[feature]`` shifted down by ``N`` rows, so the
        first ``N`` rows are missing. ``shift`` works by position, hence it is
        independent of the index labels and also copes with ``N`` larger than
        the number of rows.
        """
        col_name = "{} {}".format(feature, N)
        X[col_name] = X[feature].shift(N)

    def transform(self, X, y=None):
        """Return a new frame holding only the lagged columns of ``X``.

        ``X`` itself is not modified. Rows at the top that have no earlier row
        to look back to are back-filled from the next available value.
        """
        X_cl = X.copy()
        # Remember the incoming columns so they can be removed at the end.
        original_cols = list(X_cl.columns)
        X_cl = X_cl.drop(
            columns=['Longitude', 'Latitude', 'Elevation', 'Date', 'Mean Temperature'],
            errors='ignore',
        )
        # Snapshot the base features before new columns are appended.
        base_features = list(X_cl.columns)
        for feature in base_features:
            for N in range(1, self.nth_day_features + 1):
                self.derive_nth_day_feature(X_cl, feature, N)
        X_cl = X_cl.drop(columns=original_cols, errors='ignore')
        # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
        return X_cl.bfill()

In [None]:
# Build the lag-feature pipeline (3 lags per column) and preview its output.
# fit_transform is used instead of a bare transform: calling transform on a
# pipeline that was never fitted triggers a not-fitted warning in newer
# scikit-learn releases.
coladderdropper = make_pipeline(ColAdderDropper(nth_day_features=3))
preprocessed_train_X = coladderdropper.fit_transform(df)
preprocessed_train_X

---

In [None]:
# Features are the lagged measurements; the target is the mean temperature.
# fit_transform avoids calling transform on a pipeline that was never fitted.
X = coladderdropper.fit_transform(df)
y = df['Mean Temperature']

# Hold out 20% of the rows as the test set.
# NOTE(review): train_test_split shuffles by default; for time-ordered rows
# with lag features a chronological split may be more appropriate - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)



In [None]:

from sklearn.metrics import mean_absolute_error, median_absolute_error

# Baseline model: ordinary least squares on the lagged features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the held-out test set.
prediction = regressor.predict(X_test)

# Evaluate the predictions. A regressor's score() is the coefficient of
# determination (R^2), not the explained-variance score, so label it as such.
print("R^2 score: %f" % regressor.score(X_test, y_test))
print("Mean absolute error: %f" % mean_absolute_error(y_test, prediction))
print("Median absolute error: %f" % median_absolute_error(y_test, prediction))

---

# Using Neural Network:

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [None]:
# Show the first five rows of the feature matrix, then of the target.
display(X.head(5), y.head(5))

In [None]:
# Split the training portion into a training set and a validation set.
# test_size=0.25 means a 75:25 split of X_train (not 80:20).

X_train_df, X_val_df, y_train_df, y_val_df = train_test_split(X_train, y_train, test_size=0.25, random_state=0)



In [None]:
# Shapes of the training features and training target, one per line.
print(X_train_df.shape, y_train_df.shape, sep='\n')

In [None]:
# Search over the MLP regularisation strength (alpha), keeping the value with
# the lowest validation error.
# Fixes: `(8,,8)` was a SyntaxError - the network now has two hidden layers of
# 8 units; random_state pins the weight initialisation so re-runs give the
# same errors.
pipeline = make_pipeline(
    StandardScaler(),
    MLPRegressor(hidden_layer_sizes=(8, 8), activation='tanh', solver='adam',
                 max_iter=5000, random_state=0),
)
alphas = [0.01, 0.15, 1, 3, 5, 6, 10]
train_errs = []
val_errs = []
best_val_err = float('inf')
best_alpha = None

for alpha in alphas:
    pipeline.set_params(mlpregressor__alpha=alpha)
    pipeline.fit(X_train_df, y_train_df)
    # Error is (1 - R^2) expressed as a percentage.
    train_err = (1 - pipeline.score(X_train_df, y_train_df)) * 100
    val_err = (1 - pipeline.score(X_val_df, y_val_df)) * 100
    train_errs.append(train_err)
    val_errs.append(val_err)
    if val_err < best_val_err:
        best_val_err = val_err
        best_alpha = alpha
'Done'

In [None]:
# Train vs. validation error for each alpha tried. Points are evenly spaced in
# the order the alphas were evaluated; the ticks show the alpha values.
fig, ax = plt.subplots()
positions = list(range(len(train_errs)))
ax.plot(positions, train_errs, color="red", label="train")
ax.plot(positions, val_errs, color="yellow", label="validation")
ax.set_xticks(positions)
ax.set_xticklabels(alphas[:len(positions)])
ax.set_xlabel('alpha')
ax.set_ylabel('error = (1 - R^2) * 100')
ax.set_title('Error vs. alpha')
ax.legend();

In [None]:
# Side-by-side heatmaps of the train and validation errors, one row per alpha.
train_errs_df = pd.DataFrame(data=np.array(train_errs).reshape(len(alphas), -1),
                             index=alphas)
val_errs_df = pd.DataFrame(data=np.array(val_errs).reshape(len(alphas), -1),
                           index=alphas)
# Share one colour scale across both panels so they are directly comparable.
min_err = min(min(train_errs), min(val_errs))
max_err = max(max(train_errs), max(val_errs))

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
panels = [('train errors', train_errs_df), ('validation errors', val_errs_df)]
for ax, (title, errs_df) in zip(axes, panels):
    # xticklabels=False hides the meaningless "0" column label.
    sns.heatmap(errs_df, vmin=min_err, vmax=max_err, square=True, annot=True,
                cbar=False, fmt='.1f', cmap='Reds', xticklabels=False, ax=ax)
    # The x label used to read 'num_top_titles', which is unrelated to this
    # data; the single column is the error itself.
    ax.set(title=title, xlabel='(1 - R^2) * 100', ylabel='alpha')

In [None]:
# Refit with the best alpha on the full training portion and evaluate on the
# held-out test set.
pipeline.set_params(mlpregressor__alpha=best_alpha)
pipeline.fit(X_train, y_train)
# score() returns the coefficient of determination (R^2), so label it as such.
test_score = pipeline.score(X_test, y_test)
print("R^2 score: %f" % test_score)