In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

Dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Source: Data Source :http://data.seoul.go.kr/ SOUTH KOREA PUBLIC HOLIDAYS. URL: publicholidays.go.kr

In [18]:
dataset_cols = ["bike_count", "hour", "temp", "humidity", "wind", "visibility", "dew_pt_temp", "radiation", "rain", "snow", "functional"]
total_df = pd.read_csv('seoul-bike-sharing-demand/SeoulBikeData.csv')
df = total_df.drop(["Date", "Holiday", "Seasons"], axis = 1)

In [19]:
df.columns = dataset_cols
df['functional'] = (df['functional'] == 'Yes').astype('int')
df = df[df["hour"] == 12]
df = df.drop(["hour"], axis=1)

In [20]:
df.head()

Unnamed: 0,bike_count,temp,humidity,wind,visibility,dew_pt_temp,radiation,rain,snow,functional
12,449,1.7,23,1.4,2000,-17.2,1.11,0.0,0.0,1
36,479,4.3,41,1.3,1666,-7.8,1.09,0.0,0.0,1
60,333,5.8,85,1.7,349,3.4,0.43,0.0,0.0,1
84,393,-0.3,38,4.8,1823,-12.9,1.11,0.0,0.0,1
108,321,-2.3,25,0.0,1962,-19.7,0.0,0.0,0.0,1


In [None]:
for label in df.columns[1:]:
    plt.scatter(df[label], df['bike_count'])
    plt.title(label)
    plt.ylabel('Bike count at noon')
    plt.xlabel(label)
    plt.show()

In [22]:
df = df.drop(["wind", "visibility", "functional"], axis=1)

In [23]:
df.head()

Unnamed: 0,bike_count,temp,humidity,dew_pt_temp,radiation,rain,snow
12,449,1.7,23,-17.2,1.11,0.0,0.0
36,479,4.3,41,-7.8,1.09,0.0,0.0
60,333,5.8,85,3.4,0.43,0.0,0.0
84,393,-0.3,38,-12.9,1.11,0.0,0.0
108,321,-2.3,25,-19.7,0.0,0.0,0.0


Train / Valid / Test Dataset

In [37]:
train, val, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])

  return bound(*args, **kwds)


In [39]:
def get_xy(dataframe, y_label, x_labels = None):
    dataframe = copy.deepcopy(dataframe)

    if x_labels is None:
        x = dataframe[[c for c in dataframe.columns if c != y_label]].values
    else:
        if len(x_labels) == 1:
            x = dataframe[x_labels[0]].values .reshape(-1,1)
        else:
            x = dataframe[x_labels].values
    
    y = dataframe[y_label].values.reshape(-1,1)
    data = np.hstack((x,y))

    return data, x, y

In [40]:
_, x_train_temp, y_train_temp = get_xy(train, "bike_count", x_labels=["temp"])
_, x_val_temp, y_val_temp = get_xy(val, "bike_count", x_labels=["temp"])
_, x_test_temp, y_test_temp = get_xy(test, "bike_count", x_labels=["temp"])

In [41]:
temp_reg = LinearRegression()
temp_reg.fit(x_train_temp, y_train_temp)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
print(temp_reg.coef_, temp_reg.intercept_)

[[21.32559694]] [369.82672486]


In [43]:
temp_reg.score(x_test_temp, y_test_temp)

0.2940890807142579

In [None]:
plt.scatter(x_train_temp, y_train_temp, label="Data", color="blue")
x = tf.linspace(-20, 40, 100)
plt.plot(x, temp_reg.predict(np.array(x).reshape(-1,1)), label="Fit", color="red", linewidth=3)
plt.legend()
plt.title("Bike Vs Temp")
plt.ylabel("Number of Bikes")
plt.xlabel("Temp")
plt.show()

Multiple Liner Regresstion

In [57]:
train, val, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])
_, x_train_all, y_train_all = get_xy(train, "bike_count", x_labels=df.columns[1:])
_, x_val_all, y_val_all = get_xy(val, "bike_count", x_labels=df.columns[1:])
_, x_test_all, y_test_all = get_xy(test, "bike_count", x_labels=df.columns[1:])

  return bound(*args, **kwds)


In [58]:
all_reg = LinearRegression()
all_reg.fit(x_train_all, y_train_all)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [59]:
all_reg.score(x_test_all, y_test_all)

0.5299234540982847

Regression with Neural Net

In [60]:
temp_normalizer = tf.keras.layers.Normalization(input_shape = (1,), axis=None)
temp_normalizer.adapt(x_train_temp.reshape(-1))

  super().__init__(**kwargs)


In [61]:
temp_nn_model = tf.keras.Sequential([
    temp_normalizer,
    tf.keras.layers.Dense(1)
])

In [62]:
temp_nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mean_squared_error')

In [65]:
history = temp_nn_model.fit(
    x_train_temp.reshape(-1), y_train_temp,
    verbose=0,
    epochs=1000,
    validation_data=(x_val_temp, y_val_temp)
)

In [68]:
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('MSE')
    plt.grid(True)
    plt.show()


In [None]:
plot_loss(history)

In [None]:
plt.scatter(x_train_temp, y_train_temp, label="Data", color="blue")
x = tf.linspace(-20, 40, 100)
plt.plot(x, temp_nn_model.predict(np.array(x).reshape(-1,1)), label="Fit", color="red", linewidth=3)
plt.legend()
plt.title("Bike Vs Temp")
plt.ylabel("Number of Bikes")
plt.xlabel("Temp")
plt.show()

In [77]:
temp_normalizer = tf.keras.layers.Normalization(input_shape = (1,), axis=None)
temp_normalizer.adapt(x_train_temp.reshape(-1))

nn_model = tf.keras.Sequential([
    temp_normalizer,
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='relu')
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

In [78]:
history = nn_model.fit(
    x_train_temp, y_train_temp,
    verbose=0,
    epochs=100,
    validation_data=(x_val_temp, y_val_temp)
)

In [None]:
plot_loss(history)

In [None]:
plt.scatter(x_train_temp, y_train_temp, label="Data", color="blue")
x = tf.linspace(-20, 40, 100)
plt.plot(x, nn_model.predict(np.array(x).reshape(-1,1)), label="Fit", color="red", linewidth=3)
plt.legend()
plt.title("Bike Vs Temp")
plt.ylabel("Number of Bikes")
plt.xlabel("Temp")
plt.show()

In [90]:
print(x_train_all.shape)


(219, 6)


In [91]:
all_normalizer = tf.keras.layers.Normalization(input_shape = (6,), axis=-1)
all_normalizer.adapt(x_train_all)

In [92]:
nn_all_model = tf.keras.Sequential([
    all_normalizer,
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='relu')
])

nn_all_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

In [95]:
history_all = nn_all_model.fit(
    x_train_all, y_train_all,
    verbose=0,
    epochs=100,
    validation_data=(x_val_all, y_val_all)
)

In [None]:
plot_loss(history_all)

In [99]:
# calculate the MSE for both linear reg and nn
y_pred_lr = all_reg.predict(x_test_all)


In [100]:
y_pred_nn = nn_all_model.predict(x_test_all)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [101]:
def MSE(y_pred, y_real):
    return (np.square(y_pred - y_real)).mean()

In [103]:
MSE(y_pred_lr, y_test_all)

np.float64(77810.97639105254)

In [104]:
MSE(y_pred_nn, y_test_all)

np.float64(83167.94048364749)

In [None]:
ax = plt.axes(aspect="equal")
plt.scatter(y_test_all, y_pred_lr, label='Lin Reg Preds')
plt.scatter(y_test_all, y_pred_nn, label='NN Preds')
plt.xlabel("True Values")
plt.ylabel("Predictions")
lims = [0, 1800]
plt.xlim(lims)
plt.ylim(lims)
plt.legend()
_ = plt.plot(lims, lims, c="red")