In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

# Sources and Credits

Data source: http://data.seoul.go.kr/
South Korea public holidays: publicholidays.go.kr


Data Set Information:

Rental bikes have been introduced in many cities to improve mobility and convenience. It is important to make rental bikes available and accessible to the public at the right time, as this reduces waiting time. Providing the city with a stable supply of rental bikes therefore becomes a major concern, and the crucial part is predicting the number of bikes required at each hour.
The dataset contains weather information (Temperature, Humidity, Windspeed, Visibility, Dewpoint, Solar radiation, Snowfall, Rainfall), the number of bikes rented per hour and date information.


Attribute Information:

Date : year-month-day
Rented Bike count - Count of bikes rented at each hour
Hour - Hour of the day
Temperature-Temperature in Celsius
Humidity - %
Windspeed - m/s
Visibility - 10m
Dew point temperature - Celsius
Solar radiation - MJ/m2
Rainfall - mm
Snowfall - cm
Seasons - Winter, Spring, Summer, Autumn
Holiday - Holiday/No holiday
Functional Day - NoFunc (non-functional hours), Fun (functional hours). Note: the code below encodes this column by comparing it against the string "Yes".

In [2]:
# Short column names for the frame, in the order the columns appear
# once Date, Holiday and Seasons have been dropped.
dataset_cols = [
    "bike_count",
    "hour",
    "temp",
    "humidity",
    "wind",
    "visibility",
    "dew_pt_temp",
    "radiation",
    "rain",
    "snow",
    "functional",
]

In [3]:
# Load the raw CSV and discard the columns this analysis does not use
df = pd.read_csv("SeoulBikeData.csv").drop(columns=["Date", "Holiday", "Seasons"])

In [None]:
# Replace the original headers with the short names defined in dataset_cols
df.columns = dataset_cols
# Encode the functional flag as 1 where the value is "Yes" and 0 otherwise
df["functional"] = df["functional"].eq("Yes").astype(int)
df.head()

In [5]:
# Keep only the rows recorded at hour 12 (noon); the hour column is then
# constant, so drop it.
df = df[df["hour"] == 12].drop(columns=["hour"])
df.head()

Unnamed: 0,bike_count,temp,humidity,wind,visibility,dew_pt_temp,radiation,rain,snow,functional
12,449,1.7,23,1.4,2000,-17.2,1.11,0.0,0.0,1
36,479,4.3,41,1.3,1666,-7.8,1.09,0.0,0.0,1
60,333,5.8,85,1.7,349,3.4,0.43,0.0,0.0,1
84,393,-0.3,38,4.8,1823,-12.9,1.11,0.0,0.0,1
108,321,-2.3,25,0.0,1962,-19.7,0.0,0.0,0.0,1


In [None]:
# Scatter every feature (all columns after the first) against the bike count,
# one figure per feature
for feature in df.columns[1:]:
    fig, ax = plt.subplots()
    ax.scatter(df[feature], df["bike_count"])
    ax.set_title(feature)
    ax.set_ylabel("Bike count at noon")
    ax.set_xlabel(feature)
    plt.show()

# Training, Validation and Test Datasets

In [7]:
# Shuffle the rows once with a fixed seed so the split (and every metric
# computed from it) is reproducible on Restart & Run All, then cut the
# shuffled frame by position into 60% train / 20% validation / 20% test.
# Positional slicing keeps the same cut points as the earlier np.split call.
shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(0.6 * len(shuffled)), int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [37]:
# Writing a function to get xy
def get_xy(dataframe, y_label, x_labels=None):
    """Split a DataFrame into a feature matrix and a target column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data. It is not modified, and the returned arrays are copies.
    y_label : str
        Name of the target column.
    x_labels : str or sequence of str, optional
        Feature column name(s). A single feature may be given as a plain
        string or as a one-element sequence. When None, every column except
        ``y_label`` is used.

    Returns
    -------
    data : numpy.ndarray
        ``X`` and ``y`` stacked horizontally, with the target in the last column.
    X : numpy.ndarray
        2-D feature matrix with one column per feature.
    y : numpy.ndarray
        Target values as a 2-D column, shape (n_rows, 1).

    Raises
    ------
    KeyError
        If ``y_label`` or a requested feature column is not in ``dataframe``.
    """
    if x_labels is None:
        feature_cols = [c for c in dataframe.columns if c != y_label]
    elif isinstance(x_labels, str):
        # A bare string would otherwise be measured by its character count and
        # select a 1-D column, which cannot be stacked next to the 2-D target.
        feature_cols = [x_labels]
    else:
        feature_cols = list(x_labels)

    # Selecting with a list always gives a 2-D array, even for one column, so
    # no reshape special case is needed. copy=True keeps the outputs
    # independent of the caller's frame without deep-copying the whole frame.
    X = dataframe[feature_cols].to_numpy(copy=True)
    y = dataframe[y_label].to_numpy(copy=True).reshape(-1, 1)
    data = np.hstack((X, y))

    return data, X, y

In [18]:
# Build temperature-only feature/target arrays for the train, validation and
# test splits in one pass
(_, X_train_temp, y_train_temp), (_, X_val_temp, y_val_temp), (_, X_test_temp, y_test_temp) = (
    get_xy(split, "bike_count", x_labels=["temp"]) for split in (train, val, test)
)

In [None]:
# Fit a linear regression of bike count on temperature alone
temp_reg = LinearRegression().fit(X_train_temp, y_train_temp)
temp_reg

In [28]:
# Print the fitted coefficient (slope for temp) and the intercept of the single-feature model
print(temp_reg.coef_, temp_reg.intercept_)
# Measure goodness of fit with the R^2 value from .score, evaluated here on the training data
temp_reg.score(X_train_temp, y_train_temp)
# In the recorded run R^2 is about 0.34. A value of 0 would mean no linear association
# between temperature and bike count, so 0.34 shows some association, but the fit is not very good.

# In short: the closer R^2 is to 1, the more of the variation in bike count the model explains.

[[20.18040566]] [374.25113507]


0.341875045816599

In [None]:
# Plot the training points and overlay the fitted regression line
plt.scatter(X_train_temp, y_train_temp, label="Data", color="blue")
# 100 evenly spaced temperatures from -20 to 40 for drawing the line. NumPy
# builds the grid directly, so it no longer round-trips through a TensorFlow tensor.
x = np.linspace(-20, 40, 100)
# predict expects a 2-D (n_samples, 1) input, hence the reshape
plt.plot(x, temp_reg.predict(x.reshape(-1, 1)), label="Fit", color="red", linewidth=3)
plt.legend()
plt.title("Bikes vs Temp")
plt.ylabel("Number of bikes")
plt.xlabel("Temp")
plt.show()

# Multiple Linear Regression

In [39]:
# Build feature/target arrays for each split, using every column after the
# first (bike_count) as a feature
(_, X_train_all, y_train_all), (_, X_val_all, y_val_all), (_, X_test_all, y_test_all) = (
    get_xy(split, "bike_count", x_labels=df.columns[1:]) for split in (train, val, test)
)

In [40]:
# Fit a linear regression of bike count on all feature columns at once
all_reg = LinearRegression().fit(X_train_all, y_train_all)
all_reg

In [41]:
# Print the fitted coefficients (one per feature column, in the order of df.columns[1:]) and the intercept
print(all_reg.coef_, all_reg.intercept_)
# R^2 of the multi-feature model on the training data; as the last expression it is shown as the cell output
all_reg.score(X_train_all, y_train_all)

[[-2.97544111e+01 -1.64778125e+01 -4.13868571e+00 -2.23346771e-02
   5.06246550e+01  1.47457314e+02 -1.39235714e+01 -9.46530877e+00
   8.29046526e+02]] [732.17441871]


0.6166037293281997

# Regression with Neural Networks