In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Performing Data Wrangling Functions

In [2]:
path = os.path.abspath('C:/Users/Balli/Pictures/prediction')
bb_data =pd.read_csv(path +'/bb-data.csv')
bb_data.info()

#FEATURE ENGINEERING

# Converting tripduration hours to datetime
bb_data['date']=pd.to_datetime(bb_data['tripduration'], unit ='s')

# Separating the hour of the day from  datetime in other to determine lunch our
# And also separating the date from date timee
bb_data['hour_of_day'] = bb_data['date'].dt.hour
bb_data['day_of_week'] = bb_data['date'].dt.dayofweek
bb_data['month'] = bb_data['date'].dt.month
bb_data['year'] = bb_data['date'].dt.year




# Handling possible outlier using the inter quartile range
Q1=bb_data['temp'].quantile(0.25)
Q3 = bb_data['temp'].quantile(0.75)
IQR = Q3 - Q1

# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Calculate the mean
t_mean = bb_data['temp'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_temp']=bb_data['temp'].apply(lambda x: t_mean if x < lower_bound or x > upper_bound else x)


# Handling possible outlier using the inter quartile range
hQ1=bb_data['humidity'].quantile(0.25)
hQ3 = bb_data['humidity'].quantile(0.75)
hIQR = hQ3 - hQ1

# Step 3: Calculate Lower Bound and Upper Bound
hlower_bound = hQ1 - (1.5 * hIQR)
hupper_bound = hQ3 + (1.5 * hIQR)

# Calculate the mean
h_mean = bb_data['humidity'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_humidity']=bb_data['humidity'].apply(lambda x: h_mean if x < hlower_bound or x > hupper_bound else x)


# Handling possible outlier using the inter quartile range
wQ1=bb_data['windspeed'].quantile(0.25)
wQ3 = bb_data['windspeed'].quantile(0.75)
wIQR = wQ3 - wQ1

# Step 3: Calculate Lower Bound and Upper Bound
wlower_bound = wQ1 - (1.5 * wIQR)
wupper_bound = wQ3 + (1.5 * wIQR)

# Calculate the mean
w_mean = bb_data['windspeed'].mean()
# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_windspeed']=bb_data['windspeed'].apply(lambda x: w_mean if x < wlower_bound or x > wupper_bound else x)

fQ1=bb_data['feelslike'].quantile(0.25)
fQ3 = bb_data['feelslike'].quantile(0.75)
fIQR = fQ3 - fQ1

# Step 3: Calculate Lower Bound and Upper Bound
flower_bound = fQ1 - (1.5 * fIQR)
fupper_bound = fQ3 + (1.5 * fIQR)

# Calculate the mean
f_mean = bb_data['feelslike'].mean()
# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_feelslike']=bb_data['feelslike'].apply(lambda x: f_mean if x < flower_bound or x > fupper_bound else x)

dQ1=bb_data['dew'].quantile(0.25)
dQ3 = bb_data['dew'].quantile(0.75)
dIQR = dQ3 - dQ1

# Step 3: Calculate Lower Bound and Upper Bound
dlower_bound = dQ1 - (1.5 * dIQR)
dupper_bound = dQ3 + (1.5 * dIQR)

# Calculate the mean
d_mean = bb_data['dew'].mean()
# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_dew']=bb_data['dew'].apply(lambda x: d_mean if x < dlower_bound or x > dupper_bound else x)

# Handling possible outlier using the inter quartile range
wdQ1=bb_data['winddir'].quantile(0.25)
wdQ3 = bb_data['winddir'].quantile(0.75)
wdIQR = wdQ3 - wdQ1

# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = wdQ1 - (1.5 * wdIQR)
upper_bound = wdQ3 + (1.5 * wdIQR)

# Calculate the mean
wd_mean = bb_data['winddir'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_winddir']=bb_data['winddir'].apply(lambda x: wd_mean if x < lower_bound or x > upper_bound else x)

# Handling possible outlier using the inter quartile range
vQ1=bb_data['visibility'].quantile(0.25)
vQ3 = bb_data['visibility'].quantile(0.75)
vIQR = vQ3 - vQ1

# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = vQ1 - (1.5 * vIQR)
upper_bound = vQ3 + (1.5 * vIQR)

# Calculate the mean
v_mean = bb_data['visibility'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_visibility']=bb_data['visibility'].apply(lambda x: v_mean if x < lower_bound or x > upper_bound else x)

# Handling possible outlier using the inter quartile range
cQ1=bb_data['cloudcover'].quantile(0.25)
cQ3 = bb_data['cloudcover'].quantile(0.75)
cIQR = cQ3 - cQ1

# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = cQ1 - (1.5 * cIQR)
upper_bound = cQ3 + (1.5 * cIQR)

# Calculate the mean
c_mean = bb_data['cloudcover'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_cloudcover']=bb_data['cloudcover'].apply(lambda x: c_mean if x < lower_bound or x > upper_bound else x)

# Handling possible outlier using the inter quartile range
sQ1=bb_data['solarradiation'].quantile(0.25)
sQ3 = bb_data['solarradiation'].quantile(0.75)
sIQR = sQ3 - sQ1

# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = sQ1 - (1.5 * sIQR)
upper_bound = sQ3 + (1.5 * sIQR)

# Calculate the mean
s_mean = bb_data['solarradiation'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_solarradiation']=bb_data['solarradiation'].apply(lambda x: s_mean if x < lower_bound or x > upper_bound else x)

# Handling possible outlier using the inter quartile range
s1Q1=bb_data['solarenergy'].quantile(0.25)
s1Q3 = bb_data['solarenergy'].quantile(0.75)
s1IQR = s1Q3 - s1Q1
# Step 3: Calculate Lower Bound and Upper Bound
lower_bound = s1Q1 - (1.5 * s1IQR)
upper_bound = s1Q3 + (1.5 * s1IQR)

# Calculate the mean
s1_mean = bb_data['solarenergy'].mean()

# Replace an number that falls below the lower bound and is above the upper bound with the mean of the data
bb_data['c_solarenergy']=bb_data['solarenergy'].apply(lambda x: s1_mean if x < lower_bound or x > upper_bound else x)


  bb_data =pd.read_csv(path +'/bb-data.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 38 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1048575 non-null  int64  
 1   tripduration             1048575 non-null  int64  
 2   starttime                1048575 non-null  object 
 3   stoptime                 1048575 non-null  object 
 4   start station id         1048575 non-null  int64  
 5   start station name       1048575 non-null  object 
 6   start station latitude   1048575 non-null  float64
 7   start station longitude  1048575 non-null  float64
 8   end station id           1048575 non-null  int64  
 9   end station name         1048575 non-null  object 
 10  end station latitude     1048575 non-null  float64
 11  end station longitude    1048575 non-null  float64
 12  bikeid                   1048575 non-null  int64  
 13  usertype                 1048575 non-null 

# Feature Selection and Engineering


In [3]:
#Creating a dataframe  of feature important to our analysis(Feature Engineering)
feature_dataset = bb_data[['month','day_of_week','hour_of_day','c_temp','c_humidity','c_windspeed',
                           'c_feelslike','c_dew', 'snow','c_winddir','c_visibility','c_solarradiation',
                           'c_solarenergy','c_cloudcover','bikeid']]

# # NOW WE CAN DELVE INTO FEATURE SELECTION WHICH MEANS SELECTING ONLY THE COLUMNS THAT ARE 
# # IMPORTANT TO SOLVING OUR PROBLEM
# ''' WEATHER PREDICTOR VARIABLE= TEMPERATURE, HUMIDITY, WINDSPEED, DEW, VISIBILITY, SOLARENERGY
#     DATE PREDICTOR VARIABLE = MONTH,DAY OF WEEK, HOUR OF DAY
#     DEPENDENT VARIABLE = BIKEID'''

#Final dataset for our prediction
pred_dataset = feature_dataset.groupby(['month','day_of_week','hour_of_day','c_temp','c_feelslike','c_humidity',
'c_dew','c_windspeed','snow','c_winddir','c_visibility','c_cloudcover','c_solarradiation','c_solarenergy'] )['bikeid'].count().reset_index()

grouped_dataset_model= pred_dataset.sort_values('bikeid', ascending=False)
print(grouped_dataset_model)


      month  day_of_week  hour_of_day     c_temp  c_feelslike  c_humidity  \
617       1            3            0  10.706801         21.8       16.16   
804       1            3            0  21.100000         21.1       18.53   
775       1            3            0  17.800000         17.8       55.77   
603       1            3            0  10.600000         10.6       77.66   
781       1            3            0  18.200000         18.2       52.95   
...     ...          ...          ...        ...          ...         ...   
2207      1            3            5   2.200000         -0.7       39.79   
2206      1            3            5   1.400000         -3.0       34.88   
2205      1            3            5   1.400000         -3.7       38.34   
2204      1            3            5   0.800000         -4.0       34.66   
3345      2            6           22  20.000000         20.0       20.49   

      c_dew  c_windspeed  snow  c_winddir  c_visibility  c_cloudcover  \
61


# FEATURE EXTRACTION ND SETTING TARGET VARIABLE.
and also Scaling Selected Feature to Permeat Normal Distribution

In [4]:
# NOTE: X = independent variable[month, day of week, hour_of_Day,c_temp,c_humidity,c_windspeed, c_dew,c_visibility,c_solarenergy]
#       Y = dependent variable  [bike_usage]

ind_v = pred_dataset[['month','day_of_week','hour_of_day','c_temp', 'c_humidity','c_windspeed','c_dew',
'c_visibility','c_solarenergy']]

# USING THE STANDARD SCALER TO NORMALIZE THE INDEPENDENT VARIABLES TO AVOID SKEWED DATA
scaler = StandardScaler()
scaled_ind_v= scaler.fit_transform(ind_v)

X = scaled_ind_v

Y = pred_dataset['bikeid']

# NOW SPLITTING OUR DATA IN TWO SET FOR TRAINING AND TESTING PURPOSE

In [5]:
# TRAINING SET: Set of data our Model is going to learn from 
# TESTING SET : Set of data to test how well our model have learnt before deploying

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# MODELLING
I am using the NETELASTIC MODEL to fit our data to avoid possible overfitting, thereby enforcing regularization of our model

In [6]:

model = ElasticNet(alpha=1.0, l1_ratio=0.5)
# Note alpha =1.0 indicates the strength of the regularization on our model 
# l1_ratio = 0.5 indicates equal proportion of the laso and ridge regularization models

#Train Model
model.fit(X_train,y_train)

#Make predictions

y_pred = model.predict(X_test)

# Get the coefficients of the independent variables
coefficients = model.coef_

# Display the coefficients
print("Coefficients of the independent variables:")
for feature, coef in zip(ind_v.columns, coefficients):
    print(f"{feature} : {coef}")

prediction = pd.DataFrame(y_pred, columns =['Bike Usage Prediction'])
print(prediction)

Coefficients of the independent variables:
month : -3.2214826915572554
day_of_week : -19.666807133301422
hour_of_day : -171.6627702105404
c_temp : 28.591042453915584
c_humidity : -24.402022679137048
c_windspeed : -4.768038490246373
c_dew : 9.533176591336938
c_visibility : 16.81905214189697
c_solarenergy : 20.760010964530814
     Bike Usage Prediction
0               412.995497
1               200.288265
2               387.539629
3               414.043782
4               425.806660
..                     ...
665             420.650073
666             371.532526
667             456.677296
668             152.472039
669             232.925629

[670 rows x 1 columns]


# REAL-LIFE PREDICTION
ANSWER TO QUESTION: PREDICTING THE NUMBER OF BIKES NEEDED AT LUNCH USING WEATHER AND DATE AS INDEPENDENT VARIABLE
AND KNOWING THE NUMBER OF CHARGING STATIONS NEEDED
FRIST LET CONSTRUCT A REAL-LIFE DATA TO TEST OUR MODEL I WILL BE USING TODAY


In [7]:
month = 8
day_of_week =2
hour_of_day = 12
temperature = -22
humidity = 38
wind_speed =16
dew = 4
visibility = 14
solarenery = 1.5

new_data =[[month,day_of_week,hour_of_day,temperature,humidity,wind_speed, dew,
            visibility,solarenery]]
scaled_new_data =scaler.fit_transform(new_data)
y_pred_r = model.predict(scaled_new_data)
print(y_pred_r)

[314.1573071]


# MODEL EVALUATION

In [8]:
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 481.5204175718372
