# Seoul Bike Sharing Demand

## Feature Engineering

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**Have a look on Seoul Bike Sharing Demand(Data Analysis) file**
- There are no null values

In [37]:
data_frame=pd.read_csv('SeoulBikeData.csv',encoding='unicode_escape')

In [38]:
data_frame.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## Label encoding

In [39]:
for cols in data_frame.columns:
    if data_frame[cols].dtype=='O':
        rank=data_frame[cols].value_counts().index
        mapping={i:k for k,i in enumerate(rank,0)}
        data_frame[cols]=data_frame[cols].map(mapping)

In [40]:
data_frame.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,3,0,0
1,0,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,3,0,0
2,0,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,3,0,0
3,0,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,3,0,0
4,0,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,3,0,0


In [41]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       8760 non-null   int64  
 1   Rented Bike Count          8760 non-null   int64  
 2   Hour                       8760 non-null   int64  
 3   Temperature(°C)            8760 non-null   float64
 4   Humidity(%)                8760 non-null   int64  
 5   Wind speed (m/s)           8760 non-null   float64
 6   Visibility (10m)           8760 non-null   int64  
 7   Dew point temperature(°C)  8760 non-null   float64
 8   Solar Radiation (MJ/m2)    8760 non-null   float64
 9   Rainfall(mm)               8760 non-null   float64
 10  Snowfall (cm)              8760 non-null   float64
 11  Seasons                    8760 non-null   int64  
 12  Holiday                    8760 non-null   int64  
 13  Functioning Day            8760 non-null   int64

**There are no more categorical features in the dataframe**

## Feature Selection

In [42]:
#spliting the data
x=data_frame.drop(['Rented Bike Count'],axis=1)
y=data_frame[['Rented Bike Count']]

In [43]:
from sklearn.tree import ExtraTreeClassifier

In [44]:
model=ExtraTreeClassifier()

In [45]:
model.fit(x,y)

ExtraTreeClassifier()

In [46]:
model.feature_importances_

array([0.10971004, 0.11145831, 0.13603564, 0.12807432, 0.1296706 ,
       0.113508  , 0.13167737, 0.06655023, 0.01262033, 0.0083989 ,
       0.01350955, 0.00703421, 0.0317525 ])

In [47]:
feature_importance=pd.DataFrame({'columns':x.columns,'importance':model.feature_importances_})
feature_importance

Unnamed: 0,columns,importance
0,Date,0.10971
1,Hour,0.111458
2,Temperature(°C),0.136036
3,Humidity(%),0.128074
4,Wind speed (m/s),0.129671
5,Visibility (10m),0.113508
6,Dew point temperature(°C),0.131677
7,Solar Radiation (MJ/m2),0.06655
8,Rainfall(mm),0.01262
9,Snowfall (cm),0.008399


In [49]:
new_imp_data=feature_importance.sort_values(by='importance',ascending=False)

**Considering top 6 features**

In [51]:
features_data=new_imp_data.iloc[:6,:]

In [53]:
final_features=features_data['columns'].values

In [57]:
final_x_input_data=data_frame[final_features]
final_data=pd.concat([final_x_input_data,y],axis=1)
final_data

Unnamed: 0,Temperature(°C),Dew point temperature(°C),Wind speed (m/s),Humidity(%),Visibility (10m),Hour,Rented Bike Count
0,-5.2,-17.6,2.2,37,2000,0,254
1,-5.5,-17.6,0.8,38,2000,1,204
2,-6.0,-17.7,1.0,39,2000,2,173
3,-6.2,-17.6,0.9,40,2000,3,107
4,-6.0,-18.6,2.3,36,2000,4,78
...,...,...,...,...,...,...,...
8755,4.2,-10.3,2.6,34,1894,19,1003
8756,3.4,-9.9,2.3,37,2000,20,764
8757,2.6,-9.9,0.3,39,1968,21,694
8758,2.1,-9.8,1.0,41,1859,22,712


In [58]:
#saving the final data.

In [59]:
final_data.to_csv('final_input_data.csv',index=False)