In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sb 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn import metrics 
from sklearn.svm import SVC 
from xgboost import XGBRegressor 
from sklearn.linear_model import LinearRegression, Lasso, Ridge 
from sklearn.ensemble import RandomForestRegressor 
  
import warnings 
warnings.filterwarnings('ignore')
# Load the raw Yulu rental data. Forward slashes are accepted by pandas on
# every OS, whereas the previous backslash-separated path only resolved on
# Windows.
df = pd.read_csv('data/Yulu.csv')
# Preview the first rows instead of rendering the whole frame.
df.head()

# Preprocessing

In [None]:
# Summarise the raw frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

In [None]:
# Discard every row that has at least one missing value, then re-check
# the non-null counts to confirm nothing is left to impute.
df = df.dropna(axis=0, how='any')
df.info()

In [None]:
# Count the distinct values in each column.
df.nunique()

In [None]:
# Split the timestamp into calendar components and discard the raw column.
# The new columns are appended in the order day, Month, Year, Weekday, Hour.
timestamps = pd.to_datetime(df['datetime'])
df = (
    df.assign(
        day=timestamps.dt.day,
        Month=timestamps.dt.month,
        Year=timestamps.dt.year,
        Weekday=timestamps.dt.weekday,
        Hour=timestamps.dt.hour,
    )
    .drop(columns='datetime')
)
df.info()

In [None]:
# Remove the 'casual' and 'registered' columns before modelling 'count'.
# NOTE(review): presumably these two sum to 'count' and would leak the
# target - confirm against the dataset description.
df = df.drop(columns=['casual', 'registered'])
df.head()

In [None]:
# Count the distinct values per column again, now that the datetime parts
# have been added and the dropped columns are gone.
df.nunique()

# Visualization

In [None]:
# Mean rental count as a function of each continuous weather variable.
features = ['temp', 'windspeed', 'humidity']

# Build the 3x1 grid up front and draw on explicit Axes. The previous
# `plt.subplots(figsize=...)` call created one full-figure Axes that the later
# `plt.subplot(...)` calls drew over; recent Matplotlib releases no longer
# remove that overlapped Axes, so its empty frame showed through.
fig, axes = plt.subplots(len(features), 1, figsize=(15, 15))
for ax, col in zip(axes, features):
    # Select 'count' before aggregating so only the needed column is averaged.
    df.groupby(col)['count'].mean().plot(ax=ax)
    ax.set_title(f'Mean count by {col}')
    ax.set_ylabel('mean count')
fig.tight_layout()
plt.show()

In [None]:
# Mean rental count for each weekday, month and hour of day.
features = ['Weekday', 'Month', 'Hour']

# Create the grid once and pass each Axes explicitly. Mixing
# `plt.subplots(figsize=...)` with `plt.subplot(...)` left a stray full-figure
# Axes behind the panels on recent Matplotlib releases.
fig, axes = plt.subplots(len(features), 1, figsize=(15, 15))
for ax, col in zip(axes, features):
    # Aggregate only the 'count' column instead of averaging every column.
    df.groupby(col)['count'].mean().plot(kind='bar', ax=ax)
    ax.set_title(f'Mean count by {col}')
    ax.set_ylabel('mean count')
fig.tight_layout()
plt.show()

In [None]:
# Share of the mean rental count across each categorical flag.
features = ['workingday', 'holiday', 'season']
# Human-readable wedge labels for the integer category codes.
label_mappings = {
    'workingday': {0: 'Non-Working Day', 1: 'Working Day'},
    'holiday': {0: 'Non-Holiday', 1: 'Holiday'},
    'season': {1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'}
}

# Create the grid once and draw on explicit Axes. The earlier
# `plt.subplots(figsize=...)` + `plt.subplot(...)` combination left an extra
# full-figure Axes underneath the pies on recent Matplotlib releases.
fig, axes = plt.subplots(len(features), 1, figsize=(15, 15))
for ax, col in zip(axes, features):
    labeled_data = df[col].map(label_mappings[col])
    # Aggregate only 'count' rather than averaging every column and discarding the rest.
    df.groupby(labeled_data)['count'].mean().plot(kind='pie', ax=ax)
    ax.set_title(f'Mean count by {col}')
plt.show()

In [None]:
# Box plots of the continuous columns, used to eyeball outliers.
features = ['temp', 'windspeed', 'atemp', 'humidity', 'count']

# Create the 2x3 grid once and draw on explicit Axes. The previous
# `plt.subplots(figsize=...)` call left a full-figure Axes behind the panels
# on recent Matplotlib releases.
fig, axes = plt.subplots(2, 3, figsize=(15, 15))
panels = axes.ravel()
for ax, col in zip(panels, features):
    sb.boxplot(df[col], ax=ax)
    ax.set_title(col)
# Five features on a six-cell grid: hide the unused cell.
for ax in panels[len(features):]:
    ax.set_visible(False)
plt.show()

In [None]:
# Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for the columns whose box plots
# showed outliers.
L = ['windspeed', 'humidity', 'count']
# Keep the computed fences so they can be reused rather than copied by hand
# from the printed output.
outlier_limits = {}
for col in L:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    iqr = Q3 - Q1
    lowlim = Q1 - 1.5 * iqr
    upplim = Q3 + 1.5 * iqr
    outlier_limits[col] = (lowlim, upplim)
    print(f"lowlim of {col} = {lowlim}")
    print(f"upplim of {col} = {upplim}")


In [None]:
# Count the rows where at least one monitored column falls strictly outside
# its (lower, upper) fence. The bounds are the hard-coded values used for
# capping further down.
fences = {
    'windspeed': (-7.993100000000002, 31.992500000000003),
    'humidity': (2.0, 122.0),
    'count': (-321.0, 647.0),
}

is_outlier = pd.Series(False, index=df.index)
for column, (lower, upper) in fences.items():
    is_outlier = is_outlier | (df[column] > upper) | (df[column] < lower)

num = int(is_outlier.sum())
print(num)

In [None]:
!pip install feature_engine
from feature_engine.outliers import ArbitraryOutlierCapper
plt.subplots(figsize=(15,5)) 
for i, col in enumerate(L):
    if i==0:
        low = -7.993100000000002
        high=31.992500000000003
    elif i==1:
        low=2.0
        high=122.0
    else:
        low=-321.0 
        high=647.0
        
    arb=ArbitraryOutlierCapper(min_capping_dict={col:low},max_capping_dict={col:high})
    df[[col]]=arb.fit_transform(df[[col]])
    plt.subplot(1, 3, i + 1) 
    sb.boxplot(df[col]) 
    plt.title(col)
plt.show()

In [None]:
# Flag highly collinear feature pairs. The absolute correlation is compared
# with the threshold so strong negative relationships (r < -0.8) are flagged
# too; the plain `corr() > 0.8` test only caught positive ones.
sb.heatmap(df.corr().abs() > 0.8,
           annot=True,
           cbar=False)
plt.title('|correlation| > 0.8')
plt.show()

In [None]:
# Drop the 'season' and 'atemp' columns ahead of modelling.
# NOTE(review): presumably chosen from the correlation heatmap as redundant
# with other features - confirm.
df = df.drop(columns=['season', 'atemp'])

# Model

In [None]:
# Separate the predictors from the 'count' target, then hold out 20% of the
# rows for validation with a fixed seed so the split is reproducible.
target = df['count']
features = df.drop(columns=['count'])

xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.2, random_state=22
)
xtrain.shape, xtest.shape

In [None]:
from sklearn.metrics import mean_absolute_error as mae

# Candidate regressors, compared on mean absolute error. The random forest is
# seeded so repeated runs report the same scores.
models = [LinearRegression(), XGBRegressor(),
          Lasso(), RandomForestRegressor(random_state=22), Ridge()]

# Fit the scaler on the training split only, then apply the same
# transformation to the held-out split.
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xval = scaler.transform(xtest)

for model in models:
    model.fit(xtrain, ytrain)

    print(f'{model} : ')

    train_preds = model.predict(xtrain)
    print('Training Error : ', mae(ytrain, train_preds))

    # Predict on the scaled hold-out set. The models were trained on scaled
    # features, so scoring the raw `xtest` gave meaningless validation errors.
    val_preds = model.predict(xval)
    print('Validation Error : ', mae(ytest, val_preds))
    print()

   