In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)


from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification
 

In [None]:
# Read the raw train/test CSVs and work on copies so the originals stay untouched.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")
df, test_df = train.copy(), test.copy()
df.head()


In [None]:
# List the distinct column names of the training frame.
df.columns.unique()


###### A SHORT DESCRIPTION OF THE FEATURES.

datetime - hourly date + timestamp  

season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather -

1: Clear, Few clouds, Partly cloudy

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 

4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

temp - temperature in Celsius

atemp - "feels like" temperature in Celsius


humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals

###### ALL THE FEATURES EXCEPT datetime ARE NUMERIC, AND THE TARGET VARIABLE THAT WE HAVE TO PREDICT IS THE count VARIABLE. HENCE THIS IS A TYPICAL EXAMPLE OF A REGRESSION PROBLEM, AS THE count VARIABLE VARIES CONTINUOUSLY.

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Visual check for missing values using missingno's matrix plot.
msno.matrix(df)


In [None]:
# How many rows fall into each season code.
df["season"].value_counts()


In [None]:
# Bar chart of the number of rows per season code.
sns.countplot(data=df, x="season")


In [None]:
# holiday
# Show the raw counts explicitly: only the last expression of a cell is
# displayed, so a bare `value_counts()` before the plot would be discarded.
display(df["holiday"].value_counts())
# `factorplot` and its `size` argument were removed from seaborn; `catplot`
# with `height` is the replacement (already used elsewhere in this notebook).
sns.catplot(x="holiday", data=df, kind="count", height=5, aspect=1)

In [None]:
# Number of hourly records on working vs. non-working days
# (the majority of records fall on a working day).
# `display` is needed because only a cell's last expression is shown.
display(df["workingday"].value_counts())
# `catplot`/`height` replace the removed `factorplot`/`size` API.
sns.catplot(x="workingday", data=df, kind="count", height=5, aspect=1)

In [None]:
# Number of hourly records per weather category (this plots `weather`, not `season`).
sns.countplot(x="weather", data=df)
# 1 -> clear / few clouds / partly cloudy
# 2 -> mist / cloudy
# 3 -> light snow / light rain
# 4 -> heavy rain / snow + fog


###### NOW WE CAN ALSO SEE THE DISTRIBUTION OF THE CONTINUOUS VARIABLES.

In [None]:
# Summary statistics for the numeric columns.
df.describe()


In [None]:
# Box plots of the continuous columns, drawn on a 10x10 inch figure.
continuous_cols = ["temp", "atemp", "humidity", "windspeed", "casual", "registered", "count"]
sns.boxplot(data=df[continuous_cols])
fig = plt.gcf()
fig.set_size_inches(10, 10)


In [None]:
# The continuous variables can also be visualized using histograms.
# One panel per variable in a 2x2 grid; a loop replaces four copy-pasted calls.
hist_columns = ["temp", "atemp", "windspeed", "humidity"]
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, column in zip(axes.flat, hist_columns):
    # With `data=df`, matplotlib resolves the string `x` to that column.
    ax.hist(x=column, data=df, edgecolor="black", linewidth=2, color="#ff4125")
    ax.set_title(f"Variation of {column}")


###### NOW, AFTER SEEING THE DISTRIBUTION OF THE VARIOUS DISCRETE AS WELL AS CONTINUOUS VARIABLES, WE CAN SEE THE INTERRELATION BETWEEN THEM USING A HEAT MAP.

In [None]:
# Correlation matrix.
# `datetime` is still a string column here, and recent pandas versions raise
# when `corr()` meets a non-numeric column, so drop it first
# (`errors="ignore"` keeps the cell re-runnable after the column is removed).
cor_mat = df.drop(columns="datetime", errors="ignore").corr()
# Boolean mask that hides the strict upper triangle. Building the mask from
# the correlation values themselves would leave an exact 0 unmasked.
mask = np.triu(np.ones_like(cor_mat, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax)


In [None]:
# One-hot encode `season` and append the dummy columns to both frames.
df = pd.concat([df, pd.get_dummies(df["season"], prefix="season")], axis=1)
season = pd.get_dummies(test_df["season"], prefix="season")
test_df = pd.concat([test_df, season], axis=1)
test_df.head()


In [None]:
# One-hot encode `weather` and append the dummy columns to both frames.
df = pd.concat([df, pd.get_dummies(df["weather"], prefix="weather")], axis=1)
weather = pd.get_dummies(test_df["weather"], prefix="weather")
test_df = pd.concat([test_df, weather], axis=1)
test_df.head()


In [None]:
# Drop the raw `season` and `weather` columns from both frames now that
# their dummy columns exist.
for frame in (df, test_df):
    frame.drop(columns=["season", "weather"], inplace=True)
test_df.head()


In [None]:
# Split hour, day of week, month and year out of the datetime column.
# Parse the column once and use the vectorised `.dt` accessor instead of
# re-parsing it in four separate Python-level list comprehensions.
timestamps = pd.to_datetime(df["datetime"])
df["hour"] = timestamps.dt.hour
df["day"] = timestamps.dt.dayofweek  # day of the week, not day of the month
df["month"] = timestamps.dt.month
# Encode the year as 0/1; any year outside the mapping becomes NaN.
df["year"] = timestamps.dt.year.map({2011: 0, 2012: 1})
df.head()


In [None]:
# Same datetime-derived features for the test frame: parse once, then use `.dt`.
test_timestamps = pd.to_datetime(test_df["datetime"])
test_df["hour"] = test_timestamps.dt.hour
test_df["day"] = test_timestamps.dt.dayofweek  # day of the week, not day of the month
test_df["month"] = test_timestamps.dt.month
# Encode the year as 0/1; any year outside the mapping becomes NaN.
test_df["year"] = test_timestamps.dt.year.map({2011: 0, 2012: 1})
test_df.head()


In [None]:
# The raw timestamp is no longer needed once hour/day/month/year exist.
df.drop(columns="datetime", inplace=True)
df.head()


## NOW LET'S HAVE A LOOK AT OUR NEW FEATURES.

In [None]:
# Correlation matrix including the newly engineered features.
cor_mat = df.corr()
# Boolean mask that hides the strict upper triangle. Building the mask from
# the correlation values themselves would leave an exact 0 unmasked.
mask = np.triu(np.ones_like(cor_mat, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax)


In [None]:
# Remove the per-user-type rental counts from the feature set.
# NOTE(review): presumably casual + registered == count, which would leak the target - confirm.
df.drop(columns=["casual", "registered"], inplace=True)


In [None]:
# Preview the frame after removing `casual` and `registered`.
df.head()


In [None]:
# Mean rental count per hour of the day.
# The plot type is selected with `kind=`; `strip=` is not a catplot argument.
sns.catplot(x="hour", y="count", data=df, kind="bar", height=5, aspect=1.5)
plt.show()


The highest demand is in the hours from about 7-10 and then from 15-19. This is because in most metropolitan cities these are the peak office commuting hours, so more people rent bikes.


In [None]:
# Mean rental count per month. `catplot`/`height` replace the removed
# `factorplot`/`size` API.
sns.catplot(x="month", y="count", data=df, kind="bar", height=5, aspect=1.5)
# The season affects whether people take a bike or not.


In [None]:
# Mean rental count per year. `catplot`/`height` replace the removed
# `factorplot`/`size` API.
sns.catplot(x="year", y="count", data=df, kind="bar", height=5, aspect=1.5)
# 0 for 2011 and 1 for 2012. Hence demand has increased over the years.


In [None]:
# Number of rows per day-of-week code.
sns.countplot(data=df, x="day")


In [None]:
#Convert the temperature to bins to find the temperature range in which the bike sharing demand is the most.


###### Note that plotted this way, the relationship is hard to visualize. A better way is to convert the 'temp' variable into intervals, or so-called bins, and then treat it like a discrete variable.

In [None]:
# Bucket `temp` into 5-degree-wide bins on a copy of the frame so the
# modelling frame `df` stays unchanged.
new_df = df.copy()
new_df["temp_bin"] = np.floor(new_df["temp"]) // 5
# Now we can visualize mean demand per temperature bin.
# `catplot` replaces the removed `factorplot`.
sns.catplot(x="temp_bin", y="count", data=new_df, kind="bar")
plt.show()


The bins 6 and 7 have the highest ride-share demand; since each bin spans 5 degrees, this corresponds to the temperature range 30-40 °C.

In [None]:
# Preview the modelling frame.
df.head()


In [None]:
# Group the column names by their dtype.
df.columns.to_series().groupby(df.dtypes).groups


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Separate features from the `count` target, then hold out 25% of the rows
# for evaluation with a fixed seed.
X, y = df.drop(columns="count"), df["count"]
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fitting a Decision Tree
# NOTE(review): `count` is a numeric target, but DecisionTreeClassifier treats
# every distinct value as a separate class. A DecisionTreeRegressor with a
# regression metric is probably what is wanted here - confirm.
dt = DecisionTreeClassifier()
# Fit once; `fit` returns the fitted estimator, so `res` refers to `dt`.
res = dt.fit(x_train, y_train)
y_pred = res.predict(x_train)
# This is accuracy on the training data itself, not a held-out estimate.
accuracy_score(y_train, y_pred)


In [42]:
# Candidate regressors to compare. The stochastic ensembles are seeded with
# the same value as the train/test split so the comparison is reproducible
# across runs.
model_list = [
    RandomForestRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    BaggingRegressor(random_state=42),
    SVR(),  # SVR takes no random_state
]


In [None]:
# Re-display the accuracy of the `y_pred` computed in the decision-tree cell
# against the training labels.
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_pred)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
# Fit every candidate regressor on the training split and record its MSE on
# the held-out split.
#
# `season` and `weather` are one-hot encoded and the raw columns dropped
# earlier in the notebook, so asking the ColumnTransformer for them
# unconditionally raises. Encode only the nominal columns still present.
nomimal_variables = [col for col in ("season", "weather") if col in x_train.columns]
nominal_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("nom_encode", nominal_encoder, nomimal_variables),
    ],
    # Keep every other feature; the default (`remainder="drop"`) would discard
    # them and leave the models with nothing but the encoded columns.
    remainder="passthrough",
)
mse_log = {}
for model in model_list:
    clf = Pipeline(
        steps=[
            ("prep", preprocessor),
            ("model", model),
        ]
    )
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    mse = mean_squared_error(y_test, y_pred)
    mse_log[str(model)] = mse

# Display the collected scores so the comparison is visible in the output.
mse_log

In [None]:
# Refit the decision tree and recompute its predictions on the training data.
# The earlier `model1.fit()` call was removed: `model1` is never defined in
# this notebook and `fit` cannot be called without data.
res = dt.fit(x_train, y_train)
y_pred = res.predict(x_train)

In [None]:
# Comparing the test-set MSE values collected in `mse_log` for the different models.
# It can be seen that the Random Forest model performs the best. It does not appear to suffer from overfitting.
