In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the plotting/data libraries — consider narrowing it.
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Location of the input CSV; kept in one named constant so it is easy to repoint.
DATA_PATH = '/content/SeoulBikeData.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Drop the 'Date' column. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is a no-op rather
# than a KeyError.
df = df.drop(columns=['Date'], errors='ignore')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns.
df.describe()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics and distribution of the target variable.
print(df['Rented Bike Count'].describe())
plt.figure(figsize=(9, 8))
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws the
# equivalent density-scaled histogram with a KDE curve on top.
sns.histplot(df['Rented Bike Count'], color='g', bins=100, alpha=0.4,
             kde=True, stat='density');

In [None]:
# Distinct column dtypes present in the frame.
list(set(df.dtypes))

In [None]:
# Keep only the float64/int64 columns; the numeric analysis below works on this frame.
NUMERIC_DTYPES = ['float64', 'int64']
df_num = df.select_dtypes(include=NUMERIC_DTYPES)
df_num.head()

In [None]:
# One histogram per numeric column; the trailing ';' suppresses the matplotlib repr.
hist_options = dict(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
df_num.hist(**hist_options);

# **What are the factors that influence the number of bikes rented?**


# **Correlation**

In [None]:
# Covariance describes how two variables vary together: a positive value means
# they tend to move in the same direction, a negative value in opposite directions.
# Example: when the S&P index rises, a stock price tends to rise as well (and vice versa).
df_num.cov()

In [None]:
# Correlation measures how strongly a change in one variable goes together with a
# change in another.
# Example: as speed increases, mileage tends to decrease (a negative correlation).
# The target's own entry is dropped by label rather than by position ([1:]), so
# the result does not depend on 'Rented Bike Count' being the first column.
df_num_corr = df_num.corr()['Rented Bike Count'].drop('Rented Bike Count')
golden_features_list = df_num_corr[abs(df_num_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with number of bikes rented are :\n{}".format(len(golden_features_list), golden_features_list))

In [None]:
# Correlation of every numeric feature with the target; the target's own entry is
# dropped by label so column order does not matter.
df_num_nocorr = df_num.corr()['Rented Bike Count'].drop('Rented Bike Count')
# '<=' makes this the exact complement of the strict '> 0.5' strong-correlation
# selection, so a value of exactly 0.5 is not lost from both lists.
golden_nofeatures_list = df_num_nocorr[abs(df_num_nocorr) <= 0.5].sort_values(ascending=False)
print("There is {} not strongly correlated values with number of bikes rented are :\n{}".format(len(golden_nofeatures_list), golden_nofeatures_list))

In [None]:
# Scatter the numeric columns against the target, up to three columns per figure.
# The range starts at 1, so the column at position 0 is not plotted.
numeric_columns = df_num.columns
for start in range(1, len(numeric_columns), 3):
    sns.pairplot(
        data=df_num,
        x_vars=numeric_columns[start:start + 3],
        y_vars=['Rented Bike Count'],
    )

Note that correlation by itself does not always explain the relationship between variables, so plotting them can lead us to new insights; in the same way, the plots let us check that our correlated features really have a linear relationship with the rented bike count.

For example, a curvilinear relationship cannot be guessed just by looking at the correlation value, so let's take the features we excluded from our correlation table and plot them to see whether they show some kind of pattern.

# **Explore the data for outliers and missing values.**

In [None]:
# Count of missing entries in each column.
miss_val = df.isnull().sum()
miss_val

In [None]:
import operator

# Correlate each numeric feature (every column after position 0) with the target,
# using only the rows where that feature is non-zero.
# NOTE(review): zero looks like a legitimate reading for several columns here
# (e.g. Hour, Rainfall(mm), Snowfall (cm)) — confirm that excluding those rows is intended.
individual_features_df = []
for feature_name in df_num.columns[1:]:
    pair_df = df_num[[feature_name, 'Rented Bike Count']]
    individual_features_df.append(pair_df[pair_df[feature_name] != 0])

# In each 2x2 correlation matrix the first row is the feature itself. `.iloc[0]`
# picks it by position explicitly; plain `[0]` on a label-indexed Series relies on
# a positional fallback that pandas has deprecated.
all_correlations = {
    feature.columns[0]: feature.corr()['Rented Bike Count'].iloc[0]
    for feature in individual_features_df
}
# Sort ascending by correlation value.
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))
for (key, value) in all_correlations:
    print("{:>15}: {:>15}".format(key, value))

In [None]:
# Names of the features whose correlation with the target is at least 0.5 in magnitude.
golden_features_list = [
    name for name, corr_value in all_correlations if abs(corr_value) >= 0.5
]
print("There is {} strongly correlated values with Rented Bike Count:\n{}".format(len(golden_features_list), golden_features_list))


# **Feature to feature relationship**

In [None]:
# Feature-to-feature correlations only: the target column is excluded here.
corr = df_num.drop(columns='Rented Bike Count').corr()
plt.figure(figsize=(12, 10))

# Keep only the stronger correlations (>= 0.5 or <= -0.4); every other cell
# becomes NaN in the frame passed to the heatmap.
strong_mask = (corr >= 0.5) | (corr <= -0.4)
sns.heatmap(
    corr[strong_mask],
    cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
    annot=True, annot_kws={"size": 8}, square=True,
);


A lot of features seem to be correlated with each other, but some of these pairs, such as Dew point temperature(C)/Temperature(C), may just reflect the same underlying effect on the rented bike count: in good (not cold) weather, people rent more bikes.

Now for the ones which are less obvious we can see that:

There is a strong negative correlation between Humidity(%), Visibility (10m) and Solar Radiation (MJ/m2). These factors are interesting and may indicate that people avoid renting a bike in extreme weather conditions in order to avoid accidents.

In [None]:
# Quantitative columns, with the target listed first.
quantitative_features_list = [
    'Rented Bike Count',
    'Hour',
    'Temperature(C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Dew point temperature(C)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
]
df_quantitative_values = df.loc[:, quantitative_features_list]
df_quantitative_values.head()

In [None]:
# Strongly correlated quantitative features, in the order of
# quantitative_features_list, with the target appended as the last entry.
features_to_analyse = [
    name for name in quantitative_features_list if name in golden_features_list
] + ['Rented Bike Count']
features_to_analyse


# **Q -> Q (Quantitative to Quantitative relationship)**

In [None]:

# One regression plot per predictor. The last entry of features_to_analyse is the
# target itself, so it is excluded from the x-axis candidates.
predictors = features_to_analyse[:-1]

# Size the grid from the number of predictors (at most three per row) so every
# predictor gets an axes; the max(..., 1) guards keep plt.subplots valid even
# when the list is empty.
n_cols = min(3, max(len(predictors), 1))
n_rows = max(-(-len(predictors) // n_cols), 1)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 12), squeeze=False)

for ax, feature in zip(axes.ravel(), predictors):
    sns.regplot(x=feature, y='Rented Bike Count', data=df[features_to_analyse], ax=ax)

# Hide grid cells left over when the predictor count does not fill the last row.
for ax in axes.ravel()[len(predictors):]:
    ax.set_visible(False)


# **C -> Q (Categorical to Quantitative relationship)**

In [None]:
# Every column of df that is not a quantitative predictor: the categorical columns
# plus the target. quantitative_features_list[0] is 'Rented Bike Count'; slicing
# with [1:] leaves it out of the exclusion set so it stays available for the
# category-vs-target plots.
quantitative_predictors = set(quantitative_features_list[1:])
categorical_features = [col for col in df.columns if col not in quantitative_predictors]
df_categ = df[categorical_features]
df_categ.head()

In [None]:
# Object-dtype (non-numeric) columns among the categorical selection.
df_not_num = df_categ.select_dtypes(include=['O'])
non_numeric_names = df_not_num.columns.tolist()
print('There is {} non numerical features including:\n{}'.format(len(non_numeric_names), non_numeric_names))

In [None]:
# Distribution of the rented bike count for each 'Holiday' value.
plt.figure(figsize=(10, 6))
# Style the boxes through boxprops. plt.setp(ax.artists, ...) only works when the
# boxes are stored in ax.artists; where they are not (newer matplotlib/seaborn),
# that list is empty and the call styles nothing.
ax = sns.boxplot(x='Holiday', y='Rented Bike Count', data=df_categ,
                 boxprops=dict(alpha=.5, linewidth=2, edgecolor="k"))
plt.xticks(rotation=45);

In [None]:
# Distribution of the rented bike count for each 'Seasons' value.
plt.figure(figsize=(12, 6))
# Style the boxes through boxprops. plt.setp(ax.artists, ...) only works when the
# boxes are stored in ax.artists; where they are not (newer matplotlib/seaborn),
# that list is empty and the call styles nothing.
ax = sns.boxplot(x='Seasons', y='Rented Bike Count', data=df_categ,
                 boxprops=dict(alpha=.5, linewidth=2, edgecolor="k"))
plt.xticks(rotation=45);

In [None]:
# One count plot per non-numeric column, stacked vertically.
fig, axes = plt.subplots(len(df_not_num.columns), 1, figsize=(12, 10))

for ax, column in zip(fig.axes, df_not_num.columns):
    sns.countplot(x=column, alpha=0.7, data=df_not_num, ax=ax)
    # Rotate after plotting so the rotation applies to the category ticks, rather
    # than relabelling the default ticks of a still-empty axes beforehand.
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()

# **Please build a linear regression model to predict rented bike count by choosing appropriate independent variables.**

What is Train/Test
Train/Test is a method to measure the accuracy of your model.

It is called Train/Test because you split the data set into two sets: a training set and a testing set.

1. Split the data, commonly 80% for training and 20% for testing (this notebook holds out 25% for testing via `test_size=0.25`).

2. You train the model using the training set.

3. You test the model using the testing set.

4. Training the model means creating the model.

5. Testing the model means measuring the accuracy of the model.

The simplest form of the regression equation, with one dependent and one independent variable, is $y = c + b \cdot x$, where $y$ is the estimated dependent variable score, $c$ is the constant (intercept), $b$ is the regression coefficient, and $x$ is the score on the independent variable.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso 

# Hold out 25% of the rows for evaluation. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
TARGET = 'Rented Bike Count'
train, val = train_test_split(df_num, test_size=0.25, random_state=42)

# Separate predictors and target for both partitions. The cells below use these
# four names, which were previously never defined (NameError on a fresh kernel).
df_num_x, df_num_y = train.drop(columns=TARGET), train[TARGET]
test_x, test_y = val.drop(columns=TARGET), val[TARGET]

In [None]:
# Report the shapes of the training and test partitions.
print('Dimension of df_num_x dataframe:',df_num_x.shape)
print('Dimension of df_num_y dataframe:',df_num_y.shape)
print('Dimension of test_x dataframe:',test_x.shape)
# `_y` was an undefined name; the label says test_y, so print test_y's shape.
print('Dimension of test_y dataframe:',test_y.shape)


# **Ridge(L2) Regression**

In [None]:
# Ridge regressor constructed with the library's default hyperparameters.
ridge = Ridge()
ridge

In [None]:
# Fit on df_num_x/df_num_y, then score on test_x/test_y.
ridge_score = ridge.fit(df_num_x, df_num_y).score(test_x, test_y)
# Number of coefficients that are not exactly zero.
coeff_used = np.sum(ridge.coef_ != 0)
ridge.coef_

In [None]:
# ridge_score is computed from test_x/test_y, so it is a test score, not a
# training score; the label is corrected to match.
print("test score",ridge_score)
print("number of feature used:",coeff_used)


# **Lasso(L1) Regression**

In [None]:
# Lasso regressor constructed with the library's default hyperparameters.
lasso = Lasso()
lasso

In [None]:
# Fit on df_num_x/df_num_y, then score on test_x/test_y.
lasso_score = lasso.fit(df_num_x, df_num_y).score(test_x, test_y)
# Number of coefficients that are not exactly zero.
coeff_used = np.sum(lasso.coef_ != 0)
lasso.coef_

In [None]:
# lasso_score is computed from test_x/test_y, so it is a test score, not a
# training score; the label is corrected to match.
print("test score",lasso_score)
print("number of feature used:",coeff_used)

# **conclusion**
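A linear regression model finds the values of the intercept and slope that give the line best fitting the data. The root mean squared error quoted here is 0.4364, which is less than 10% of the mean value of Temperature(C), i.e. 12.882 °C, and this was taken to mean that the algorithm did a decent job. Note, however, that the cells above report the models' `score` on the test split rather than an RMSE, and an error measure should be compared against the target (Rented Bike Count) rather than against a feature such as temperature.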