In [None]:
'''

Linear regression is a supervised learning algorithm where our target variable is numerical/continuous.
In linear regression we are trying to draw the best-fit line: the line that gives the minimum sum of squared errors.

Best fit  line: the line for which the error between the predicted value and the actual value is minimum.

'''

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Read the daily bike-share CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; anyone
# else running the notebook has to point it at their own copy of the file.
bike_csv_path = r'C:\Users\vansh\Desktop\PC\ML\Sunstone\Data\daily-bike-share.csv'
bike_data = pd.read_csv(bike_csv_path)

In [4]:
# Preview the first five rows to check that the file loaded as expected.
bike_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,5,1/5/2011,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


The data consists of the following columns:

- **instant**: A unique row identifier
- **dteday**: The date on which the data was observed - in this case, the data was collected daily; so there's one row per date.
- **season**: A numerically encoded value indicating the season (1:spring, 2:summer, 3:fall, 4:winter)
- **yr**: The year of the study in which the observation was made (the study took place over two years - year 0 represents 2011, and year 1 represents 2012)
- **mnth**: The calendar month in which the observation was made (1:January ... 12:December)
- **holiday**: A binary value indicating whether or not the observation was made on a public holiday
- **weekday**: The day of the week on which the observation was made (0:Sunday ... 6:Saturday)
- **workingday**: A binary value indicating whether or not the day is a working day (not a weekend or holiday)
- **weathersit**: A categorical value indicating the weather situation (1:clear, 2:mist/cloud, 3:light rain/snow, 4:heavy rain/hail/snow/fog)
- **temp**: The temperature in celsius (normalized)
- **atemp**: The apparent ("feels-like") temperature in celsius (normalized)
- **hum**: The humidity level (normalized)
- **windspeed**: The windspeed (normalized)
- **rentals**: The number of bicycle rentals recorded.

In this dataset, **rentals** represents the label (the *y* value) our model must be trained to predict. The other columns are potential features (*x* values).

As mentioned previously, you can perform some *feature engineering* to combine or derive new features. For example, let's add a new column named **day** to the dataframe by extracting the day component from the existing **dteday** column. The new column represents the day of the month from 1 to 31.

## Train a Regression Model

Now that we've explored the data, it's time to use it to train a regression model that uses the features we've identified as potentially predictive to predict the **rentals** label.  The first thing we need to do is to separate the features we want to use to train the model from the label we want it to predict.

In [13]:
# Preview the first five rows of the DataFrame again.
bike_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,Spring,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,Spring,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,Spring,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,Spring,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,5,1/5/2011,Spring,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [15]:
# Encoding: make sure `season` holds the numeric codes (1-4) the model needs.
# The raw CSV already stores integers, so a plain .map() keyed on the text
# labels would turn every value into NaN on a fresh run. Translating only the
# values that are text labels, and passing everything else through unchanged,
# makes this cell safe to run whether `season` currently holds codes or names.
season_codes = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}
bike_data['season'] = bike_data['season'].map(lambda s: season_codes.get(s, s))

In [16]:
# Summarise column dtypes and non-null counts.
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  rentals     731 non-null    int64  
dtypes: float64(4), int64(9), object(1)
memory usage: 80.1+ KB


In [9]:
# Preview the first five rows of the DataFrame once more.
bike_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,5,1/5/2011,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [17]:
# Confirm the object returned by the loader is a pandas DataFrame.
type(bike_data)

pandas.core.frame.DataFrame

In [23]:
# Peek at the raw feature matrix the model will be trained on.
# The column list is declared here because this cell comes before the
# "Separate features and labels" cell; without it a fresh kernel raises
# NameError on `features`.
features = ['season', 'mnth', 'holiday', 'weekday', 'workingday',
            'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
bike_data[features].values

array([[ 1.      ,  1.      ,  0.      , ...,  0.363625,  0.805833,
         0.160446],
       [ 1.      ,  1.      ,  0.      , ...,  0.353739,  0.696087,
         0.248539],
       [ 1.      ,  1.      ,  0.      , ...,  0.189405,  0.437273,
         0.248309],
       ...,
       [ 1.      , 12.      ,  0.      , ...,  0.2424  ,  0.752917,
         0.124383],
       [ 1.      , 12.      ,  0.      , ...,  0.2317  ,  0.483333,
         0.350754],
       [ 1.      , 12.      ,  0.      , ...,  0.223487,  0.5775  ,
         0.154846]])

In [24]:
# Choose the predictor columns, then pull predictors and label out of the
# DataFrame as plain NumPy arrays.
features = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]
X = bike_data[features].values
y = bike_data['rentals'].values

# Show the first ten rows of each array as a quick sanity check.
print('Features:')
print(X[:10])
print('\nLabels:')
print(y[:10])

Features:
[[1.        1.        0.        6.        0.        2.        0.344167
  0.363625  0.805833  0.160446 ]
 [1.        1.        0.        0.        0.        2.        0.363478
  0.353739  0.696087  0.248539 ]
 [1.        1.        0.        1.        1.        1.        0.196364
  0.189405  0.437273  0.248309 ]
 [1.        1.        0.        2.        1.        1.        0.2
  0.212122  0.590435  0.160296 ]
 [1.        1.        0.        3.        1.        1.        0.226957
  0.22927   0.436957  0.1869   ]
 [1.        1.        0.        4.        1.        1.        0.204348
  0.233209  0.518261  0.0895652]
 [1.        1.        0.        5.        1.        2.        0.196522
  0.208839  0.498696  0.168726 ]
 [1.        1.        0.        6.        0.        2.        0.165
  0.162254  0.535833  0.266804 ]
 [1.        1.        0.        0.        0.        1.        0.138333
  0.116175  0.434167  0.36195  ]
 [1.        1.        0.        1.        1.        1.        

In [12]:
# The feature matrix is a NumPy array, not a DataFrame.
type(X)

numpy.ndarray

In [13]:
# The label vector is a NumPy array as well.
type(y)

numpy.ndarray

After separating the dataset, we now have numpy arrays named **X** containing the features, and **y** containing the labels.

We *could* train a model using all of the data; but it's common practice in supervised learning to split the data into two subsets; a (typically larger) set with which to train the model, and a smaller "hold-back" set with which to validate the trained model. This enables us to evaluate how well the model performs when used with the validation dataset by comparing the predicted labels to the known labels. It's important to split the data *randomly* (rather than say, taking the first 70% of the data for training and keeping the rest for validation). This helps ensure that the two subsets of data are statistically comparable (so we validate the model with data that has a similar statistical distribution to the data on which it was trained).

To randomly split the data, we'll use the **train_test_split** function in the **scikit-learn** library. This library is one of the most widely used machine learning packages for Python.

In [25]:
from sklearn.model_selection import train_test_split


In [26]:
# Split data 70%-30% into training set and test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); change or drop it to draw a new one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# train_test_split returns, in this order:
#   1: training data features
#   2: testing data features
#   3: training data target
#   4: testing data target


In [17]:
# Inspect the held-back labels.
y_test

array([2355, 1070,  244,   64,  397,  618, 1249, 1052,  854, 1004, 2234,
       1544, 1196,  117, 1750, 2622, 1338,  257,  694,  753,  661,  188,
       1401,  979,  320,  905,   75,  121,  331,  639,  441,  975,  148,
        246,  190,  634,  848, 3065,  699,  676, 1415, 1180,  284, 2229,
        884, 2235,  795,   87,  192,  758,  532, 1988, 1070, 3283, 2230,
        909,  221, 2166,  120, 1236, 1639, 2001,  998,   83,  394, 2397,
       1128,  721,  891,  131,  247,   34,  305, 1054, 1348,  120, 1051,
        240,  217,  217, 1433,  195,  694,  830,  269,  214,  377,  653,
        983,  662,  354, 1100,   67,  432,  797,  198, 1729,  199,   42,
        349,  410,  845,  885,  692,  952,  833,   65,  922, 1135,  325,
       3410,   15, 2135,  300, 1008,  871, 1435,  640, 1405, 1603,  229,
       2204,  140,  691,  430,   69,  140, 1188,  100,  846,   38,  801,
       2207, 1576,  968,  815,  724,  206, 1557,  428, 1334, 1782,  834,
         47,  706, 1081,  611], dtype=int64)

In [18]:
# Report how many rows ended up in each subset.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f'Training Set: {n_train_rows:d} rows\nTest Set: {n_test_rows:d} rows')

Training Set: 584 rows
Test Set: 147 rows


Now we have the following four datasets:

- **X_train**: The feature values we'll use to train the model
- **y_train**: The corresponding labels we'll use to train the model
- **X_test**: The feature values we'll use to validate the model
- **y_test**: The corresponding labels we'll use to validate the model

Now we're ready to train a model by fitting a suitable regression algorithm to the training data. We'll use a *linear regression* algorithm, a common starting point for regression that works by trying to find a linear relationship between the *X* values and the *y* label. The resulting model is a function that conceptually defines a line where every possible X and y value combination intersect.

In Scikit-Learn, training algorithms are encapsulated in *estimators*, and in this case we'll use the **LinearRegression** estimator to train a linear regression model.

In [27]:
# Train the model
from sklearn.linear_model import LinearRegression

In [28]:
# Create the estimator, then fit it on the training set.
# fit() trains the estimator in place, so `model` is the fitted regressor.
model = LinearRegression()
model.fit(X_train, y_train)

print(model)

LinearRegression()


### Evaluate the Trained Model

Now that we've trained the model, we can use it to predict rental counts for the features we held back in our validation dataset. Then we can compare these predictions to the actual label values to evaluate how well (or not!) the model is working.

In [29]:
# numpy is already imported in the first code cell; this repeat import is harmless.
import numpy as np
# Predict rentals for the held-back test features.
predictions = model.predict(X_test)

In [32]:
 #predicted
# Put the first ten rounded predictions next to the matching true labels.
first_ten = slice(0, 10)
print('Predicted labels: ', np.round(predictions[first_ten]))
print('Actual labels   : ', y_test[first_ten])

Predicted labels:  [ 4.330e+02 -9.000e+00  1.150e+02  1.479e+03  9.360e+02 -1.000e+00
  5.590e+02  8.630e+02  1.489e+03  6.340e+02]
Actual labels   :  [ 618   74  203 1338  854   47  318  435 2230  502]


There's a definite diagonal trend, and the intersections of the predicted and actual values are generally following the path of the trend line; but there's a fair amount of difference between the ideal function represented by the line and the results. This variance represents the *residuals* of the model - in other words, the difference between the label predicted when the model applies the coefficients it learned during training to the validation data, and the actual value of the validation label. These residuals when evaluated from the validation data indicate the expected level of *error* when the model is used with new data for which the label is unknown.

You can quantify the residuals by calculating a number of commonly used evaluation metrics. We'll focus on the following three:

- **Mean Square Error (MSE)**: The mean of the squared differences between predicted and actual values. This yields a relative metric in which the smaller the value, the better the fit of the model. MSE tells you how close the regression line is to the set of points.
- **Root Mean Square Error (RMSE)**: The square root of the MSE. This yields an absolute metric in the same unit as the label (in this case, numbers of rentals). The smaller the value, the better the model (in a simplistic sense, it represents the average number of rentals by which the predictions are wrong!)
- **Coefficient of Determination** (usually known as *R-squared* or R<sup>2</sup>): A relative metric in which the higher the value, the better the fit of the model. In essence, this metric represents how much of the variance between predicted and actual label values the model is able to explain; in other words, how much of the variability in the data is captured by the model.

> **Note**: You can find out more about these and other metrics for evaluating regression models in the [Scikit-Learn documentation](https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics)

Let's use Scikit-Learn to calculate these metrics for our model, based on the predictions it generated for the validation data.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute all three metrics from the test-set predictions first...
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # same unit as the label (number of rentals)
r2 = r2_score(y_test, predictions)

# ...then report them one per line.
for metric_name, metric_value in (("MSE", mse), ("RMSE", rmse), ("R2", r2)):
    print(f"{metric_name}: {metric_value}")

> **Note**: The use of random values in the Gradient Boosting algorithm results in slightly different metrics each time. In this case, the best model produced by hyperparameter tuning is unlikely to be significantly better than one trained with the default hyperparameter values; but it's still useful to know about the hyperparameter tuning technique!

## Preprocess the Data

We trained a model with data that was loaded straight from a source file, with only moderately successful results.

In practice, it's common to perform some preprocessing of the data to make it easier for the algorithm to fit a model to it. There's a huge range of preprocessing transformations you can perform to get your data ready for modeling, but we'll limit ourselves to a few common techniques:

### Scaling numeric features

Normalizing numeric features so they're on the same scale prevents features with large values from producing coefficients that disproportionately affect the predictions. For example, suppose your data includes the following numeric features:

| A |  B  |  C  |
| - | --- | --- |
| 3 | 480 | 65  |
    
Normalizing these features to the same scale may result in the following values (assuming A contains values from 0 to 10, B contains values from 0 to 1000, and C contains values from 0 to 100):

|  A  |  B  |  C  |
| --  | --- | --- |
| 0.3 | 0.48| 0.65|

There are multiple ways you can scale numeric data, such as calculating the minimum and maximum values for each column and assigning a proportional value between 0 and 1, or by using the mean and standard deviation of a normally distributed variable to maintain the same *spread* of values on a different scale.

### Encoding categorical variables

Machine learning models work best with numeric features rather than text values, so you generally need to convert categorical features into numeric representations.  For example, suppose your data includes the following categorical feature. 

| Size |
| ---- |
|  S   |
|  M   |
|  L   |

You can apply *ordinal encoding* to substitute a unique integer value for each category, like this:

| Size |
| ---- |
|  0   |
|  1   |
|  2   |

Another common technique is to use *one hot encoding* to create individual binary (0 or 1) features for each possible category value. For example, you could use one-hot encoding to translate the possible categories into binary columns like this:

|  Size_S  |  Size_M  |  Size_L  |
| -------  | -------- | -------- |
|    1     |     0    |    0     |
|    0     |     1    |    0     |
|    0     |     0    |    1     |

To apply these preprocessing transformations to the bike rental data, we'll make use of a Scikit-Learn feature named *pipelines*. These enable us to define a set of preprocessing steps that end with an algorithm. You can then fit the entire pipeline to the data, so that the model encapsulates all of the preprocessing steps as well as the regression algorithm. This is useful, because when we want to use the model to predict values from new data, we need to apply the same transformations (based on the same statistical distributions and category encodings used with the training data).

>**Note**: The term *pipeline* is used extensively in machine learning, often to mean very different things! In this context, we're using it to refer to pipeline objects in Scikit-Learn, but you may see it used elsewhere to mean something else.

The pipeline is composed of the transformations and the algorithm used to train the model. To try an alternative algorithm you can just change that step to a different kind of estimator.

We've now seen a number of common techniques used to train predictive models for regression. In a real project, you'd likely try a few more algorithms, hyperparameters, and preprocessing transformations; but by now you should have got the general idea. Let's explore how you can use the trained model with new data.

### Use the Trained Model

First, let's save the model.

In [40]:
# Display the fitted estimator that is about to be saved.
model

In [41]:
# Show the kernel's current working directory; the relative model path used
# in the save step below is resolved against it.
pwd

'C:\\Users\\vansh\\Desktop\\PC\\ML\\CareerEra\\ML'

In [128]:
import joblib

In [129]:
# Save the model as a pickle file
bike_model_pkl = './models/model_25feb.pkl'

# Make sure the target folder exists so the dump does not fail on a machine
# where ./models has not been created yet.
os.makedirs(os.path.dirname(bike_model_pkl), exist_ok=True)

# Persist `model`, the LinearRegression fitted earlier in this notebook. The
# previous argument, `model_ran`, is never defined here and raised NameError
# on a fresh kernel.
joblib.dump(model, bike_model_pkl)  # writing step

['./models/model_25feb.pkl']

Now, we can load it whenever we need it, and use it to predict labels for new data. This is often called *scoring* or *inferencing*.

In [43]:
# Load the model from the file
# (joblib files are pickles: only load files you created yourself or trust).
loaded_model = joblib.load(bike_model_pkl)

In [130]:
# Look at one training row to recall the order and scale of the ten features.
X_train[0]

array([3.      , 7.      , 0.      , 3.      , 1.      , 1.      ,
       0.72    , 0.685633, 0.743333, 0.149883])

In [None]:
# Reminder of the column layout of the source data.
bike_data.head()

In [None]:
def _ask_float(field):
    """Prompt with 'Enter the <field>' and return the reply parsed as a float."""
    return float(input("Enter the " + field))


# Collect one observation interactively, one feature at a time.
season = _ask_float("season")
month = _ask_float("month")
holiday = _ask_float("holiday")
weekday = _ask_float("weekday")
workingday = _ask_float("workingday")
weathersit = _ask_float("weathersit")
temp = _ask_float("temp")
atemp = _ask_float("atemp")
hum = _ask_float("hum")
windspeed = _ask_float("windspeed")

In [None]:
# Assemble the typed-in values into a single-row 2-D array (one observation),
# keeping the same column order as the training features, then score it with
# the reloaded model.
user_row = [season, month, holiday, weekday, workingday,
            weathersit, temp, atemp, hum, windspeed]
X_new = np.array([user_row])
result = loaded_model.predict(X_new)
rounded_rentals = np.round(result[0])
print(f'Prediction: {rounded_rentals:.0f} rentals')


In [None]:
# Show the observation array that was just scored.
X_new

In [None]:
# Build a one-row float64 array holding a new observation
# (for example tomorrow's seasonal and weather forecast information).
sample_values = [1, 6, 1, 0, 1, 2, 0.326957, 0.02927, 0.686957, 0.1869]
X_new = np.array([sample_values], dtype='float64')
print(f'New sample: {list(X_new[0])}')

In [None]:
# Score the new observation with the reloaded model and report the predicted
# rentals for tomorrow as a whole number.
result = loaded_model.predict(X_new)
rounded_rentals = np.round(result[0])
print(f'Prediction: {rounded_rentals:.0f} rentals')

In [None]:
# The raw (unrounded) prediction for the single observation.
result[0]

The model's **predict** method accepts an array of observations, so you can use it to generate multiple predictions as a batch. For example, suppose you have a weather forecast for the next five days; you could use the model to predict bike rentals for each day based on the expected weather conditions.

In [44]:
# Feature rows for a five-day weather forecast, one row per day, in the same
# column order as the training features.
# NOTE(review): `season` is 0 in every row, which is outside the 1-4 encoding
# described for this dataset - confirm these sample values are intended.
forecast_rows = [
    [0, 1, 1, 0, 0, 1, 0.344167, 0.363625, 0.805833, 0.16044],
    [0, 1, 0, 1, 0, 1, 0.363478, 0.353739, 0.696087, 0.248539],
    [0, 1, 0, 2, 0, 1, 0.196364, 0.189405, 0.437273, 0.248309],
    [0, 1, 0, 3, 0, 1, 0.2, 0.212122, 0.590435, 0.160296],
    [0, 1, 0, 4, 0, 1, 0.226957, 0.22927, 0.436957, 0.1869],
]
X_new = np.array(forecast_rows)

In [45]:
# Display the five-row forecast array.
X_new

array([[0.      , 1.      , 1.      , 0.      , 0.      , 1.      ,
        0.344167, 0.363625, 0.805833, 0.16044 ],
       [0.      , 1.      , 0.      , 1.      , 0.      , 1.      ,
        0.363478, 0.353739, 0.696087, 0.248539],
       [0.      , 1.      , 0.      , 2.      , 0.      , 1.      ,
        0.196364, 0.189405, 0.437273, 0.248309],
       [0.      , 1.      , 0.      , 3.      , 0.      , 1.      ,
        0.2     , 0.212122, 0.590435, 0.160296],
       [0.      , 1.      , 0.      , 4.      , 0.      , 1.      ,
        0.226957, 0.22927 , 0.436957, 0.1869  ]])

In [48]:
# Use the model to predict rentals
# (one prediction is returned per row of X_new).
results = loaded_model.predict(X_new)

In [49]:
# Raw (unrounded) predictions, one per forecast day.
results

array([ 705.71367393, 1048.09639411,  842.93913103,  888.30964665,
       1006.40533682])

In [None]:
# NOTE(review): repeats the cell above - shows the same `results` array again.
results

In [None]:
# Print a heading followed by each day's prediction, rounded to the nearest
# whole rental, one per line.
print('5-day rental predictions:', *np.round(results), sep='\n')