# Bike Rental Predictions - Linear Regression (Timothy Manolias)

### The following program predicts the number of bike rentals that will occur for Capital Bikeshare System within a given hour, based on historical bike-rental data and a multiple linear regression model.

## Imports Libraries and Data

In [1]:
'''Imports Libraries.'''

import numpy as np
import pandas as pd

from sklearn import preprocessing as pre
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from itertools import combinations as cmb

In [2]:
'''Imports Data.'''

# Train data
train_data = pd.read_csv('Data/BikeSharing_train.csv')
train_data_X = train_data.drop(['cnt'], axis=1)
train_data_y = train_data[['cnt']]

# Test data
test_data = pd.read_csv('Data/BikeSharing_test.csv')
test_data_X = test_data.drop(['cnt'], axis=1)
test_data_y = test_data[['cnt']]

train_data_X.head()

Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0
1,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0
2,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0
3,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0
4,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0


## Data Preprocessing

#### Performs One-Hot Encoding on Categorical Variables in `train_data`:

In [3]:
categorical_cols = ["season", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit"]

if "season" in train_data_X.columns: # Prevents encoding variables twice if cell is re-run
    categoricals_train = train_data_X.loc[:,categorical_cols]

    enc = pre.OneHotEncoder()
    enc.fit(categoricals_train)
    onehotlabels = enc.transform(categoricals_train).toarray()

    # Converts categorical values from digits to string values
    for col in categoricals_train.columns:
        categoricals_train[col] = categoricals_train.loc[:,[col]].astype(str)
        
    # Replaces old categorical variables with one-hot encoded variables
    categoricals_train = pd.get_dummies(categoricals_train, drop_first=True)
    train_data_X = train_data_X.drop(categorical_cols, axis=1)
    train_data_X = pd.concat((train_data_X, categoricals_train), axis=1)
    train_data_X = train_data_X.sort_index(axis=1)
else:
    print('Re-execute notebook from beginning to avoid encoding variables twice.')

#### Performs One-Hot Encoding on Categorical Variables in `test_data`:

In [4]:
if 'season' in test_data.columns:
    categoricals_test = test_data_X.loc[:,categorical_cols]

    enc2 = pre.OneHotEncoder()
    enc2.fit(categoricals_test)
    onehotlabels = enc2.transform(categoricals_test).toarray()

    # Converts categorical values to string values
    for col in categoricals_test.columns:
        categoricals_test[col] = categoricals_test.loc[:,[col]].astype(str)

    # Replaces old categorical variables with one-hot encoded variables
    categoricals_test = pd.get_dummies(categoricals_test, drop_first=True)
    test_data_X = test_data_X.drop(categorical_cols, axis=1)
    test_data_X = pd.concat((test_data_X, categoricals_test), axis=1)
    test_data_X = test_data_X.sort_index(axis=1)
else:
    print('Re-execute notebook from beginning to avoid encoding variables twice.')

## Performs Multiple Linear Regression with K-Fold Cross Validation

In [5]:
def cross_val_mse(X, y, k):
    '''Manually Performs K-Fold Cross Validation.'''
    
    # Randomizes rows
    idx = np.random.permutation(X.index)
    X, y = X.reindex(idx), y.reindex(idx)
    y_folds, X_folds, mse_total = [], [], []
    
    # Splits data into k folds for both X and y
    length = len(y) // k
    for i in range(k):
        fold_data_y = y[length*i : length*(i+1)]
        fold_data_X = X[length*i : length*(i+1)]
        
        # Adds each fold to list
        y_folds.append(fold_data_y)
        X_folds.append(fold_data_X)
    
    # Trains and tests for each test-fold
    for index in range(k):
        # Determines train and test data for y
        test_y = y_folds[index]
        train_y = y_folds[:index] + y_folds[index+1:]
        train_y = pd.concat(train_y, axis=0)

        # Determines train and test data for X
        test_X = X_folds[index]
        train_X = X_folds[:index] + X_folds[index+1:]
        train_X = pd.concat(train_X, axis=0)

        # Trains fold-model
        fold_mod = LinearRegression().fit(train_X, train_y)

        # Compares preds and y_test
        fold_mod_pred = fold_mod.predict(test_X)
        fold_mse = mean_squared_error(test_y, fold_mod_pred)
        mse_total.append(fold_mse)

    # Returns average of k mse values
    return (sum(mse_total) / len(mse_total))

In [6]:
'''Fits Linear Regression Model.'''

lin_reg_mod = LinearRegression().fit(train_data_X, train_data_y)

#### Finds Best Combination of Independent Variables

In [7]:
'''Computes All Combinations of Independent Variables.'''

orig_cols_X = list(train_data.drop(['cnt'], axis=1).columns)
total_combinations = []

counter = 1
while counter <= len(orig_cols_X):
    temp_cmb = cmb(orig_cols_X, counter)
    [total_combinations.append(i) for i in temp_cmb]
    counter += 1

    
# Maps categorical column names to encoded variable names
column_mappings = {'season'     : ['season_2', 'season_3', 'season_4'],
                   'mnth'       : ['mnth_10', 'mnth_11', 'mnth_12', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8','mnth_9'],
                   'hr'         : ['hr_1', 'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_2',
                                   'hr_20', 'hr_21', 'hr_22','hr_23', 'hr_3', 'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9'],
                   'holiday'    : ['holiday_1'],
                   'weekday'    : ['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6'],
                   'workingday' : ['workingday_1'],
                   'weathersit' : ['workingday_1', 'weathersit_2', 'weathersit_3']}

categ_col_names = [key for key, value in column_mappings.items()]

In [8]:
def get_encoded_columns(X_vars):
    '''Replaces categorical column names with one-hot encoded column names.'''
    
    total_cols = []
    for column in X_vars:
        if column in categ_col_names:
            temp_columns = column_mappings[column]
            total_cols.extend(temp_columns)
        else:
            total_cols.append(column)
            
    return total_cols

In [9]:
'''Iterates Through Each Combination of Features to Find Lowest MSE.'''

# Initially sets first combination's mse as lowest mse
first_comb = train_data_X[get_encoded_columns(total_combinations[0])]
min_mse = cross_val_mse(first_comb, train_data_y, 5)

# Finds best combination
for combination in total_combinations:
    temp_cols = [col for col in combination]
    encoded_temp_cols = get_encoded_columns(temp_cols)
    comb_X = train_data_X[encoded_temp_cols]

    # Computes mse and checks if less than min_mse
    temp_mse = cross_val_mse(comb_X, train_data_y, 5)
    if temp_mse < min_mse:
        min_mse = temp_mse
        min_col_names = encoded_temp_cols
        
print(f'The minimum rmse for the best variables on train_data is {(min_mse) ** (1/2):,.2f}.')

The minimum rmse for the best variables on train_data is 100.69.


## Tests Model with `test_data`

In [10]:
'''Predicts test_data Using best_mod and Variables with Best Performance.'''

# Trains model using best features from prior step\
best_train = train_data_X[min_col_names]
best_mod = LinearRegression().fit(best_train, train_data_y)

# Adds one-hot encoded columns from training set that were not in test set
for i in train_data_X.columns:
    if i not in test_data_X.columns:
        test_data_X[i] = 0

# Predicts on test_data
test_data_X = test_data_X.sort_index(axis=1)
final_test_data = test_data_X[min_col_names]
y_pred = best_mod.predict(final_test_data)

# Evaulates performance of predictions
test_mse = cross_val_mse(test_data_X, test_data_y, 5)
print(f'The rmse for the test_data is {(test_mse) ** (1/2):,.2f}.')

The rmse for the test_data is 120.11.


## Results

#### After evaluating the model using only the features that minimize the rmse, the model was tested on the `test_data`. On average, the predicted number of bike rentals that will occur in a given hour was incorrect by roughly 120 bikes.