Build a regression model.

In [50]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Connect to the SQLite database
conn = sqlite3.connect("database.sqlite")

# Load the data into a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM merged_data", conn)

# Close the connection to the SQLite database
conn.close()

In [34]:
df.head()

Unnamed: 0,index,name_x,latitude,longitude,bikes_available,fsq_id,category_id,category_name,chains,distance_x,...,location_postcode,location_region,location_timezone,name_y,name,rating,review_count,price,distance_y,category
0,0,Queen St E / Woodward Ave,43.665,-79.32,13.0,4deb8ba688774880e3387c0c,17065.0,Farmers' Market,[],40.0,...,,ON,America/Toronto,Leslieville Farmers Market,Descendant Detroit Style Pizza,4.5,418.0,$$,1048.126922,
1,1,Queen St E / Woodward Ave,43.665,-79.32,13.0,4deb8ba688774880e3387c0c,17065.0,Farmers' Market,[],40.0,...,,ON,America/Toronto,Leslieville Farmers Market,Maha's,4.0,572.0,$$,1005.190035,
2,2,Queen St E / Woodward Ave,43.665,-79.32,13.0,4deb8ba688774880e3387c0c,17065.0,Farmers' Market,[],40.0,...,,ON,America/Toronto,Leslieville Farmers Market,The Burger's Priest,3.5,503.0,$$,374.238073,
3,3,Queen St E / Woodward Ave,43.665,-79.32,13.0,4deb8ba688774880e3387c0c,17065.0,Farmers' Market,[],40.0,...,,ON,America/Toronto,Leslieville Farmers Market,Completo,4.5,169.0,$,1184.032903,
4,4,Queen St E / Woodward Ave,43.665,-79.32,13.0,4deb8ba688774880e3387c0c,17065.0,Farmers' Market,[],40.0,...,,ON,America/Toronto,Leslieville Farmers Market,Ed's Real Scoop,4.5,209.0,$$,1854.179445,


In [35]:
# Get the summary statistics of the data
print(df.describe())

             index     latitude    longitude  bikes_available   category_id  \
count  1225.000000  1225.000000  1225.000000      1167.000000    528.000000   
mean    612.000000    43.665832   -79.384929         6.882605  14856.606061   
std     353.771348     0.021412     0.048172         5.247634   2646.578915   
min       0.000000    43.588000   -79.546000         0.000000  10004.000000   
25%     306.000000    43.656000   -79.403000         2.000000  13064.000000   
50%     612.000000    43.664000   -79.383000         6.000000  16000.000000   
75%     918.000000    43.674000   -79.376000         9.000000  17065.000000   
max    1224.000000    43.788000   -79.124000        52.000000  19055.000000   

       distance_x      rating  review_count   distance_y  
count  528.000000  520.000000    520.000000   520.000000  
mean    62.392045    4.200962    721.182692  1189.970960  
std     87.346307    0.315769    668.199605   777.628970  
min      1.000000    3.500000     67.000000    16.62

In [36]:
df = df.drop(["fsq_id","category_id","location_postcode","location_region","location_timezone",'name_x',
              "name_y","name","chains","location_country","location_cross_street","location_formatted_address","location_locality","category_name","category"], axis=1)

df.head()

Unnamed: 0,index,latitude,longitude,bikes_available,distance_x,rating,review_count,price,distance_y
0,0,43.665,-79.32,13.0,40.0,4.5,418.0,$$,1048.126922
1,1,43.665,-79.32,13.0,40.0,4.0,572.0,$$,1005.190035
2,2,43.665,-79.32,13.0,40.0,3.5,503.0,$$,374.238073
3,3,43.665,-79.32,13.0,40.0,4.5,169.0,$,1184.032903
4,4,43.665,-79.32,13.0,40.0,4.5,209.0,$$,1854.179445


In [37]:
# Convert the strings into numbers
df.price = df.price.map({'$': 0, '$$': 1, '$$$': 2, '$$$$': 3})

# Reshaping into a 2D array with a single feature (column)
price = df.price.values.reshape(-1, 1)

# Initializing the OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(price)

# Transforming the input data into one-hot-encoded format
one_hot_encoded = encoder.transform(price).toarray()

In [38]:
df = df.fillna(0)

Provide model output and an interpretation of the results. 

In [39]:
# Split the data into features (X) and target (y)
X = df.drop("bikes_available", axis=1)
y = df["bikes_available"]

In [48]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Llinear regression model to the training data
reg = LinearRegression().fit(X_train, y_train)

# Predict the target values
y_pred = reg.predict(X_test)

# Evaluate the model performance
print("R-squared:", reg.score(X_test, y_test))

# add an intercept term to the X_train data
X_train = sm.add_constant(X_train)

# create a model and fit it to the training data using statsmodels
model = sm.OLS(y_train, X_train).fit()

# print the summary of the model
print(model.summary())

R-squared: 0.17253422681327923
                            OLS Regression Results                            
Dep. Variable:        bikes_available   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.151
Method:                 Least Squares   F-statistic:                     22.84
Date:                Wed, 15 Feb 2023   Prob (F-statistic):           3.52e-32
Time:                        17:47:16   Log-Likelihood:                -2940.5
No. Observations:                 980   AIC:                             5899.
Df Residuals:                     971   BIC:                             5943.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         129

In [41]:
## Model explaination
## Though we some the R2 of 0.17, the correlation is too weak to say the model is able to predict the number of bikes available at a given location.
## We can say that the variables within Yelp or Foursquare data are too limited and maybe additional variables 
## such as population or closeness to office or subways station might have been a better input.

Explanation(estimator='LinearRegression()', description=None, error=None, method='linear model', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='<BIAS>', weight=1609.557695562332, std=None, value=1.0)], neg=[FeatureWeight(feature='latitude', weight=-1411.01097810645, std=None, value=43.683), FeatureWeight(feature='longitude', weight=-188.25949279313863, std=None, value=-79.419), FeatureWeight(feature='index', weight=-2.3369257602730826, std=None, value=673.0)], pos_remaining=0, neg_remaining=0), proba=None, score=7.950298902470195, weighted_spans=None, heatmap=None)], feature_importances=None, decision_tree=None, highlight_spaces=None, transition_features=None, image=None)


# Stretch

How can you turn the regression model into a classification model?