Build a regression model.

In [56]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5
from sklearn.preprocessing import OneHotEncoder

In [64]:
# Connect to the SQLite database
conn = sqlite3.connect("database.sqlite")

# Load the data into a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM merged_data", conn)

# Close the connection to the SQLite database
conn.close()

In [65]:
df.head()

Unnamed: 0.1,index,Unnamed: 0_x,name_x,latitude,longitude,bikes_available,Unnamed: 0_y,fsq_id,category_id,category_name,...,location_postcode,location_region,location_timezone,name_y,Unnamed: 0,name,rating,review_count,price,distance_y
0,0,0,Queen St E / Woodward Ave,43.67,-79.32,13,0,4b65b9f1f964a520fefb2ae3,13031,Burger Joint,...,M9N 2J3,ON,America/Toronto,McDonald's,0,Descendant Detroit Style Pizza,4.5,418,$$,1048.126922
1,1,0,Queen St E / Woodward Ave,43.67,-79.32,13,0,4b65b9f1f964a520fefb2ae3,13031,Burger Joint,...,M9N 2J3,ON,America/Toronto,McDonald's,1,Maha's,4.0,572,$$,1005.190035
2,2,0,Queen St E / Woodward Ave,43.67,-79.32,13,0,4b65b9f1f964a520fefb2ae3,13031,Burger Joint,...,M9N 2J3,ON,America/Toronto,McDonald's,2,The Burger's Priest,3.5,503,$$,374.238073
3,3,0,Queen St E / Woodward Ave,43.67,-79.32,13,0,4b65b9f1f964a520fefb2ae3,13031,Burger Joint,...,M9N 2J3,ON,America/Toronto,McDonald's,3,Completo,4.5,169,$,1184.032903
4,4,0,Queen St E / Woodward Ave,43.67,-79.32,13,0,4b65b9f1f964a520fefb2ae3,13031,Burger Joint,...,M9N 2J3,ON,America/Toronto,McDonald's,4,Ed's Real Scoop,4.5,209,$$,1854.179445


In [66]:
df = df.drop(["Unnamed: 0_x","name_x", "Unnamed: 0_y","fsq_id","category_id","location_postcode","location_region","location_timezone",
              "name_y","Unnamed: 0","name","chains","location_country","location_cross_street","location_formatted_address","location_locality","category_name"], axis=1)

df.head()

Unnamed: 0,index,latitude,longitude,bikes_available,distance_x,rating,review_count,price,distance_y
0,0,43.67,-79.32,13,62,4.5,418,$$,1048.126922
1,1,43.67,-79.32,13,62,4.0,572,$$,1005.190035
2,2,43.67,-79.32,13,62,3.5,503,$$,374.238073
3,3,43.67,-79.32,13,62,4.5,169,$,1184.032903
4,4,43.67,-79.32,13,62,4.5,209,$$,1854.179445


In [67]:
# Convert the strings into numbers
df.price = df.price.map({'$': 0, '$$': 1, '$$$': 2, '$$$$': 3})

# Reshaping into a 2D array with a single feature (column)
price = df.price.values.reshape(-1, 1)

# Initializing the OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(price)

# Transforming the input data into one-hot-encoded format
one_hot_encoded = encoder.transform(price).toarray()

In [68]:
df = df.fillna(0)

Provide model output and an interpretation of the results. 

In [69]:
# Split the data into features (X) and target (y)
X = df.drop("bikes_available", axis=1)
y = df["bikes_available"]

In [70]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Llinear regression model to the training data
reg = LinearRegression().fit(X_train, y_train)

# Predict the target values
y_pred = reg.predict(X_test)

# Evaluate the model performance
print("R-squared:", reg.score(X_test, y_test))

R-squared: 0.10913283510776384


In [71]:
# Using eli5 library to explain model 
expl = eli5.explain_prediction(reg, X_test.iloc[0, :])
print(expl)

Explanation(estimator='LinearRegression()', description=None, error=None, method='linear model', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='<BIAS>', weight=13596.109312811985, std=None, value=1.0), FeatureWeight(feature='distance_x', weight=0.8840257623314756, std=None, value=64.0), FeatureWeight(feature='distance_y', weight=0.6693466126307749, std=None, value=1089.2349318379986), FeatureWeight(feature='rating', weight=0.10216976809087741, std=None, value=4.0), FeatureWeight(feature='index', weight=0.009697391668579199, std=None, value=11524.0)], neg=[FeatureWeight(feature='latitude', weight=-7204.67674608099, std=None, value=43.66), FeatureWeight(feature='longitude', weight=-6384.713388206156, std=None, value=-79.38), FeatureWeight(feature='review_count', weight=-0.5285507258148527, std=None, value=932.0), FeatureWeight(feature='price', weight=-0.001672125128429923, std=None, value=1.0)], pos_remaining=0, neg_r

# Stretch

How can you turn the regression model into a classification model?