Build a regression model.

In [23]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error

In [24]:
# Connect to the SQLite database
conn = sqlite3.connect("database.sqlite")

# Load the data into a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM merged_data", conn)

# Close the connection to the SQLite database
conn.close()

In [16]:
# Data Types
print(df.dtypes)

# Column Names
print(df.columns)

index                           int64
Unnamed: 0_x                    int64
name_x                         object
latitude                      float64
longitude                     float64
bikes_available                 int64
Unnamed: 0_y                    int64
fsq_id                         object
category_id                     int64
category_name                  object
chains                         object
distance_x                      int64
location_country               object
location_cross_street          object
location_formatted_address     object
location_locality              object
location_postcode              object
location_region                object
location_timezone              object
name_y                         object
Unnamed: 0                      int64
name                           object
rating                        float64
review_count                    int64
price                          object
distance_y                    float64
category    

In [25]:
df = df.drop(["Unnamed: 0_x","name_x", "Unnamed: 0_y","fsq_id","category_id","location_postcode","location_region","location_timezone",
              "name_y","Unnamed: 0","name","chains","location_country","location_cross_street","location_formatted_address","location_locality","category_name"], axis=1)

df.head()

Unnamed: 0,index,latitude,longitude,bikes_available,distance_x,rating,review_count,price,distance_y,category
0,0,43.67,-79.32,13,62,4.5,418,$$,1048.126922,
1,1,43.67,-79.32,13,62,4.0,572,$$,1005.190035,
2,2,43.67,-79.32,13,62,3.5,503,$$,374.238073,
3,3,43.67,-79.32,13,62,4.5,169,$,1184.032903,
4,4,43.67,-79.32,13,62,4.5,209,$$,1854.179445,


In [26]:
# Convert the strings into numbers
df.price = df.price.map({'$': 0, '$$': 1, '$$$': 2, '$$$$': 3})

# Reshaping into a 2D array with a single feature (column)
price = df.price.values.reshape(-1, 1)

# Initializing the OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(price)

# Transforming the input data into one-hot-encoded format
one_hot_encoded = encoder.transform(price).toarray()

In [27]:
df = df.fillna(0)

Provide model output and an interpretation of the results. 

In [28]:
# Split the data into features (X) and target (y)
X = df.drop("bikes_available", axis=1)
y = df["bikes_available"]

In [29]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Llinear regression model to the training data
reg = LinearRegression().fit(X_train, y_train)

# Predict the target values
y_pred = reg.predict(X_test)

# Evaluate the model performance
print("R-squared:", reg.score(X_test, y_test))

R-squared: 0.1270297081328129


In [30]:
# Using eli5 library to explain model 
expl = eli5.explain_prediction(reg, X_test.iloc[0, :])
print(expl)

Explanation(estimator='LinearRegression()', description=None, error=None, method='linear model', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='<BIAS>', weight=15750.102860924335, std=None, value=1.0), FeatureWeight(feature='distance_y', weight=1.1278146365808925, std=None, value=1327.5121317319397), FeatureWeight(feature='index', weight=0.6117121443477826, std=None, value=27078.0), FeatureWeight(feature='distance_x', weight=0.43192222049174234, std=None, value=64.0)], neg=[FeatureWeight(feature='latitude', weight=-9073.688910216862, std=None, value=43.66), FeatureWeight(feature='longitude', weight=-6669.5199797635805, std=None, value=-79.39), FeatureWeight(feature='rating', weight=-0.9651904078133171, std=None, value=4.0), FeatureWeight(feature='review_count', weight=-0.5475267064278455, std=None, value=735.0), FeatureWeight(feature='price', weight=-0.038470855124138253, std=None, value=3.0)], pos_remaining=0, neg_

# Stretch

How can you turn the regression model into a classification model?