Build a regression model.

In [9]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import eli5

# Connect to the SQLite database
conn = sqlite3.connect("database.sqlite")

# Load the data into a Pandas dataframe
df = pd.read_sql_query("SELECT * FROM merged_data", conn)

# Close the connection to the SQLite database
conn.close()


In [10]:
#drop unwanted columns and populate 0 for empty/NaN columns
df = df.drop(["name_x_x","name_y_x", "name_x_y","name_y_y","bikes_available_y"], axis=1)
df = df.fillna(0)

df.head()

Unnamed: 0,index,Unnamed: 0_x_x,latitude,longitude,bikes_available_x,Unnamed: 0_y_x,rating_x,review_count_x,price_x,distance_x,Unnamed: 0_x_y,Unnamed: 0_y_y,rating_y,review_count_y,price_y,distance_y
0,0,89,43.67,-79.41,4,0,0,0,0,114,89,0,0,0,0,114
1,1,89,43.67,-79.41,4,0,0,0,0,114,89,1,0,0,0,114
2,2,89,43.67,-79.41,4,0,0,0,0,114,89,2,0,0,0,114
3,3,89,43.67,-79.41,4,0,0,0,0,114,89,3,0,0,0,114
4,4,89,43.67,-79.41,4,0,0,0,0,114,89,4,0,0,0,114


Provide model output and an interpretation of the results. 

In [11]:
# Split the data into features (X) and target (y)
X = df.drop("bikes_available_x", axis=1)
y = df["bikes_available_x"]

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Llinear regression model to the training data
reg = LinearRegression().fit(X_train, y_train)

# Predict the target values
y_pred = reg.predict(X_test)

# Evaluate the model performance
print("R-squared:", reg.score(X_test, y_test))

R-squared: 0.798264614847305


In [16]:
# Using eli5 library to explain model 
expl = eli5.explain_prediction(reg, X_test.iloc[0, :])
print(expl)

Explanation(estimator='LinearRegression()', description=None, error=None, method='linear model', is_regression=True, targets=[TargetExplanation(target='y', feature_weights=FeatureWeights(pos=[FeatureWeight(feature='index', weight=18.02440591040393, std=None, value=8870.0), FeatureWeight(feature='<BIAS>', weight=15.62044467671716, std=None, value=1.0), FeatureWeight(feature='longitude', weight=1.2122386428004006e-14, std=None, value=-79.41), FeatureWeight(feature='latitude', weight=1.2120859871345147e-15, std=None, value=43.67)], neg=[FeatureWeight(feature='Unnamed: 0_x_x', weight=-20.054495956137078, std=None, value=178.0), FeatureWeight(feature='Unnamed: 0_y_x', weight=-3.190331967955746, std=None, value=13.0), FeatureWeight(feature='Unnamed: 0_x_y', weight=-0.21165379767798567, std=None, value=332.0), FeatureWeight(feature='Unnamed: 0_y_y', weight=-0.02961239184447026, std=None, value=10.0)], pos_remaining=0, neg_remaining=0), proba=None, score=10.158756473505825, weighted_spans=None

# Stretch

How can you turn the regression model into a classification model?