Build a regression model.

In [43]:
import pandas as pd
import sqlite3
from sqlite3 import Error
import statsmodels.api as sm

In [44]:
def create_connection(path):
    """Open a SQLite connection to the database file at ``path``.

    Prints a success message when the connection is opened. If sqlite3
    raises an error while connecting, the error is printed instead and
    ``None`` is returned, so callers must check the result before use.
    """
    try:
        opened = sqlite3.connect(path)
    except Error as e:
        # Best-effort: report the failure and hand back None rather than raising.
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return opened

In [51]:
# Open the project database; create_connection returns None (after printing
# the error) if the file cannot be opened.
connection = create_connection(path="sm_app.sqlite")

Connection to SQLite DB successful


In [69]:
# Load every row and column of the `pois` table into a DataFrame.
df = pd.read_sql(sql='SELECT * FROM pois', con=connection)

In [71]:
# Per-column means, used as the fill value for missing entries in each column.
values = {column: df[column].mean() for column in ('rating', 'review count')}

In [72]:
# Replace missing 'rating' / 'review count' entries with the column means
# computed above. NOTE: this rebinds `df`, so the un-imputed frame is gone.
df = df.fillna(values)

In [73]:
# Display the frame after the fillna step to check the imputed columns.
df

Unnamed: 0,index,station name,name,address,category,rating,review count,free_bikes,latitude,longitude
0,0,01. Curtatone,Pizzeria Alle Scalette,Viale Curtatone 18,Pizzeria,4.183333,11.966667,0,43.32160,11.327948
1,1,01. Curtatone,Piazza Giacomo Matteotti,Piazza Giacomo Matteotti,Plaza,4.183333,11.966667,0,43.32160,11.327948
2,2,01. Curtatone,Basilica di San Domenico,Piazza San Domenico,Church,4.183333,11.966667,0,43.32160,11.327948
3,3,01. Curtatone,Caffè La Piazzetta,Via Montanini 52,"Bar, Café, Italian Restaurant",4.183333,11.966667,0,43.32160,11.327948
4,4,01. Curtatone,Consorzio Agrario di Siena,Via Pianigiani Giuseppe 9,Grocery Store,4.183333,11.966667,0,43.32160,11.327948
...,...,...,...,...,...,...,...,...,...,...
226,226,19. Petriccio,Erboristeria La Pimpinella,Via Celso Cittadini 16,"Alternative Medicine Clinic, Drugstore",4.183333,11.966667,0,43.33464,11.304803
227,227,19. Petriccio,Parrocchia Beato Bernardo Tolomei,Via Tolomei Bernardo 1,Church,4.183333,11.966667,0,43.33464,11.304803
228,228,19. Petriccio,Dr. Vigni Farmacia,Via Quinto Settano 1,Drugstore,4.183333,11.966667,0,43.33464,11.304803
229,229,19. Petriccio,Bovicelli Roberto,Via Gallerani 23,Dentist,4.183333,11.966667,0,43.33464,11.304803


In [74]:
# Simple linear regression: free_bikes ~ const + rating.
y = df['free_bikes']

# statsmodels' OLS does not add an intercept on its own, so add the constant column.
X = sm.add_constant(df['rating'])
lin_reg = sm.OLS(endog=y, exog=X)

Provide model output and an interpretation of the results. 

In [75]:
# Fit whichever `lin_reg` was constructed most recently; the regressor used
# therefore depends on which setup cell ran last.
model = lin_reg.fit()
# Keep the summary object under its own name, then print its text rendering.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.271
Date:                Sun, 30 Jul 2023   Prob (F-statistic):              0.261
Time:                        12:56:26   Log-Likelihood:                -523.40
No. Observations:                 231   AIC:                             1051.
Df Residuals:                     229   BIC:                             1058.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0835      1.438     -0.058      0.9

Notes
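- The R-squared value is 0.006, so rating explains less than 1% of the variance in the number of free bikes; the model fits the data very poorly.
- The p-value for rating is 0.261 (the Prob (F-statistic) in the summary above, which equals the slope's p-value in a one-predictor model). Since 0.261 > 0.05, rating does not have a statistically significant effect on the number of free bikes.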

In [76]:
# Simple linear regression: free_bikes ~ const + review count.
y = df['free_bikes']

# statsmodels' OLS does not add an intercept on its own, so add the constant column.
X = sm.add_constant(df['review count'])
lin_reg = sm.OLS(endog=y, exog=X)

In [77]:
# Fit whichever `lin_reg` was constructed most recently; the regressor used
# therefore depends on which setup cell ran last. This rebinds `model` and
# `print_model` from the earlier fit.
model = lin_reg.fit()
# Keep the summary object under its own name, then print its text rendering.
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     8.537
Date:                Sun, 30 Jul 2023   Prob (F-statistic):            0.00383
Time:                        12:56:34   Log-Likelihood:                -519.81
No. Observations:                 231   AIC:                             1044.
Df Residuals:                     229   BIC:                             1051.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            2.2977      0.304      7.559   

Notes
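- The R-squared value is 0.036, so review count explains only about 3.6% of the variance in the number of free bikes; the fit is weak.
- The p-value is 0.004 (< 0.05), which means review count has a statistically significant effect on the number of free bikes, even though it explains little of the variance.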

# Stretch

How can you turn the regression model into a classification model?