# Restaurants San Antonio

In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# File to Load
file_path = Path("Restaurants_SanAntonio.csv")

# Read Restaurant Data File and store into Pandas DataFrame
restaurants_df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
restaurants_df.head()


Unnamed: 0,id,position,name,score,ratings,category,price_range,lat,lng,address,city,state,zip_code
0,34517,6,Fiesta Liquor #3,3.0,,"African, Ethiopian, Vegetarian, Alcohol, Liquo...",$,29.57432,-98.51521,14415 Blanco Road,San Antonio,TX,78248.0
1,34518,5,Squeezers Juice Bar,5.0,11.0,"Black-owned, Juice and Smoothies, African, Hea...",,29.441,-98.48547,914 East Elmira Street,San Antonio,TX,78212.0
2,34520,76,Spinster Sisters Co. (427 Lombrano St),,,"Home &amp; Personal Care, Everyday Essentials,...",$,29.44191,-98.51055,427 Lombrano St,San Antonio,TX,78207.0
3,34521,194,Yaya's Thai Fusion,,,"Thai, Asian, Noodles",$,29.480652,-98.607908,5819 Northwest Loop 410,San Antonio,TX,78238.0
4,34522,192,Willie's Grill &amp; Icehouse (7911 Interstate),4.5,92.0,"American, Burgers, Family Friendly, Wings, Alc...",$,29.350066,-98.537672,7911 Interstate 35 Access Rd,San Antonio,TX,78224.0


In [2]:
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           1215 non-null   int64  
 1   position     1215 non-null   int64  
 2   name         1215 non-null   object 
 3   score        672 non-null    float64
 4   ratings      671 non-null    float64
 5   category     1214 non-null   object 
 6   price_range  955 non-null    object 
 7   lat          1215 non-null   float64
 8   lng          1215 non-null   float64
 9   address      1215 non-null   object 
 10  city         1215 non-null   object 
 11  state        1215 non-null   object 
 12  zip_code     1214 non-null   float64
dtypes: float64(5), int64(2), object(6)
memory usage: 123.5+ KB


In [3]:
# Check which columns have NaN values and how many NaN values are present in each column
nan_counts = restaurants_df.isnull().sum()

# Display the count of NaN values in each column
print(nan_counts)

id               0
position         0
name             0
score          543
ratings        544
category         1
price_range    260
lat              0
lng              0
address          0
city             0
state            0
zip_code         1
dtype: int64


In [4]:
# Removing rows with NaN values and storing the result in a new DataFrame
restaurants_df_cleaned = restaurants_df.dropna()

# Display the first few rows of the cleaned DataFrame
restaurants_df_cleaned.head()

Unnamed: 0,id,position,name,score,ratings,category,price_range,lat,lng,address,city,state,zip_code
4,34522,192,Willie's Grill &amp; Icehouse (7911 Interstate),4.5,92.0,"American, Burgers, Family Friendly, Wings, Alc...",$,29.350066,-98.537672,7911 Interstate 35 Access Rd,San Antonio,TX,78224.0
8,34527,190,McDonald's¬Æ (1643 Pleasanton Road),4.2,167.0,"American, Fast Food, Burgers",$,29.366126,-98.504588,1643 Pleasanton Rd,San Antonio,TX,78221.0
25,34547,215,The Philly Cheesesteak Company (1011 N Loop 16...,3.9,41.0,"American, Sandwiches, Desserts",$,29.61114,-98.47781,1011 N Loop 1604 E,San Antonio,TX,78232.0
28,34551,171,Zito's Delicatessen &amp; Sandwich Shop (8800 ...,4.8,11.0,"Sandwiches, American, Healthy",$$,29.52026,-98.46073,8800 Broadway St,San Antonio,TX,78217.0
29,34552,160,"Nachos Muchachos (427 Lombrano St,)",3.2,33.0,"American, Mexican, Snacks",$$,29.44191,-98.51055,427 Lombrano St,San Antonio,TX,78207.0


In [5]:
restaurants_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 4 to 1214
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           558 non-null    int64  
 1   position     558 non-null    int64  
 2   name         558 non-null    object 
 3   score        558 non-null    float64
 4   ratings      558 non-null    float64
 5   category     558 non-null    object 
 6   price_range  558 non-null    object 
 7   lat          558 non-null    float64
 8   lng          558 non-null    float64
 9   address      558 non-null    object 
 10  city         558 non-null    object 
 11  state        558 non-null    object 
 12  zip_code     558 non-null    float64
dtypes: float64(5), int64(2), object(6)
memory usage: 61.0+ KB


In [7]:
#Saving the file
restaurants_df_cleaned.to_csv("Output/restaurants_df.csv", index=False, header=True)

## Data Preparation

In [9]:
#Make a copy of the DF to edit for Linear Regression
clean_copy = restaurants_df_cleaned.copy()
clean_copy.head()

Unnamed: 0,id,position,name,score,ratings,category,price_range,lat,lng,address,city,state,zip_code
4,34522,192,Willie's Grill &amp; Icehouse (7911 Interstate),4.5,92.0,"American, Burgers, Family Friendly, Wings, Alc...",$,29.350066,-98.537672,7911 Interstate 35 Access Rd,San Antonio,TX,78224.0
8,34527,190,McDonald's¬Æ (1643 Pleasanton Road),4.2,167.0,"American, Fast Food, Burgers",$,29.366126,-98.504588,1643 Pleasanton Rd,San Antonio,TX,78221.0
25,34547,215,The Philly Cheesesteak Company (1011 N Loop 16...,3.9,41.0,"American, Sandwiches, Desserts",$,29.61114,-98.47781,1011 N Loop 1604 E,San Antonio,TX,78232.0
28,34551,171,Zito's Delicatessen &amp; Sandwich Shop (8800 ...,4.8,11.0,"Sandwiches, American, Healthy",$$,29.52026,-98.46073,8800 Broadway St,San Antonio,TX,78217.0
29,34552,160,"Nachos Muchachos (427 Lombrano St,)",3.2,33.0,"American, Mexican, Snacks",$$,29.44191,-98.51055,427 Lombrano St,San Antonio,TX,78207.0


In [None]:
# position (Restaurant position in the search result)--keep
# score (Restaurant score)--going to drop this to predict it
# category (Restaurant category)--Maybe keep?? may mean we sacrifice clustering later on
# price_range (Restaurant price range - $ = Inexpensive, $$ = Moderately expensive, $$$ = Expensive, $$$$ = Very Expensive) - Source - stackoverflow
# zip_code (Zip code)--keep!


#select the 'score', 'category'??,'price_range', 'zip_code'
x=position, category, price_range, zip_code
y=score

In [16]:
#Convert the 'ratings' column from $ to number values
clean_copy['price_range'] = clean_copy['price_range'].replace('$', '1')
clean_copy['price_range'] = clean_copy['price_range'].replace('$$', '2')
clean_copy['price_range'] = clean_copy['price_range'].replace('$$$', '3')
clean_copy['price_range'] = clean_copy['price_range'].replace('$$$$', '4')

In [17]:
clean_copy.head()

Unnamed: 0,id,position,name,score,ratings,category,price_range,lat,lng,address,city,state,zip_code
4,34522,192,Willie's Grill &amp; Icehouse (7911 Interstate),4.5,92.0,"American, Burgers, Family Friendly, Wings, Alc...",1,29.350066,-98.537672,7911 Interstate 35 Access Rd,San Antonio,TX,78224.0
8,34527,190,McDonald's¬Æ (1643 Pleasanton Road),4.2,167.0,"American, Fast Food, Burgers",1,29.366126,-98.504588,1643 Pleasanton Rd,San Antonio,TX,78221.0
25,34547,215,The Philly Cheesesteak Company (1011 N Loop 16...,3.9,41.0,"American, Sandwiches, Desserts",1,29.61114,-98.47781,1011 N Loop 1604 E,San Antonio,TX,78232.0
28,34551,171,Zito's Delicatessen &amp; Sandwich Shop (8800 ...,4.8,11.0,"Sandwiches, American, Healthy",2,29.52026,-98.46073,8800 Broadway St,San Antonio,TX,78217.0
29,34552,160,"Nachos Muchachos (427 Lombrano St,)",3.2,33.0,"American, Mexican, Snacks",2,29.44191,-98.51055,427 Lombrano St,San Antonio,TX,78207.0


In [18]:
# Reformat data of the independent variable X as a single-column array
X = restaurants_df_cleaned["price_range"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[4.5],
       [4.2],
       [3.9],
       [4.8],
       [3.2]])

In [19]:
# The shape of X is 558 samples, with a single feature (column)
X.shape

(558, 1)

In [20]:
# Create an array for the dependent variable y
y = restaurants_df_cleaned["ratings"]

## Building the Linear Regression Model

In [23]:
#Linear Regression to determine the value that might replace the null rating and score values
# Create a model with scikit-learn
model = LinearRegression()

#standardScaler()

# Fit the data into the model
model.fit(X, y)

LinearRegression()

In [24]:
# Display the slope
print(f"Model's slope: {model.coef_}")

# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's slope: [39.26707618]
Model's y-intercept: -94.7311844079377
Model's formula: y = -94.7311844079377 + 39.267076182787264X


In [29]:
# Display the formula to predict the rating for a restaurant based on its price range
print(f"Model's $ formula: y = {model.intercept_} + {model.coef_[0]} * 1")
print(f"Model's $$ formula: y = {model.intercept_} + {model.coef_[0]} * 2")
print(f"Model's $$$ formula: y = {model.intercept_} + {model.coef_[0]} * 3")
print(f"Model's $$$$ formula: y = {model.intercept_} + {model.coef_[0]} * 4")

Model's $ formula: y = -94.7311844079377 + 39.267076182787264 * 1
Model's $$ formula: y = -94.7311844079377 + 39.267076182787264 * 2
Model's $$$ formula: y = -94.7311844079377 + 39.267076182787264 * 3
Model's $$$$ formula: y = -94.7311844079377 + 39.267076182787264 * 4


In [27]:
#Predict the rating for a restaurant
y_1=model.intercept_ + model.coef_[0]*1
y_2=model.intercept_ + model.coef_[0]*2
y_3=model.intercept_ + model.coef_[0]*3
y_4=model.intercept_ + model.coef_[0]*4

# Display the prediction
print(f"Predicted rating for $ restaurant: {y_1:.2f}")
print(f"Predicted rating for $$ restaurant: {y_2:.2f}")
print(f"Predicted rating for $$$ restaurant: {y_3:.2f}")
print(f"Predicted rating for $$$$ restaurant: {y_4:.2f}")

Predicted rating for $ restaurant: -55.46
Predicted rating for $$ restaurant: -16.20
Predicted rating for $$$ restaurant: 23.07
Predicted rating for $$$$ restaurant: 62.34


In [26]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [None]:
# Add a column with the predicted salary values
df_ratings_predicted["salary_predicted"] = predicted_y_values

# Display sample data
df_ratings_predicted.head()

In [None]:
# Create a line plot of the predicted salary values
best_fit_line = df_salary_predicted.hvplot.line(
    x = "years_experience",
    y = "salary_predicted",
    color = "red"
)
best_fit_line

In [None]:
# Superpose the original data and the best fit line
salary_plot * best_fit_line

## Linear Regression Model Assessment

In [None]:
# Compute the metrics for the linear regression model
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")