In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [78]:
# Read the tips dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='tips.csv')
df.head(n=5)

In [79]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

In [80]:
# Count missing values per column, largest count first.
df.isnull().sum().sort_values(ascending=False)

In [81]:
# Target is the 'tip' column; every other column is a candidate feature.
y = df.loc[:, 'tip']
X = df.drop(columns=['tip'])

In [82]:
# Number of distinct values in each column.
df.nunique(axis=0)

In [83]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [84]:
# Correlate only the numeric columns. The frame also contains string-valued
# columns (the ones listed in `categorical` below); DataFrame.corr() raises on
# those with pandas >= 2.0, so they are filtered out explicitly.
numeric_df = df.select_dtypes(include='number')

sns.heatmap(numeric_df.corr(), cmap='Blues', annot=True)
plt.title('Correlation between numeric columns')
plt.show()

In [85]:
# Scatter of tip against total bill; axis labels added so the figure can be
# read on its own.
plt.scatter(df['total_bill'], df['tip'], s=5)
plt.xlabel('Total Bill')
plt.ylabel('Tip')
plt.title('Total Bill vs. Tip')
plt.show()

In [86]:
# Overlay the two distributions. Both histograms are semi-transparent so the
# second one does not hide the first, and each carries its own label so the
# legend does not depend on the order of the plotting calls.
plt.hist(df['total_bill'], alpha=0.6, label='Total Bill')
plt.hist(df['tip'], alpha=0.6, label='Tip')

plt.xlabel('Amount')
plt.ylabel('Count')
plt.title('Distribution of Total Bill and Tip')
plt.legend()
plt.show()

In [87]:
# Columns that hold category labels and need one-hot encoding.
categorical = ['sex','smoker','day','time']

# Build both parts from `df` rather than from `X`: `X` is overwritten with the
# encoded frame in a later cell, so reading it here would make this cell fail
# (the categorical columns would already be gone) when it is re-run.
X_num = df.drop(columns=categorical + ['tip'])
X_cat = df.filter(items=categorical)

# drop_first=True drops one dummy column per categorical variable.
X_cat = pd.get_dummies(X_cat, drop_first=True)
X_cat.head()

In [88]:
# Final feature matrix: numeric columns followed by the one-hot encoded ones.
X = pd.concat([X_num, X_cat], axis=1)

# Column-wise concat aligns on the index; a row-count change would mean the
# two parts were not aligned.
assert len(X) == len(df), "feature matrix lost or gained rows during concat"

# Preview only the first rows instead of dumping the whole frame.
X.head()

In [89]:
# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# With test_size=0.2 the *_test variables receive the 20% hold-out and the
# *_train variables the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

## Linear Regression

In [90]:
# Fit on the training split and predict on the test split, so the metrics
# computed from y_pred measure performance on data the model did not see
# during fitting.
lr = LinearRegression()
regressor_lr = lr.fit(X_train, y_train)
y_pred = regressor_lr.predict(X_test)

In [91]:
# Error metrics for the linear regression predictions currently in y_pred.
MSE_lr = mean_squared_error(y_test, y_pred)
MAE_lr = mean_absolute_error(y_test, y_pred)
RMSE_lr = np.sqrt(MSE_lr)

for metric_name, metric_value in (
    ('Mean Absolute Error', MAE_lr),
    ('Mean Squared Error', MSE_lr),
    ('Root Mean Squared Error', RMSE_lr),
):
    print(f'{metric_name}: {metric_value}')

## KNN Regression

In [92]:
# Sweep the number of neighbours from 1 to 9 and record the error of each
# fitted KNN regressor on (X_test, y_test).
neighbors = range(1,10)
MAE_lst, MSE_lst, RMSE_lst = [], [], []

for k in neighbors:
    knn = KNeighborsRegressor(n_neighbors=k)
    regressor = knn.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # MSE is computed once and reused for the RMSE.
    mse_k = mean_squared_error(y_test, y_pred)
    MAE_lst.append(mean_absolute_error(y_test, y_pred))
    MSE_lst.append(mse_k)
    RMSE_lst.append(np.sqrt(mse_k))

# One row per neighbour count, one column per metric.
d = {
    'Neighbors': neighbors,
    'MAE': MAE_lst,
    'MSE': MSE_lst,
    'RMSE': RMSE_lst,
}
errors = pd.DataFrame(data=d)

In [93]:
# Plot MAE against the actual neighbour count. Plotting the 'MAE' column alone
# would put the row index (0..8) on the x-axis, which is off by one from the
# neighbour counts (1..9) stored in the 'Neighbors' column.
plt.plot(errors['Neighbors'], errors['MAE'], marker='o')
plt.xlabel('Number of neighbors')
plt.ylabel('Mean Absolute Error')
plt.title('KNN: MAE by number of neighbors')
plt.show()

In [94]:
# Refit KNN with 6 neighbours and predict on X_test.
knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train, y_train)
regressor_knn = knn  # fit() returns the estimator itself
y_pred = regressor_knn.predict(X_test)

In [95]:
# Error metrics for the KNN predictions currently in y_pred.
MSE_knn = mean_squared_error(y_test, y_pred)
MAE_knn = mean_absolute_error(y_test, y_pred)
RMSE_knn = np.sqrt(MSE_knn)

for metric_name, metric_value in (
    ('Mean Absolute Error', MAE_knn),
    ('Mean Squared Error', MSE_knn),
    ('Root Mean Squared Error', RMSE_knn),
):
    print(f'{metric_name}: {metric_value}')

## Random Forest Regression

In [96]:
# Random forest with 1,000 trees; random_state is fixed so repeated runs build
# the same forest.
rf = RandomForestRegressor(random_state=44, n_estimators=1_000)
rf.fit(X_train, y_train)
regressor_rf = rf  # fit() returns the estimator itself
y_pred = regressor_rf.predict(X_test)

In [97]:
# Error metrics for the random forest predictions currently in y_pred.
MSE_rf = mean_squared_error(y_test, y_pred)
MAE_rf = mean_absolute_error(y_test, y_pred)
RMSE_rf = np.sqrt(MSE_rf)

for metric_name, metric_value in (
    ('Mean Absolute Error', MAE_rf),
    ('Mean Squared Error', MSE_rf),
    ('Root Mean Squared Error', RMSE_rf),
):
    print(f'{metric_name}: {metric_value}')

In [98]:
# Inputs for the grouped bar chart comparing the three models.
labels = ['Linear Regression', 'KNN Regression', 'Random Forest Regression']
legend = ['Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error']

# One list per metric, ordered like `labels`.
MAEs, MSEs, RMSEs = (
    [MAE_lr, MAE_knn, MAE_rf],
    [MSE_lr, MSE_knn, MSE_rf],
    [RMSE_lr, RMSE_knn, RMSE_rf],
)

# Bar-group positions (one per model) and the width of a single bar.
width = 0.2
x = np.arange(3)

In [99]:
# Grouped bars: one group per model, one bar per metric. The outer bars are
# shifted by exactly one bar width so the three bars sit side by side.
plt.bar(x - width, MAEs, width, color='cyan')
plt.bar(x, MSEs, width, color='orange')
plt.bar(x + width, RMSEs, width, color='green')

# Put the model names on the group positions. plt.xlabel() expects a single
# axis caption, so passing the list there would not label the groups.
plt.xticks(x, labels)
plt.ylabel('Error')
plt.title('Error metrics by model')
plt.legend(legend)
plt.show()

In [100]:
# Pick the row labelled 100 from the raw data as a single worked example.
test_entry = df.loc[100, :]
test_entry

In [101]:
# Select the same row by label (matching df.loc[100]) and keep it as a one-row
# DataFrame. Going through X.values would switch to positional indexing and
# strip the column names from the encoded features.
test_entry_formatted = X.loc[[100]]

## Tip Prediction

### Linear Regression

In [102]:
# Predict the tip for the single example row with the linear model.
lr_pred_tip = regressor_lr.predict(test_entry_formatted)

# Look the actual tip up by its label. test_entry is indexed by column name,
# and integer keys on such a Series are a deprecated positional fallback.
print(f"Actual Tip: {test_entry['tip']}")
print(f'Predicted Tip: {round(lr_pred_tip[0],2)}')

### KNN Regression

In [103]:
# Predict the tip for the single example row with the KNN model.
knn_pred_tip = regressor_knn.predict(test_entry_formatted)

# Look the actual tip up by its label. test_entry is indexed by column name,
# and integer keys on such a Series are a deprecated positional fallback.
print(f"Actual Tip: {test_entry['tip']}")
print(f'Predicted Tip: {round(knn_pred_tip[0],2)}')

### Random Forest Regression

In [104]:
# Predict the tip for the single example row with the random forest model.
rf_pred_tip = regressor_rf.predict(test_entry_formatted)

# Look the actual tip up by its label. test_entry is indexed by column name,
# and integer keys on such a Series are a deprecated positional fallback.
print(f"Actual Tip: {test_entry['tip']}")
print(f'Predicted Tip: {round(rf_pred_tip[0],2)}')

In [105]:
# Build the tick labels as a new list instead of appending to `labels`.
# Appending would grow the shared list on every re-run of this cell and leave
# it out of step with `values`.
bar_labels = labels + ['Actual Tip']

# predict() returns a one-element array per model; take the scalar so every
# bar height is a plain number, like the actual tip.
values = [lr_pred_tip[0], knn_pred_tip[0], rf_pred_tip[0], test_entry['tip']]

plt.bar(bar_labels, values)
plt.xticks(rotation=45)
plt.ylabel('Tip')
plt.title('Comparison of predicted tips by model')
plt.show()