In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Excel workbook into a DataFrame and preview its first rows.
# NOTE(review): the file name below is reproduced verbatim; 'dtaset' may be a
# typo for 'dataset' -- verify against the actual file before changing it.
dataset_path = '../data/extra_dtaset.xlsx'
df = pd.read_excel(dataset_path)
df.head()

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
from sklearn.model_selection import train_test_split

# The later cells add prediction columns to both frames. Taking explicit
# copies makes `train` and `test` independent of `df`, so those column
# assignments do not trigger pandas' SettingWithCopyWarning and can never
# write through to the frame that was loaded from disk.
train, test = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline model: ordinary least squares predicting 'lunch_prop'.
# The keys 'bogo ' and 'chicken ' include a trailing space and are used
# verbatim as column labels.
lunch_features = ['bogo ', 'chicken ', 'guest', 'breakfast_prop']

linreg = LinearRegression()
linreg.fit(train[lunch_features], train['lunch_prop'])

# Store the fitted model's predictions next to the actual values, first for
# the training rows and then for the held-out rows.
for frame in (train, test):
    frame['lunch_prop_pred'] = linreg.predict(frame[lunch_features])

# First printed line: RMSE (train, test). Second line: R^2 (train, test).
print(*[
    np.sqrt(mean_squared_error(frame['lunch_prop'], frame['lunch_prop_pred']))
    for frame in (train, test)
])
print(*[
    r2_score(frame['lunch_prop'], frame['lunch_prop_pred'])
    for frame in (train, test)
])

In [None]:
# Second linear model: predict 'dinner_prop', using the actual 'lunch_prop'
# as one of the inputs. Note that this rebinds `linreg`, so the estimator
# fitted for lunch is no longer reachable under that name after this cell.
dinner_features = ['bogo ', 'chicken ', 'guest', 'lunch_prop']

linreg = LinearRegression()
linreg.fit(train[dinner_features], train['dinner_prop'])

# Write predictions for the training rows, then for the held-out rows.
for frame in (train, test):
    frame['dinner_prop_pred'] = linreg.predict(frame[dinner_features])

# First printed line: RMSE (train, test). Second line: R^2 (train, test).
print(*[
    np.sqrt(mean_squared_error(frame['dinner_prop'], frame['dinner_prop_pred']))
    for frame in (train, test)
])
print(*[
    r2_score(frame['dinner_prop'], frame['dinner_prop_pred'])
    for frame in (train, test)
])

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Compare several regressors on the lunch-proportion task. Each model is fit
# on the training rows, its predictions are stored in
# 'lunch_prop_pred_<suffix>' on both frames, and one point
# (train RMSE, test RMSE) per model is drawn on a shared scatter plot.
lunch_features = ['bogo ', 'chicken ', 'guest', 'breakfast_prop']
lunch_target = 'lunch_prop'

# instantiating the models
# The tree-based estimators are randomised; seeding them (same seed as the
# train/test split) makes the stored predictions and the plot reproducible
# across kernel restarts.
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
knn = KNeighborsRegressor()
svr = SVR()
ridge = Ridge()
lasso = Lasso()
en = ElasticNet()

# (column suffix, estimator, matplotlib format string, legend label)
lunch_models = [
    ('dt', dt, 'bo', 'Decision Tree'),
    ('rf', rf, 'go', 'Random Forest'),
    ('knn', knn, 'ro', 'KNN'),
    ('svr', svr, 'co', 'SVR'),
    ('ridge', ridge, 'mo', 'Ridge'),
    ('lasso', lasso, 'yo', 'Lasso'),
    ('en', en, 'ko', 'ElasticNet'),
]

for suffix, model, fmt, label in lunch_models:
    # training the model
    model.fit(train[lunch_features], train[lunch_target])

    # predicting the lunch proportion
    pred_col = 'lunch_prop_pred_' + suffix
    train[pred_col] = model.predict(train[lunch_features])
    test[pred_col] = model.predict(test[lunch_features])

    # plotting the train and test RMSE as a single marker for this model
    train_rmse = np.sqrt(mean_squared_error(train[lunch_target], train[pred_col]))
    test_rmse = np.sqrt(mean_squared_error(test[lunch_target], test[pred_col]))
    plt.plot(train_rmse, test_rmse, fmt, label=label)

plt.xlabel('Train RMSE')
plt.ylabel('Test RMSE')
plt.title('Lunch Proportion')
plt.legend()
plt.show()

In [None]:
from sklearn.utils import resample

# Rebalance the training rows around a lunch-proportion cut-off: rows below
# the threshold form the "minority" group and are resampled with replacement
# until there are 1.5x as many of them as there are "majority" rows.
threshold = 0.6

below_threshold = train['lunch_prop'] < threshold
at_or_above_threshold = train['lunch_prop'] >= threshold
train_minority = train[below_threshold]
train_majority = train[at_or_above_threshold]

desired_length = int(len(train_majority)*1.5)
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=desired_length,
    random_state=42,
)

# Stack both groups and shuffle the row order with a fixed seed.
train_upsampled = pd.concat(
    [train_majority, train_minority_upsampled]
).sample(frac=1, random_state=42)

In [None]:
# Refit the existing decision tree `dt` on the upsampled training rows, then
# evaluate it on the original (non-upsampled) train and test frames. The
# predictions are written to 'lunch_prop_pred_dt', replacing any values that
# column already held.
upsampled_features = ['bogo ', 'chicken ', 'guest', 'breakfast_prop']

dt.fit(train_upsampled[upsampled_features], train_upsampled['lunch_prop'])
for frame in (train, test):
    frame['lunch_prop_pred_dt'] = dt.predict(frame[upsampled_features])

# First printed line: RMSE (train, test). Second line: R^2 (train, test).
print(*[
    np.sqrt(mean_squared_error(frame['lunch_prop'], frame['lunch_prop_pred_dt']))
    for frame in (train, test)
])
print(*[
    r2_score(frame['lunch_prop'], frame['lunch_prop_pred_dt'])
    for frame in (train, test)
])

In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
# cross_val_score is no longer called here but stays imported so that any
# other cell relying on this import keeps working.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.utils import resample

# Seeded so the per-fold scores are reproducible; left unfitted and used only
# as a template that is cloned for every fold.
dt = DecisionTreeRegressor(random_state=42)

cv_features = ['bogo ', 'chicken ', 'guest', 'breakfast_prop']

# performing 5-fold cross-validation
# The folds are drawn from the ORIGINAL training rows and the upsampling is
# redone inside each fitting fold. Cross-validating on `train_upsampled`
# directly would place duplicates of the same row in both the fitting and the
# validation part of a fold, which leaks information and makes the MSE look
# better than it is.
# `threshold` and the 1.5x ratio mirror the upsampling cell that builds
# `train_upsampled`.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
fold_mse = []
for fit_idx, valid_idx in folds.split(train):
    fold_fit = train.iloc[fit_idx]
    fold_valid = train.iloc[valid_idx]

    fold_minority = fold_fit[fold_fit['lunch_prop'] < threshold]
    fold_majority = fold_fit[fold_fit['lunch_prop'] >= threshold]
    # resample() cannot draw from an empty group, so only rebalance when both
    # sides of the threshold are present in this fold.
    if len(fold_minority) > 0 and len(fold_majority) > 0:
        fold_minority_upsampled = resample(
            fold_minority,
            replace=True,
            n_samples=int(len(fold_majority)*1.5),
            random_state=42,
        )
        fold_fit = pd.concat([fold_majority, fold_minority_upsampled])

    fold_model = clone(dt).fit(fold_fit[cv_features], fold_fit['lunch_prop'])
    fold_mse.append(
        mean_squared_error(
            fold_valid['lunch_prop'],
            fold_model.predict(fold_valid[cv_features]),
        )
    )

# One (positive) mean-squared-error value per fold.
scores = np.array(fold_mse)
print(scores)