<a href="https://colab.research.google.com/github/torchflash/MMAI831-AIOS-Assignment/blob/main/Tony/AIOS_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1.Data Upload**

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb


In [22]:
csv_url = 'https://drive.google.com/uc?export=download&id=1GPlcEcdy3e6KtS5eiH4lhPV1OjCNwT8_'
df = pd.read_csv(csv_url)
df = df.drop(df.columns[0], axis=1)

In [23]:
seed = 831

In [24]:
sales_train, sales_test = train_test_split(df, test_size=0.25, random_state=seed)

In [25]:
sales_train

Unnamed: 0,store,billboard,printout,sat,comp,price,sales
247,2101,1051,795,74,907,92,20850.424070
291,2045,1039,1131,69,918,91,20335.752820
220,2223,1274,979,67,938,95,22065.397050
840,1593,910,728,67,1022,103,11253.624270
32,1822,704,606,67,594,98,12991.913740
...,...,...,...,...,...,...,...
547,2048,810,792,68,919,108,14456.275280
737,1925,1576,926,64,471,96,23297.637780
9,1777,1111,340,72,758,108,15947.783620
659,2127,219,773,70,1004,98,8936.227514


In [26]:
sales_test

Unnamed: 0,store,billboard,printout,sat,comp,price,sales
692,2142,1032,579,69,901,103,18446.99213
805,2194,953,914,68,719,107,16403.58639
627,1905,1185,851,68,705,100,20317.30926
387,2078,1437,622,65,943,94,25472.98324
51,1854,1122,1236,64,845,98,16533.62366
...,...,...,...,...,...,...,...
716,1334,881,967,73,1024,96,12450.35271
724,2030,799,846,72,544,96,17491.66005
639,2316,867,1415,76,706,104,20622.98974
175,1790,1255,571,64,715,100,16548.68130


In [27]:
sales_train['store&sat'] = sales_train['store'] * sales_train['sat']
sales_train['price&sat'] = sales_train['price'] * sales_train['sat']
sales_train['store&price'] = sales_train['store'] * sales_train['price']


In [28]:
sales_train

Unnamed: 0,store,billboard,printout,sat,comp,price,sales,store&sat,price&sat,store&price
247,2101,1051,795,74,907,92,20850.424070,155474,6808,193292
291,2045,1039,1131,69,918,91,20335.752820,141105,6279,186095
220,2223,1274,979,67,938,95,22065.397050,148941,6365,211185
840,1593,910,728,67,1022,103,11253.624270,106731,6901,164079
32,1822,704,606,67,594,98,12991.913740,122074,6566,178556
...,...,...,...,...,...,...,...,...,...,...
547,2048,810,792,68,919,108,14456.275280,139264,7344,221184
737,1925,1576,926,64,471,96,23297.637780,123200,6144,184800
9,1777,1111,340,72,758,108,15947.783620,127944,7776,191916
659,2127,219,773,70,1004,98,8936.227514,148890,6860,208446


In [29]:
X_train = sales_train[['store', 'billboard', 'printout', 'sat', 'comp', 'price','store&sat','price&sat','store&price']]
y_train = sales_train['sales']
X_train = sm.add_constant(X_train)

In [30]:
sales_test['store&sat'] = sales_test['store'] * sales_test['sat']
sales_test['price&sat'] = sales_test['price'] * sales_test['sat']
sales_test['store&price'] = sales_test['store'] * sales_test['price']

# **1.Linear Regression**

In [31]:
model1 = sm.OLS(y_train, X_train)

In [32]:
results1 = model1.fit()

In [33]:
X_test = sales_test[['store', 'billboard', 'printout', 'sat', 'comp', 'price','store&sat','price&sat','store&price']]
y_test = sales_test['sales']
X_test = sm.add_constant(X_test)

In [34]:
y_pred = results1.predict(X_test)

In [35]:
mse = mean_squared_error(y_test, y_pred)
r2 = results1.rsquared
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

# Calculate AIC and BIC
n = len(y_train)  # Number of samples
k = X_train.shape[1]  # Number of features + 1 (including the constant)
llf = results1.llf  # Log-likelihood of the model

aic = -2 * llf + 2 * k  # AIC
bic = -2 * llf + np.log(n) * k  # BIC

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")
print(f"AIC: {aic}")
print(f"BIC: {bic}")

Mean Squared Error (MSE): 1165544.2746332064
R-squared (R2): 0.9188468924312454
Mean Absolute Percentage Difference (MAPD): 5.169938056984556
AIC: 12544.289452535419
BIC: 12590.490184600721


# **2.Random Forest**

In [36]:
model2 = RandomForestRegressor()

# Train the model
model2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model2.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

Mean Squared Error (MSE): 1763954.7358419453
R-squared (R2): 0.8844379089048081
Mean Absolute Percentage Difference (MAPD): 6.2076091362936205


# **3.SVM**

In [37]:
model3 = SVR()

# Train the model
model3.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model3.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

Mean Squared Error (MSE): 15469806.648235802
R-squared (R2): -0.013474534682513362
Mean Absolute Percentage Difference (MAPD): 18.5553118259976


# **4.Ridge**

In [38]:
model4 = Ridge()

# Train the model
model4.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model4.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

Mean Squared Error (MSE): 1165459.3755836473
R-squared (R2): 0.9236471776784805
Mean Absolute Percentage Difference (MAPD): 5.169856425173428


# **5.Lasso**

In [43]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a Lasso regression model
model5 = Lasso(alpha=0.1)  # Adjust the value of alpha as needed

# Train the model
model5.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model5.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

# Calculate AIC and BIC
n = len(y_test)
p = len(X_test.columns)
rss = np.sum((y_test - y_pred) ** 2)
aic = n * np.log(rss/n) + 2 * p
bic = n * np.log(rss/n) + p * np.log(n)

print(f"AIC: {aic}")
print(f"BIC: {bic}")
print(f"R-squared (R2): {r2}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

AIC: 3512.01301020822
BIC: 3547.2276193868424
R-squared (R2): 0.9236909794310645
Mean Squared Error (MSE): 1164790.7799553012
Mean Absolute Percentage Difference (MAPD): 5.17165042175985


  model = cd_fast.enet_coordinate_descent(


# **6.KNN**

In [45]:
model6 = KNeighborsRegressor(n_neighbors=5)  # Adjust the value of n_neighbors as needed

# Train the model
model6.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model6.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

print(f"R-squared (R2): {r2}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

R-squared (R2): 0.2818406092686023
Mean Squared Error (MSE): 10962078.016799763
Mean Absolute Percentage Difference (MAPD): 15.869091240288535


# **7.LightGBM**

In [48]:
# Create a LightGBM regression model
model7 = lgb.LGBMRegressor()

# Train the model
model7.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model7.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mapd = mean_absolute_percentage_error(y_test, y_pred) * 100

print(f"R-squared (R2): {r2}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Percentage Difference (MAPD): {mapd}")

R-squared (R2): 0.8904022818285557
Mean Squared Error (MSE): 1672913.7745243988
Mean Absolute Percentage Difference (MAPD): 5.830810487373542
