# Revenue Forecasting (Regularized Linear Regression)

## Load libraries and data

In [1]:
import io
import requests
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge,
                                  Lasso, ElasticNet)
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Make this notebook's output stable across runs
np.random.seed(100)

# Plot formatting
%matplotlib inline
sns.set()
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Do not use scientific notation in pandas tables
pd.options.display.float_format = '{:,.4f}'.format

### Data Description

When retailers close stores, they usually conduct going-out-of-business (GOB) sales.

In the data set, each row represents a store's going out of business sale.

| Attribute             | Definition                                                                |
| --------------------- | ------------------------------------------------------------------------- |
| StoreType             | Store's retail segment (bookstores or household goods)                    |
| LiquidationRevenue    | Revenue from the GOB sale                                                 |
| InventoryAtTicket     | Retail value of inventory held at the start of the GOB sale               |
| LastYearRevenue       | Store's revenue over the GOB sale dates during the prior year             |                      
| MedianHouseholdIncome | Median household income in the store's ZIP code (from U.S. Census Bureau) |
| MedianAge             | Median age in the store's ZIP code (from U.S. Census Bureau)              |


In [2]:
url = 'https://raw.githubusercontent.com/natecraig/aiml/main/Data/closings.csv'
download = requests.get(url).content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))
df.head(-5)

Unnamed: 0,StoreType,LiquidationRevenue,InventoryAtTicket,LastYearRevenue,MedianHouseholdIncome,MedianAge
0,Bookstore,1214776.2000,2195069.8000,465237.3000,51290,37.8000
1,Bookstore,1811896.9000,3152301.9000,897704.5000,46702,33.4000
2,Bookstore,1157614.7000,2229837.5000,477804.8000,86804,41.7000
3,Bookstore,2037136.2000,3857466.8000,1036097.7000,83544,40.3000
4,Bookstore,1326821.1000,2345263.8000,612793.5000,23508,31.9000
...,...,...,...,...,...,...
553,Household Goods,1823324.8000,2959365.5300,936848.3300,56052,32.1000
554,Household Goods,1701298.5000,2244323.7300,1361859.4000,42424,35.8000
555,Household Goods,1335948.9000,2921797.7800,699678.4300,46227,30.4000
556,Household Goods,1923520.5000,2516588.0300,1238979.5000,55243,34.7000


## Model

In [3]:
# Transform features
df['Bookstore'] = df['StoreType'].apply(lambda x: 1 if x == 'Bookstore' else 0)
df['BookstoreXInventoryAtTicket'] = df['Bookstore'] * df['InventoryAtTicket']

# Add square terms
numvars = ['InventoryAtTicket', 'MedianHouseholdIncome']
for v in numvars:
    df[v + 'Sq'] = df[v]**2
    
# Set the target, y, and features, X
y = df['LiquidationRevenue']

feature_names = ['Bookstore', 'BookstoreXInventoryAtTicket',
                 'InventoryAtTicket', 'InventoryAtTicketSq',
                 'MedianHouseholdIncome', 'MedianHouseholdIncomeSq']
X = df[feature_names]

# Scale data
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

# Split data into training and test sets

# Note that we are using a large test size (75%) to emphasize the
# challenge of generalization
(X_train, X_test,
 y_train, y_test) = train_test_split(X_scale, y, test_size=0.75)

In [4]:
# Fit a linear regresion
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_train_pred = linreg.predict(X_train)
y_pred = linreg.predict(X_test)

In [5]:
# Retain coefficients and performance
df_regs = pd.DataFrame(columns=(['Intercept'] + feature_names
                                + ['Train R2', 'Test R2']))
df_regs.loc['Linear'] = ([linreg.intercept_] + list(linreg.coef_)
                         + [r2_score(y_train, y_train_pred),
                            r2_score(y_test, y_pred)])

In [6]:
# Set regularization parameter for ridge, lasso, and elasticnet
alpha=2
l1_ratio=0.5

## Ridge Regression

In [7]:
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_pred = ridge.predict(X_test)

df_regs.loc['Ridge'] = ([ridge.intercept_] + list(ridge.coef_)
                         + [r2_score(y_train, y_train_pred),
                            r2_score(y_test, y_pred)])

## LASSO Regression

In [8]:
lasso = Lasso(alpha=alpha, max_iter=100000)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_pred = lasso.predict(X_test)

df_regs.loc['Lasso'] = ([lasso.intercept_] + list(lasso.coef_)
                         + [r2_score(y_train, y_train_pred),
                            r2_score(y_test, y_pred)])

## Elastic Net Regression

In [9]:
elastic = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
elastic.fit(X_train, y_train)
y_train_pred = elastic.predict(X_train)
y_pred = elastic.predict(X_test)

df_regs.loc['ElasticNet'] = ([elastic.intercept_] + list(elastic.coef_)
                             + [r2_score(y_train, y_train_pred),
                                r2_score(y_test, y_pred)])

## Compare Regularization Methods

In [10]:
df_regs

Unnamed: 0,Intercept,Bookstore,BookstoreXInventoryAtTicket,InventoryAtTicket,InventoryAtTicketSq,MedianHouseholdIncome,MedianHouseholdIncomeSq,Train R2,Test R2
Linear,1756243.7299,518455.671,-657002.4275,1092316.5678,-565560.1025,234845.9518,-260800.7061,0.835,0.772
Ridge,1772018.0259,-94260.0153,-140666.8102,266904.5789,124329.4978,119850.1682,-126339.8095,0.8278,0.8222
Lasso,1756280.5748,516904.7844,-655692.23,1090257.663,-563842.8231,234709.4959,-260635.5717,0.835,0.7722
ElasticNet,1764730.2758,-118528.4487,-96173.1549,131436.7686,121057.4608,13813.823,235.8409,0.7658,0.7498


## Exercise

Try different values for alpha and l1_ratio. How does Test R2 change?