In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the Camry data set from the working directory, then print a
# per-column summary (non-null counts and dtypes) as a quick sanity check.
camry = pd.read_csv(filepath_or_buffer='Camry_242_Spring2023.csv')
camry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MonthNumeric   174 non-null    int64  
 1   MonthFactor    174 non-null    object 
 2   Year           174 non-null    int64  
 3   CamrySales     174 non-null    int64  
 4   Unemployment   174 non-null    float64
 5   CamryQueries   174 non-null    int64  
 6   CPIAll         174 non-null    float64
 7   CPIEnergy      174 non-null    float64
 8   MilesTraveled  174 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 12.4+ KB


In [3]:
# Chronological split on the Year column: fit on everything through 2018,
# then hold out two later windows (2019-2020 and 2021 onward) for evaluation.
camry_train = camry.loc[camry['Year'] <= 2018]
camry_testA = camry.loc[camry['Year'].between(2019, 2020)]  # bounds are inclusive
camry_testB = camry.loc[camry['Year'] >= 2021]

# Row counts of the three partitions, displayed as the cell output
tuple(len(part) for part in (camry_train, camry_testA, camry_testB))

(132, 24, 18)

In [4]:
import statsmodels.formula.api as smf

# Baseline linear regression of CamrySales on every other predictor except
# MonthNumeric and Year, fitted on the training years only. MonthFactor is a
# string (object) column, so the formula interface treats it as categorical.
modOld = smf.ols(
    formula='CamrySales ~ MonthFactor + Unemployment + CamryQueries + CPIAll + CPIEnergy + MilesTraveled',
    data=camry_train,
).fit()
print(modOld.summary())

                            OLS Regression Results                            
Dep. Variable:             CamrySales   R-squared:                       0.511
Model:                            OLS   Adj. R-squared:                  0.443
Method:                 Least Squares   F-statistic:                     7.499
Date:                Tue, 31 Jan 2023   Prob (F-statistic):           9.58e-12
Time:                        13:44:24   Log-Likelihood:                -1303.3
No. Observations:                 132   AIC:                             2641.
Df Residuals:                     115   BIC:                             2690.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [5]:
import matplotlib.pyplot as plt
def coefplot(results):
    '''
    Plot a fitted model's coefficients with their 95% confidence intervals.

    Every non-intercept coefficient is drawn as a point with a vertical
    error bar, ordered by coefficient value (ascending), with a dashed red
    line marking zero.

    Parameters
    ----------
    results : fitted statsmodels results object
        Must provide ``params`` and ``conf_int()``. When ``params`` is not a
        pandas Series, ``results.model.exog_names`` supplies the labels.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.

    Raises
    ------
    ValueError
        If no coefficients remain once the intercept has been removed.

    Notes
    -----
    The intercept is dropped whether it is labelled ``const`` or
    ``Intercept`` (the label used by the formula interface); a model fitted
    without an intercept is plotted as is instead of raising.
    '''
    # Read the estimates straight from the results object instead of parsing
    # the rounded text of results.summary().
    params = results.params
    if not isinstance(params, pd.Series):
        params = pd.Series(np.asarray(params), index=results.model.exog_names)

    # Lower bounds of the 95% interval; conf_int() may hand back either a
    # DataFrame or a plain array, so normalise to an array first.
    lower = np.asarray(results.conf_int(alpha=0.05))[:, 0]

    coef_df = pd.DataFrame({'coef': params.astype(float)})

    # Error-bar half-width: (coef - lower bound of conf interval)
    coef_df['errors'] = coef_df['coef'].to_numpy() - lower

    # Drop the intercept under either of its usual labels; tolerate its absence
    coef_df = coef_df.drop(index=['const', 'Intercept'], errors='ignore')
    if coef_df.empty:
        raise ValueError('No coefficients to plot besides the intercept.')

    # Sort values by coef ascending
    coef_df = coef_df.sort_values(by='coef')

    # x-labels, stored as a column so DataFrame.plot can use them
    coef_df['variables'] = list(coef_df.index.values)

    ### Plot Coefficients ###

    # Apply seaborn's "poster" context when seaborn is installed. This changes
    # the global plotting context, exactly as the original call did. Without
    # seaborn the plot is still drawn with matplotlib's current settings.
    try:
        import seaborn as sns
    except ImportError:
        sns = None
    if sns is not None:
        sns.set_context("poster")

    # Define figure, axes, and plot
    fig, ax = plt.subplots(figsize=(8, 5))

    # Error bars for 95% confidence interval; the bars themselves are
    # invisible (color='none') so only the whiskers show.
    # Can increase capsize to add whisker caps.
    coef_df.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22,
                 ecolor='steelblue', capsize=0,
                 yerr='errors', legend=False)

    # Set title & labels
    ax.set_title('Coefficients of Features w/ 95% Confidence Intervals', fontsize=30)
    ax.set_ylabel('Coefficients', fontsize=22)
    ax.set_xlabel('', fontsize=22)

    # Coefficient point estimates, placed at the bar positions 0..n-1
    ax.scatter(x=np.arange(coef_df.shape[0]),
               marker='o', s=80,
               y=coef_df['coef'], color='steelblue')

    # Line to define zero on the y-axis
    ax.axhline(y=0, linestyle='--', color='red', linewidth=1)

    return plt.show()

In [6]:
# compute out-of-sample R-squared using the test set
def OSR2(model, df_train, df_test, dependent_var):
    """
    Out-of-sample R-squared of ``model`` on ``df_test``.

    Computed as ``1 - SSE/SST``, where SSE is the squared error of the
    model's predictions on the test rows and SST is the squared error of a
    baseline that always predicts the training-set mean of the target.

    Parameters
    ----------
    model : object with a ``predict(df)`` method
    df_train : DataFrame holding ``dependent_var`` for the training rows
    df_test : DataFrame passed to ``model.predict`` and holding the actuals
    dependent_var : name of the target column

    Returns
    -------
    The value ``1 - SSE/SST``.
    """
    actual = df_test[dependent_var]
    predicted = model.predict(df_test)

    # Baseline deliberately uses the *training* mean, not the test mean
    baseline = np.mean(df_train[dependent_var])

    model_sq_err = np.sum((actual - predicted) ** 2)
    baseline_sq_err = np.sum((actual - baseline) ** 2)
    return 1 - model_sq_err / baseline_sq_err

In [7]:
# Out-of-sample R-squared of the baseline model on each held-out window
# (test set A first, then test set B), one value per line.
for held_out in (camry_testA, camry_testB):
    print(OSR2(modOld, camry_train, held_out, 'CamrySales'))

0.45069736412770733
-0.15909607101438872
