# Dummy variables and predictions

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
raw_data = pd.read_csv('./data/dummies.csv')
raw_data

Unnamed: 0,SAT,GPA,Attendance
0,1714,2.40,No
1,1664,2.52,No
2,1760,2.54,No
3,1685,2.74,No
4,1693,2.83,No
...,...,...,...
79,1936,3.71,Yes
80,1810,3.71,Yes
81,1987,3.73,No
82,1962,3.76,Yes


In [3]:
# use dummy var to transform categorical No/Yes to 0/1
data = raw_data.copy()
data['Attendance'] = data['Attendance'].map({'Yes': 1, 'No': 0})
data.describe()

Unnamed: 0,SAT,GPA,Attendance
count,84.0,84.0,84.0
mean,1845.27381,3.330238,0.464286
std,104.530661,0.271617,0.501718
min,1634.0,2.4,0.0
25%,1772.0,3.19,0.0
50%,1846.0,3.38,0.0
75%,1934.0,3.5025,1.0
max,2050.0,3.81,1.0


In [4]:
# Regression
y = data['GPA']
x1 = data[['SAT', 'Attendance']]
x = sm.add_constant(x1)
results = sm.OLS(y,x).fit()
results.summary()

0,1,2,3
Dep. Variable:,GPA,R-squared:,0.565
Model:,OLS,Adj. R-squared:,0.555
Method:,Least Squares,F-statistic:,52.7
Date:,"Wed, 17 May 2023",Prob (F-statistic):,2.19e-15
Time:,16:23:07,Log-Likelihood:,25.798
No. Observations:,84,AIC:,-45.6
Df Residuals:,81,BIC:,-38.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6439,0.358,1.797,0.076,-0.069,1.357
SAT,0.0014,0.000,7.141,0.000,0.001,0.002
Attendance,0.2226,0.041,5.451,0.000,0.141,0.304

0,1,2,3
Omnibus:,19.56,Durbin-Watson:,1.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.189
Skew:,-1.028,Prob(JB):,1.25e-06
Kurtosis:,4.881,Cond. No.,33500.0


In [5]:
# Predictions
# create data frame from x
new_data = pd.DataFrame({'const':1, 'SAT': [1700, 1670], 'Attendance': [0,1]})
new_data = new_data[['const', 'SAT', 'Attendance']] # override default alphabetical order
new_data.rename(index={0: 'Bob', 1: 'Alice'})
predictions = results.predict(new_data)
predictions

0    3.023513
1    3.204163
dtype: float64

In [7]:
predictionsdf = pd.DataFrame({'Predictions':predictions})
joined = new_data.join(predictionsdf)
joined.rename

<bound method DataFrame.rename of    const   SAT  Attendance  Predictions
0      1  1700           0     3.023513
1      1  1670           1     3.204163>