## Dealing with Categorical Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
# Switch matplotlib over to seaborn's default plot styling for the rest of the notebook.
sns.set()

In [2]:
# Load the dataset from a path relative to the notebook's working directory.
# raw_data is kept unmodified; later cells work on a copy of it.
raw_data = pd.read_csv('reference/1.03. Dummies.csv')

In [3]:
# Preview the first rows to check the columns and how Attendance is stored (text labels).
raw_data.head()

Unnamed: 0,SAT,GPA,Attendance
0,1714,2.4,No
1,1664,2.52,No
2,1760,2.54,No
3,1685,2.74,No
4,1693,2.83,No


In [4]:
# Work on a copy so that the frame loaded from disk stays untouched.
data = raw_data.copy()

In [5]:
# Encode the Yes/No attendance flag as a 1/0 dummy variable.
# The mapping reads from raw_data (not from data itself) so this cell is
# idempotent: mapping the already-encoded 0/1 column again would turn every
# value into NaN, because 0 and 1 are not keys of the mapping.
data['Attendance'] = raw_data['Attendance'].map({'Yes': 1, 'No': 0})
# Labels other than 'Yes'/'No' map to NaN; fail here instead of fitting on NaNs.
assert data['Attendance'].notna().all(), "Attendance contains a label other than 'Yes'/'No'"

In [6]:
# Confirm that Attendance now holds numeric dummy values instead of text labels.
data.head()

Unnamed: 0,SAT,GPA,Attendance
0,1714,2.4,0
1,1664,2.52,0
2,1760,2.54,0
3,1685,2.74,0
4,1693,2.83,0


In [7]:
# Summary statistics for every numeric column.
# For the 0/1 Attendance dummy, the mean is the share of rows encoded as 1.
data.describe()

Unnamed: 0,SAT,GPA,Attendance
count,84.0,84.0,84.0
mean,1845.27381,3.330238,0.464286
std,104.530661,0.271617,0.501718
min,1634.0,2.4,0.0
25%,1772.0,3.19,0.0
50%,1846.0,3.38,0.0
75%,1934.0,3.5025,1.0
max,2050.0,3.81,1.0


### Regression

In [9]:
# Ordinary least squares: explain GPA with SAT and the Attendance dummy.
feature_set = data[['SAT', 'Attendance']]
y = data['GPA']

# statsmodels' OLS does not include an intercept by default, so prepend a
# constant column to the predictors before fitting.
x = sm.add_constant(feature_set)
results = sm.OLS(endog=y, exog=x).fit()

# Display the full regression table (coefficients, p-values, fit statistics).
results.summary()

0,1,2,3
Dep. Variable:,GPA,R-squared:,0.565
Model:,OLS,Adj. R-squared:,0.555
Method:,Least Squares,F-statistic:,52.7
Date:,"Wed, 15 Jul 2020",Prob (F-statistic):,2.19e-15
Time:,05:29:43,Log-Likelihood:,25.798
No. Observations:,84,AIC:,-45.6
Df Residuals:,81,BIC:,-38.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6439,0.358,1.797,0.076,-0.069,1.357
SAT,0.0014,0.000,7.141,0.000,0.001,0.002
Attendance,0.2226,0.041,5.451,0.000,0.141,0.304

0,1,2,3
Omnibus:,19.56,Durbin-Watson:,1.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.189
Skew:,-1.028,Prob(JB):,1.25e-06
Kurtosis:,4.881,Cond. No.,33500.0


## Making Predictions

In [10]:
# Recall the layout of the design matrix (const, SAT, Attendance): new
# observations passed to the model must use the same columns in the same order.
# Only the first rows are shown; dumping the whole matrix adds nothing here.
x.head(10)

Unnamed: 0,const,SAT,Attendance
0,1.0,1714,0
1,1.0,1664,0
2,1.0,1760,0
3,1.0,1685,0
4,1.0,1693,0
5,1.0,1670,0
6,1.0,1764,0
7,1.0,1764,0
8,1.0,1792,0
9,1.0,1850,0


In [15]:
# Two hypothetical students to score with the fitted model, one per row.
# Columns are listed in the same order as the design matrix x
# (const, SAT, Attendance); const is the intercept term and is always 1.
new_data = pd.DataFrame(
    [
        [1, 1700, 0],
        [1, 1670, 1],
    ],
    columns=['const', 'SAT', 'Attendance'],
)
new_data

Unnamed: 0,const,SAT,Attendance
0,1,1700,0
1,1,1670,1


In [26]:
# Show the rows under readable labels. rename() returns a new frame and the
# result is not assigned, so new_data itself keeps its 0/1 integer index.
new_data.rename({0: 'Bob', 1: 'Alice'}, axis='index')

Unnamed: 0,const,SAT,Attendance
Bob,1,1700,0
Alice,1,1670,1


In [24]:
# Predict GPA for the new students with the fitted OLS model.
# The model was fitted on a plain design matrix (no formula), so predict()
# consumes the columns by position. Selecting them in the fitted order
# (results.model.exog_names) prevents a reordered new_data from silently
# pairing a coefficient with the wrong feature; a missing column raises KeyError.
predictions = results.predict(new_data[results.model.exog_names])
predictions

0    3.023513
1    3.204163
dtype: float64

In [28]:
# Put the inputs and the predicted GPA side by side. assign() returns a new
# frame, so new_data is left unchanged; the predictions are matched to rows
# by index label.
new_data2 = new_data.assign(Predictions=predictions)
# Display with readable row labels (the renamed frame is not stored).
new_data2.rename({0: 'Bob', 1: 'Alice'}, axis='index')

Unnamed: 0,const,SAT,Attendance,Predictions
Bob,1,1700,0,3.023513
Alice,1,1670,1,3.204163
