# Dummy Variables & One Hot Encoding

## ie6

### Using pandas to create dummy variables

In [1]:
import pandas as pd
tp = pd.read_csv("townprices.csv")
display(tp)
print(tp.dtypes)
dummies = pd.get_dummies(tp.town)
display(dummies)
merged = pd.concat([tp,dummies], axis="columns")
display(merged)
final = merged.drop(["town", "west windsor"], axis="columns")
display(final)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


town     object
area      int64
price     int64
dtype: object


Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


### Dummy Variable Trap

In [3]:
X = final.drop(["price"], axis="columns")
print(X)
y = final.price
print(y)

    area  monroe township  robinsville
0   2600                1            0
1   3000                1            0
2   3200                1            0
3   3600                1            0
4   4000                1            0
5   2600                0            0
6   2800                0            0
7   3300                0            0
8   3600                0            0
9   2600                0            1
10  2900                0            1
11  3100                0            1
12  3600                0            1
0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64


In [7]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
model = LinearRegression()
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=3)
# train lm model
model.fit(X_train, y_train)
print(model.coef_)
print(model.intercept_)
print(model.predict([[2800,0,1]]))
print(model.predict([[3400,0,0]]))
# test lm model
print(model.score(X_test,y_test))

[   122.81449893 -30906.18336887 -11504.97512438]
259420.75337599945
[591796.37526653]
[676990.04975124]
0.9115296637502014


In [6]:
# store lm model with pickle
import pickle
with open("model_pickle", "wb") as f:
    pickle.dump(model, f)
# reuse lm model with pickle
with open("model_pickle", "rb") as f:
    mp = pickle.load(f)
print(mp.coef_)
print(mp.intercept_)

# store lm model with joblib
import joblib # for large numpy
joblib.dump(model, "model_joblib")
# reuse lm model with joblib
mj = joblib.load("model_joblib")
print(mj.coef_)
print(mj.intercept_)

[   126.89744141 -40013.97548914 -14327.56396474]
249790.36766292533
[   126.89744141 -40013.97548914 -14327.56396474]
249790.36766292533


### Using sklearn OneHotEncoder

In [8]:
print(tp)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

               town  area   price
0   monroe township  2600  550000
1   monroe township  3000  565000
2   monroe township  3200  610000
3   monroe township  3600  680000
4   monroe township  4000  725000
5      west windsor  2600  585000
6      west windsor  2800  615000
7      west windsor  3300  650000
8      west windsor  3600  710000
9       robinsville  2600  575000
10      robinsville  2900  600000
11      robinsville  3100  620000
12      robinsville  3600  695000


In [9]:
dfle = tp
dfle.town = le.fit_transform(dfle.town) 
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [11]:
X = dfle[["town", "area"]] # x = dfle[["town", "area"]].values # array
print(x)
y = dfle.price
print(y)

    town  area
0      0  2600
1      0  3000
2      0  3200
3      0  3600
4      0  4000
5      2  2600
6      2  2800
7      2  3300
8      2  3600
9      1  2600
10     1  2900
11     1  3100
12     1  3600
0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64


In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('town', OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X_train)
print(X)
X = X[:,1:]
print(X)

[[1.0e+00 0.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.0e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.2e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 4.0e+03]
 [1.0e+00 0.0e+00 1.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 1.0e+00 2.8e+03]
 [1.0e+00 0.0e+00 1.0e+00 3.3e+03]
 [1.0e+00 0.0e+00 1.0e+00 3.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.9e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.1e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.6e+03]]
[[0.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 0.0e+00 3.0e+03]
 [0.0e+00 0.0e+00 3.2e+03]
 [0.0e+00 0.0e+00 3.6e+03]
 [0.0e+00 0.0e+00 4.0e+03]
 [0.0e+00 1.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 2.8e+03]
 [0.0e+00 1.0e+00 3.3e+03]
 [0.0e+00 1.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 2.9e+03]
 [1.0e+00 0.0e+00 3.1e+03]
 [1.0e+00 0.0e+00 3.6e+03]]


In [22]:
model.fit(X_train, y_train)
print(model.coef_)
print(model.intercept_)
print(model.predict([[1,0,2800]])) # 2800 sqr ft home in robbinsville
print(model.predict([[10,1,2800]])) # 3400 sqr ft home in west windsor
model.score(X_test, y_test)

[   122.81449893 -30906.18336887 -11504.97512438]
259420.75337599945
[-31954386.78038377]
[-31984187.63326224]


0.9115296637502014

## asgn6

In [23]:
import pandas as pd
cp = pd.read_csv("carprices.csv")
print(cp)
print(cp.dtypes)

                Car Model  Mileage  Sell Price($)  Age(yrs)
0                  BMW X5    69000          18000         6
1                  BMW X5    35000          34000         3
2                  BMW X5    57000          26100         5
3                  BMW X5    22500          40000         2
4                  BMW X5    46000          31500         4
5                 Audi A5    59000          29400         5
6                 Audi A5    52000          32000         5
7                 Audi A5    72000          19300         6
8                 Audi A5    91000          12000         8
9   Mercedez Benz C class    67000          22000         6
10  Mercedez Benz C class    83000          20000         7
11  Mercedez Benz C class    79000          21000         7
12  Mercedez Benz C class    59000          33000         5
Car Model        object
Mileage           int64
Sell Price($)     int64
Age(yrs)          int64
dtype: object


### Using pandas to create dummy variables

In [24]:
dummies = pd.get_dummies(cp["Car Model"])
print(dummies)
merged = pd.concat([cp, dummies], axis = "columns")
print(merged)
final = merged.drop(["Car Model", "Mercedez Benz C class"], axis="columns")
print(final)

    Audi A5  BMW X5  Mercedez Benz C class
0         0       1                      0
1         0       1                      0
2         0       1                      0
3         0       1                      0
4         0       1                      0
5         1       0                      0
6         1       0                      0
7         1       0                      0
8         1       0                      0
9         0       0                      1
10        0       0                      1
11        0       0                      1
12        0       0                      1
                Car Model  Mileage  Sell Price($)  Age(yrs)  Audi A5  BMW X5  \
0                  BMW X5    69000          18000         6        0       1   
1                  BMW X5    35000          34000         3        0       1   
2                  BMW X5    57000          26100         5        0       1   
3                  BMW X5    22500          40000         2        0       1  

### Dummy Variable Trap

In [25]:
X = final.drop(["Sell Price($)"], axis="columns")
y = final["Sell Price($)"]
print(X)
print(y)

    Mileage  Age(yrs)  Audi A5  BMW X5
0     69000         6        0       1
1     35000         3        0       1
2     57000         5        0       1
3     22500         2        0       1
4     46000         4        0       1
5     59000         5        1       0
6     52000         5        1       0
7     72000         6        1       0
8     91000         8        1       0
9     67000         6        0       0
10    83000         7        0       0
11    79000         7        0       0
12    59000         5        0       0
0     18000
1     34000
2     26100
3     40000
4     31500
5     29400
6     32000
7     19300
8     12000
9     22000
10    20000
11    21000
12    33000
Name: Sell Price($), dtype: int64


In [28]:
from sklearn.linear_model import LinearRegression
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=2)
# train lm model
model = LinearRegression()
model.fit(X_train, y_train)
# predict
print(model.coef_)
print(model.intercept_)
print("Price of mercedez benz that is 4 yr old with mileage 45000 is", "$" +
      str(int(model.predict([[45000,4,0,0]]))))
print("Price of BMW X5 that is 7 yr old with mileage 86000 is", "$" +
      str(int(model.predict([[86000,7,0,1]]))))
# test lm model
print(model.score(X_test, y_test))

[-4.42706847e-01 -5.26524608e+02 -2.08454886e+03 -6.72572218e+03]
58744.11554921539
Price of mercedez benz that is 4 yr old with mileage 45000 is $36716
Price of BMW X5 that is 7 yr old with mileage 86000 is $10259
0.9364004003147443


In [100]:
# store and download model
import pickle
with open("model_6.dv_ohe", "wb") as f:
    pickle.dump(model, f)
with open("model_6.dv_ohe", "rb") as f:
    mp = pickle.load(f)
print(mp.predict([[45000,4,0,0]]))
print(mp.predict([[86000,7,0,1]]))
import joblib
joblib.dump(model, "model_6.dv_ohe")
mj = joblib.load("model_6.dv_ohe")
print(mj.predict([[45000,4,0,0]]))
print(mj.predict([[86000,7,0,1]]))

[36991.31721061]
[11080.74313219]
[36991.31721061]
[11080.74313219]


### Using sklearn OneHotEncoder

In [29]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [31]:
dfle = cp
dfle["Car Model"] = le.fit_transform(dfle["Car Model"])
dfle

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,1,69000,18000,6
1,1,35000,34000,3
2,1,57000,26100,5
3,1,22500,40000,2
4,1,46000,31500,4
5,0,59000,29400,5
6,0,52000,32000,5
7,0,72000,19300,6
8,0,91000,12000,8
9,2,67000,22000,6


In [40]:
X = dfle[["Car Model","Mileage","Age(yrs)"]] # x = dfle[["Car Model","Mileage","Age(yrs)"]] # array
print(X)
print(type(X))
y = dfle[["Sell Price($)"]]
print(y)

[[    1 69000     6]
 [    1 35000     3]
 [    1 57000     5]
 [    1 22500     2]
 [    1 46000     4]
 [    0 59000     5]
 [    0 52000     5]
 [    0 72000     6]
 [    0 91000     8]
 [    2 67000     6]
 [    2 83000     7]
 [    2 79000     7]
 [    2 59000     5]]
<class 'numpy.ndarray'>
    Sell Price($)
0           18000
1           34000
2           26100
3           40000
4           31500
5           29400
6           32000
7           19300
8           12000
9           22000
10          20000
11          21000
12          33000


In [39]:
ColumnTransformer?

[0;31mInit signature:[0m
[0mColumnTransformer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtransformers[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mremainder[0m[0;34m=[0m[0;34m'drop'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse_threshold[0m[0;34m=[0m[0;36m0.3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransformer_weights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose_feature_names_out[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies transformers to columns of an array or pandas DataFrame.

This estimator allows different columns or column subsets of the input
to be transformed separately and the features generated by each transfor

In [42]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('Car Model', OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(x)
print(X)
X = X[:,1:]
print(X)

[[1.0e+00 0.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.0e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.2e+03]
 [1.0e+00 0.0e+00 0.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 0.0e+00 4.0e+03]
 [0.0e+00 0.0e+00 1.0e+00 2.6e+03]
 [0.0e+00 0.0e+00 1.0e+00 2.8e+03]
 [0.0e+00 0.0e+00 1.0e+00 3.3e+03]
 [0.0e+00 0.0e+00 1.0e+00 3.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 0.0e+00 2.9e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.1e+03]
 [0.0e+00 1.0e+00 0.0e+00 3.6e+03]]
[[0.0e+00 0.0e+00 2.6e+03]
 [0.0e+00 0.0e+00 3.0e+03]
 [0.0e+00 0.0e+00 3.2e+03]
 [0.0e+00 0.0e+00 3.6e+03]
 [0.0e+00 0.0e+00 4.0e+03]
 [0.0e+00 1.0e+00 2.6e+03]
 [0.0e+00 1.0e+00 2.8e+03]
 [0.0e+00 1.0e+00 3.3e+03]
 [0.0e+00 1.0e+00 3.6e+03]
 [1.0e+00 0.0e+00 2.6e+03]
 [1.0e+00 0.0e+00 2.9e+03]
 [1.0e+00 0.0e+00 3.1e+03]
 [1.0e+00 0.0e+00 3.6e+03]]


In [43]:
model.fit(X_train, y_train)
print(model.predict([[0,1,45000,4]]))
print(model.predict([[1,0,86000,7]]))
print(model.score(X_test, y_test))

[-93773383.9425821]
[-1.79259538e+08]
0.9364004003147443
