# Exercise 13

This particular Automobile Data Set includes a good mix of categorical values as well as continuous values and serves as a useful example that is relatively easy to understand. Since domain understanding is an important aspect when deciding how to encode various categorical values, this data set makes a good case study.

Read the data into Pandas

In [76]:
import pandas as pd
import numpy as np

# Column names for the "imports-85" file, which ships without a header row.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Read from the canonical UCI repository; the mlr.cs.umass.edu mirror used
# previously is no longer reliably reachable, which breaks a fresh re-run.
# "?" is the file's missing-value marker and is converted to NaN on load.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,
    names=headers,
    na_values="?",
)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [77]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(205, 26)

In [78]:
# Inspect the inferred dtypes; `object` columns hold the string-valued features.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [79]:
# Independent copy of only the string-typed (object) columns.
obj_df = (
    df
    .select_dtypes(include="object")
    .copy()
)
obj_df.head(n=5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


# Exercise 13.1

Does the database contain missing values? If so, replace them using one of the methods explained in class

In [80]:
# Distinct spelled-out cylinder counts present in the data.
obj_df["num_cylinders"].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [81]:
# Map each spelled-out cylinder count to its integer value.
dict_cyli = dict(two=2, three=3, four=4, five=5, six=6, eight=8, twelve=12)
# The nested mapping restricts the replacement to the num_cylinders column.
df.replace(to_replace={"num_cylinders": dict_cyli}, inplace=True)

In [82]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [83]:
# num_doors: list the distinct raw labels (NaN is reported as its own entry).
obj_df["num_doors"].unique()

array(['two', 'four', nan], dtype=object)

In [84]:
# Map each spelled-out door count to its integer value; NaN entries are left as NaN.
dict_doors = dict(two=2, four=4)
df.replace(to_replace={"num_doors": dict_doors}, inplace=True)

In [85]:
# Most frequent door count; mode() returns a Series, so the value sits at position 0.
a = df["num_doors"].mode()
a

0    4.0
dtype: float64

In [86]:
# num_doors: impute missing values with the most frequent door count.
# The mode is computed here rather than hard-coded, and the result is assigned
# back to the column: calling fillna(inplace=True) on an attribute-accessed
# Series acts on an intermediate object and does not update `df` under pandas
# Copy-on-Write.
df["num_doors"] = df["num_doors"].fillna(df["num_doors"].mode().iloc[0])

In [87]:
# price: rows with a missing price are removed rather than imputed.
df.dropna(axis=0, how="any", subset=["price"], inplace=True)

In [88]:
# normalized_losses: impute missing values with the column median.
# Assign the result back instead of using fillna(inplace=True) on the
# attribute-accessed Series, which does not update `df` under pandas
# Copy-on-Write.
df["normalized_losses"] = df["normalized_losses"].fillna(df["normalized_losses"].median())

In [89]:
# bore: impute missing values with the column mean.
# Assign the result back instead of using fillna(inplace=True) on the
# attribute-accessed Series, which does not update `df` under pandas
# Copy-on-Write.
df["bore"] = df["bore"].fillna(df["bore"].mean())

In [90]:
# stroke: impute missing values with the column median.
# Assign the result back instead of using fillna(inplace=True) on the
# attribute-accessed Series, which does not update `df` under pandas
# Copy-on-Write.
df["stroke"] = df["stroke"].fillna(df["stroke"].median())

In [91]:
# horsepower: impute missing values with the column mean.
# Assign the result back instead of using fillna(inplace=True) on the
# attribute-accessed Series, which does not update `df` under pandas
# Copy-on-Write.
df["horsepower"] = df["horsepower"].fillna(df["horsepower"].mean())

In [92]:
# peak_rpm: impute missing values with the column mean.
# Assign the result back instead of using fillna(inplace=True) on the
# attribute-accessed Series, which does not update `df` under pandas
# Copy-on-Write.
df["peak_rpm"] = df["peak_rpm"].fillna(df["peak_rpm"].mean())

In [93]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isna().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [94]:
# (rows, columns) after cleaning; fewer rows than at load because rows without a price were dropped.
df.shape

(201, 26)

FIN

# Exercise 13.2

Split the data into training and testing sets

Train a Random Forest Regressor to predict the price of a car using the nominal features

In [95]:
# Independent copy of every non-object (numeric) column, including the price target.
num_df = (
    df
    .select_dtypes(exclude="object")
    .copy()
)
num_df.head(n=5)

Unnamed: 0,symboling,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,115.0,2.0,94.5,171.2,65.5,52.4,2823,6,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,4.0,99.8,176.6,66.2,54.3,2337,4,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,4.0,99.4,176.6,66.4,54.3,2824,5,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [96]:
# Confirm the numeric frame's dtypes and non-null counts before modelling.
num_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 18 columns):
symboling            201 non-null int64
normalized_losses    201 non-null float64
num_doors            201 non-null float64
wheel_base           201 non-null float64
length               201 non-null float64
width                201 non-null float64
height               201 non-null float64
curb_weight          201 non-null int64
num_cylinders        201 non-null int64
engine_size          201 non-null int64
bore                 201 non-null float64
stroke               201 non-null float64
compression_ratio    201 non-null float64
horsepower           201 non-null float64
peak_rpm             201 non-null float64
city_mpg             201 non-null int64
highway_mpg          201 non-null int64
price                201 non-null float64
dtypes: float64(12), int64(6)
memory usage: 29.8 KB


In [97]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [98]:
# Predictors are all numeric columns except the target; the target is price.
X = num_df.drop(columns=["price"])
y = num_df["price"]

In [99]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [100]:
# 200 unbounded-depth trees that consider every feature at each split; seeded for repeatability.
clf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features=None,
    random_state=42,
)

In [101]:
# Train on the numeric-only training split.
clf.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [102]:
# Predicted prices for the held-out rows.
y_pred = clf.predict(X=X_test)
y_pred

array([ 7246.72      , 30693.735     ,  6231.815     ,  8067.725     ,
       13470.71708333,  6018.2       , 32472.14      , 10578.135     ,
       17722.45      , 33652.17      , 23987.73      ,  8284.50166667,
       10890.5175    ,  9459.45916667, 14760.30666667,  7821.81      ,
       10171.135     ,  7516.07      ,  8782.04166667, 34402.09      ,
       29238.85833333, 32916.205     ,  6487.01      ,  7177.34      ,
       24258.795     ,  8993.6875    ,  9517.1775    , 26131.505     ,
       29238.85833333, 17033.57541667, 15009.8       ,  5522.38      ,
       14109.4375    ,  6833.525     ,  7489.28833333, 15939.11      ,
        6881.025     ,  7760.95      ,  9366.425     ,  8440.47      ,
       13547.86      , 19861.035     ,  9715.41      ,  7094.77      ,
       15410.7575    , 18889.6       ,  5729.66      ,  5912.05      ,
       10369.24166667,  6519.91      , 17205.86      , 17946.705     ,
       13326.99833333, 12353.24208333,  6486.32      ,  6014.94      ,
      

In [103]:
# Compute the MSE once (arguments in the documented y_true, y_pred order; the
# value is symmetric in its arguments) and derive the RMSE from it.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE: ", mse)
print('RMSE:', np.sqrt(mse))

MSE:  6736497.550660324
RMSE: 2595.4763629554254


In [104]:
# First row of the comparison table: method label, test RMSE, and predictor count.
dict_res = dict(
    Metodo='Escalares',
    RMSE=np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
    Variables=X.shape[1],
)
res = pd.DataFrame([dict_res], columns=list(dict_res))

In [105]:
# Display the comparison table so far.
res

Unnamed: 0,Metodo,RMSE,Variables
0,Escalares,2595.476363,17


FIN

# Exercise 13.3

Create dummy variables for the categorical features

Train a Random Forest Regressor and compare

In [106]:
# Rebuild the object-column copy from the cleaned frame so it matches the rows kept in `df`.
obj_df = (
    df
    .select_dtypes(include="object")
    .copy()
)

In [107]:
# (rows, columns) of the remaining categorical columns.
obj_df.shape

(201, 8)

In [108]:
# Preview the categorical columns that still need encoding.
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,body_style,drive_wheels,engine_location,engine_type,fuel_system
0,alfa-romero,gas,std,convertible,rwd,front,dohc,mpfi
1,alfa-romero,gas,std,convertible,rwd,front,dohc,mpfi
2,alfa-romero,gas,std,hatchback,rwd,front,ohcv,mpfi
3,audi,gas,std,sedan,fwd,front,ohc,mpfi
4,audi,gas,std,sedan,4wd,front,ohc,mpfi


In [109]:
# Report the number of distinct values in each categorical column.
for column in ("make", "fuel_type", "aspiration", "body_style",
               "drive_wheels", "engine_location", "engine_type", "fuel_system"):
    print("Cant " + column + ": ", len(pd.unique(obj_df[column])))

Cant make:  22
Cant fuel_type:  2
Cant aspiration:  2
Cant body_style:  5
Cant drive_wheels:  3
Cant engine_location:  2
Cant engine_type:  6
Cant fuel_system:  8


In [110]:
# One indicator column per (column, category) pair.
obj_df_dummies = pd.get_dummies(data=obj_df)

In [111]:
# Preview the generated indicator columns.
obj_df_dummies.head()

Unnamed: 0,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,...,engine_type_ohcv,engine_type_rotor,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [112]:
# Join the numeric columns and the indicator columns side by side (aligned on the row index).
df1 = pd.concat(objs=[num_df, obj_df_dummies], axis="columns")

In [113]:
# Check the column count and dtypes after adding the indicator columns.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 68 columns):
symboling                 201 non-null int64
normalized_losses         201 non-null float64
num_doors                 201 non-null float64
wheel_base                201 non-null float64
length                    201 non-null float64
width                     201 non-null float64
height                    201 non-null float64
curb_weight               201 non-null int64
num_cylinders             201 non-null int64
engine_size               201 non-null int64
bore                      201 non-null float64
stroke                    201 non-null float64
compression_ratio         201 non-null float64
horsepower                201 non-null float64
peak_rpm                  201 non-null float64
city_mpg                  201 non-null int64
highway_mpg               201 non-null int64
price                     201 non-null float64
make_alfa-romero          201 non-null uint8
make_audi       

In [114]:
# Predictors are every column except the target; the target is price.
X1 = df1.drop(columns=["price"])
y1 = df1["price"]

In [115]:
# Same 70/30 split and seed as the other experiments in this notebook.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.30,
    random_state=42,
)

In [116]:
# 200 unbounded-depth trees that consider every feature at each split; seeded for repeatability.
clf1 = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features=None,
    random_state=42,
)

In [117]:
# Train on the numeric + indicator-column training split.
clf1.fit(X=X_train1, y=y_train1)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [118]:
# Predicted prices for the held-out rows.
y_pred1 = clf1.predict(X=X_test1)
y_pred1

array([ 7256.64      , 31115.78      ,  6212.395     ,  7955.29      ,
       13276.09666667,  5999.605     , 32887.38      , 10567.015     ,
       17638.215     , 33126.835     , 23515.28      ,  8216.53      ,
       11150.68      ,  9547.29416667, 14795.8125    ,  7818.015     ,
       10303.39166667,  7525.755     ,  8752.92833333, 34593.205     ,
       29311.93583333, 32353.515     ,  6344.08      ,  7201.135     ,
       23683.215     ,  9024.98833333,  9522.32916667, 25966.19      ,
       29105.87083333, 16846.785     , 14884.68      ,  5498.105     ,
       13589.4325    ,  6709.67      ,  7529.10666667, 15974.6       ,
        6856.655     ,  7742.085     ,  9357.01166667,  8365.04      ,
       14298.06      , 19579.55      ,  9695.835     ,  7122.065     ,
       15116.34083333, 18751.16      ,  5735.56      ,  6170.345     ,
       10330.66      ,  6560.805     , 17104.395     , 17639.24      ,
       13311.305     , 12253.56416667,  6469.605     ,  5994.545     ,
      

In [119]:
# Compute the MSE once (arguments in the documented y_true, y_pred order; the
# value is symmetric in its arguments) and derive the RMSE from it.
mse1 = metrics.mean_squared_error(y_test1, y_pred1)
print("MSE: ", mse1)
print('RMSE:', np.sqrt(mse1))

MSE:  7014755.155937034
RMSE: 2648.538305544595


In [120]:
# Result row for the dummy-variable experiment: method label, test RMSE, predictor count.
dict_res = dict(
    Metodo='Dummies',
    RMSE=np.sqrt(metrics.mean_squared_error(y_test1, y_pred1)),
    Variables=X1.shape[1],
)

In [121]:
# Wrap the result row in a one-row frame, keeping the dict's key order as the column order.
res1 = pd.DataFrame([dict_res], columns=list(dict_res))

In [122]:
# Add the dummy-variable row to the comparison table.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same renumbered result on old and new pandas alike.
res = pd.concat([res, res1], ignore_index=True)

In [123]:
# Display the comparison table so far.
res

Unnamed: 0,Metodo,RMSE,Variables
0,Escalares,2595.476363,17
1,Dummies,2648.538306,67


Se concluye que incluir todas las variables como dummies no es una mejora al modelo.

fin

# Exercise 13.4

Apply two other methods of categorical encoding

Compare the results.

In [124]:
!pip install category_encoders



twisted 18.7.0 requires PyHamcrest>=1.9.0, which is not installed.
distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [125]:
import category_encoders as ce

In [126]:
# Binary-encode every categorical column with a default-configured encoder.
obj_df_bi = (
    ce.BinaryEncoder()
    .fit_transform(obj_df)
)

In [127]:
# Preview the binary-encoded columns.
obj_df_bi.head()

Unnamed: 0,make_0,make_1,make_2,make_3,make_4,make_5,fuel_type_0,fuel_type_1,aspiration_0,aspiration_1,...,engine_location_0,engine_location_1,engine_type_0,engine_type_1,engine_type_2,engine_type_3,fuel_system_0,fuel_system_1,fuel_system_2,fuel_system_3
0,0,0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
1,0,0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
2,0,0,0,0,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
3,0,0,0,0,1,0,0,1,0,1,...,0,1,0,0,1,1,0,0,0,1
4,0,0,0,0,1,0,0,1,0,1,...,0,1,0,0,1,1,0,0,0,1


In [128]:
# Join the numeric columns and the binary-encoded columns side by side (aligned on the row index).
df2 = pd.concat(objs=[num_df, obj_df_bi], axis="columns")

In [129]:
# Preview the combined numeric + binary-encoded frame.
df2.head()

Unnamed: 0,symboling,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,...,engine_location_0,engine_location_1,engine_type_0,engine_type_1,engine_type_2,engine_type_3,fuel_system_0,fuel_system_1,fuel_system_2,fuel_system_3
0,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,1,0,0,0,1,0,0,0,1
1,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,1,0,0,0,1,0,0,0,1
2,1,115.0,2.0,94.5,171.2,65.5,52.4,2823,6,152,...,0,1,0,0,1,0,0,0,0,1
3,2,164.0,4.0,99.8,176.6,66.2,54.3,2337,4,109,...,0,1,0,0,1,1,0,0,0,1
4,2,164.0,4.0,99.4,176.6,66.4,54.3,2824,5,136,...,0,1,0,0,1,1,0,0,0,1


In [130]:
# Check the column count and dtypes after adding the binary-encoded columns.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 45 columns):
symboling            201 non-null int64
normalized_losses    201 non-null float64
num_doors            201 non-null float64
wheel_base           201 non-null float64
length               201 non-null float64
width                201 non-null float64
height               201 non-null float64
curb_weight          201 non-null int64
num_cylinders        201 non-null int64
engine_size          201 non-null int64
bore                 201 non-null float64
stroke               201 non-null float64
compression_ratio    201 non-null float64
horsepower           201 non-null float64
peak_rpm             201 non-null float64
city_mpg             201 non-null int64
highway_mpg          201 non-null int64
price                201 non-null float64
make_0               201 non-null int64
make_1               201 non-null int64
make_2               201 non-null int64
make_3               201 non-nu

In [131]:
# Predictors are every column except the target; the target is price.
X2 = df2.drop(columns=["price"])
y2 = df2["price"]

In [132]:
# Same 70/30 split and seed as the other experiments in this notebook.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.30,
    random_state=42,
)

In [133]:
# 200 unbounded-depth trees that consider every feature at each split; seeded for repeatability.
clf2 = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features=None,
    random_state=42,
)

In [134]:
# Train on the numeric + binary-encoded training split.
clf2.fit(X=X_train2, y=y_train2)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [135]:
# Predicted prices for the held-out rows.
y_pred2 = clf2.predict(X=X_test2)
y_pred2

array([ 7259.37      , 31130.96      ,  6244.54      ,  8014.14      ,
       13421.13666667,  6040.895     , 32727.21      , 10485.255     ,
       17738.035     , 33854.27      , 23627.765     ,  8307.95833333,
       11318.7525    ,  9445.895     , 14480.78666667,  7864.6       ,
       10216.88      ,  7528.005     ,  8711.29583333, 34604.64      ,
       29506.435     , 32717.535     ,  6371.12      ,  7196.315     ,
       23855.785     ,  8999.56833333,  9666.36166667, 26427.385     ,
       29493.65833333, 16968.44      , 15007.85458333,  5506.175     ,
       13795.2575    ,  6729.635     ,  7466.215     , 16033.81      ,
        6852.225     ,  7739.255     ,  9433.4375    ,  8436.45      ,
       14210.58      , 19669.045     ,  9760.33416667,  7081.885     ,
       15616.56708333, 18472.57      ,  5748.73      ,  6053.02      ,
       10421.865     ,  6521.385     , 17293.48      , 17849.055     ,
       13274.035     , 12227.42291667,  6465.32      ,  6031.935     ,
      

In [136]:
# Compute the MSE once (arguments in the documented y_true, y_pred order; the
# value is symmetric in its arguments) and derive the RMSE from it.
mse2 = metrics.mean_squared_error(y_test2, y_pred2)
print("MSE: ", mse2)
print('RMSE:', np.sqrt(mse2))

MSE:  6491092.115133389
RMSE: 2547.762177899144


In [137]:
# Result row for the binary-encoding experiment: method label, test RMSE, predictor count.
dict_res = dict(
    Metodo='Binary',
    RMSE=np.sqrt(metrics.mean_squared_error(y_test2, y_pred2)),
    Variables=X2.shape[1],
)

In [138]:
# Wrap the result row in a one-row frame, keeping the dict's key order as the column order.
res2 = pd.DataFrame([dict_res], columns=list(dict_res))

In [139]:
# Add the binary-encoding row to the comparison table.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same renumbered result on old and new pandas alike.
res = pd.concat([res, res2], ignore_index=True)

In [140]:
# Display the comparison table so far.
res

Unnamed: 0,Metodo,RMSE,Variables
0,Escalares,2595.476363,17
1,Dummies,2648.538306,67
2,Binary,2547.762178,44


HASHING

In [141]:
# Hash all categorical columns into a fixed set of 8 output columns.
obj_df_has = (
    ce.HashingEncoder(n_components=8)
    .fit_transform(obj_df)
)

In [142]:
# Preview the hashed columns.
obj_df_has.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,1,0,0,2,1,2,2,0
1,1,0,0,2,1,2,2,0
2,1,0,0,1,2,2,2,0
3,3,0,0,0,2,1,1,1
4,3,0,0,0,1,1,2,1


In [143]:
# Join the numeric columns and the hashed columns side by side (aligned on the row index).
df3 = pd.concat(objs=[num_df, obj_df_has], axis="columns")

In [144]:
# Preview the combined numeric + hashed frame.
df3.head()

Unnamed: 0,symboling,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,...,highway_mpg,price,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,27,13495.0,1,0,0,2,1,2,2,0
1,3,115.0,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,27,16500.0,1,0,0,2,1,2,2,0
2,1,115.0,2.0,94.5,171.2,65.5,52.4,2823,6,152,...,26,16500.0,1,0,0,1,2,2,2,0
3,2,164.0,4.0,99.8,176.6,66.2,54.3,2337,4,109,...,30,13950.0,3,0,0,0,2,1,1,1
4,2,164.0,4.0,99.4,176.6,66.4,54.3,2824,5,136,...,22,17450.0,3,0,0,0,1,1,2,1


In [145]:
# Check the column count and dtypes after adding the hashed columns.
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 0 to 204
Data columns (total 26 columns):
symboling            201 non-null int64
normalized_losses    201 non-null float64
num_doors            201 non-null float64
wheel_base           201 non-null float64
length               201 non-null float64
width                201 non-null float64
height               201 non-null float64
curb_weight          201 non-null int64
num_cylinders        201 non-null int64
engine_size          201 non-null int64
bore                 201 non-null float64
stroke               201 non-null float64
compression_ratio    201 non-null float64
horsepower           201 non-null float64
peak_rpm             201 non-null float64
city_mpg             201 non-null int64
highway_mpg          201 non-null int64
price                201 non-null float64
col_0                201 non-null int64
col_1                201 non-null int64
col_2                201 non-null int64
col_3                201 non-nu

In [146]:
# Predictors are every column except the target; the target is price.
X3 = df3.drop(columns=["price"])
y3 = df3["price"]

In [147]:
# Same 70/30 split and seed as the other experiments in this notebook.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3,
    y3,
    test_size=0.30,
    random_state=42,
)

In [148]:
# 200 unbounded-depth trees that consider every feature at each split; seeded for repeatability.
clf3 = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features=None,
    random_state=42,
)

In [149]:
# Train on the numeric + hashed training split.
clf3.fit(X=X_train3, y=y_train3)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [150]:
# Predicted prices for the held-out rows.
y_pred3 = clf3.predict(X=X_test3)
y_pred3

array([ 7250.685     , 31211.415     ,  6312.89      ,  8034.235     ,
       13337.0125    ,  6013.595     , 32423.235625  , 10378.75416667,
       17444.87      , 33358.81      , 23744.585     ,  8391.4775    ,
       11052.0425    ,  9439.995     , 14512.73916667,  7797.745     ,
       10361.33916667,  7542.09      ,  8703.85416667, 34167.315     ,
       29610.090625  , 32600.48      ,  6455.        ,  7216.89      ,
       23864.635     ,  8886.08      ,  9795.445     , 25875.48      ,
       29610.090625  , 16708.69854167, 14971.945     ,  5498.14      ,
       13332.0175    ,  6818.115     ,  7485.69916667, 16143.04      ,
        6671.6925    ,  7731.26      ,  9501.12      ,  8523.375     ,
       14226.88      , 19630.87      ,  9765.9825    ,  7095.76      ,
       15268.43083333, 17561.66      ,  5921.025     ,  6060.415     ,
       10242.485     ,  6535.505     , 16935.865     , 17613.405     ,
       12979.94833333, 12157.0175    ,  6573.89      ,  6031.945     ,
      

In [151]:
# Compute the MSE once (arguments in the documented y_true, y_pred order; the
# value is symmetric in its arguments) and derive the RMSE from it.
mse3 = metrics.mean_squared_error(y_test3, y_pred3)
print("MSE: ", mse3)
print('RMSE:', np.sqrt(mse3))

MSE:  6670079.51992297
RMSE: 2582.649709101676


In [152]:
# Result row for the hashing experiment: method label, test RMSE, predictor count.
dict_res = dict(
    Metodo='Hashing',
    RMSE=np.sqrt(metrics.mean_squared_error(y_test3, y_pred3)),
    Variables=X3.shape[1],
)

In [153]:
# Wrap the result row in a one-row frame, keeping the dict's key order as the column order.
res3 = pd.DataFrame([dict_res], columns=list(dict_res))

In [154]:
# Add the hashing row to the comparison table.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same renumbered result on old and new pandas alike.
res = pd.concat([res, res3], ignore_index=True)

In [155]:
# Display the full comparison table.
res

Unnamed: 0,Metodo,RMSE,Variables
0,Escalares,2595.476363,17
1,Dummies,2648.538306,67
2,Binary,2547.762178,44
3,Hashing,2582.649709,25


In [156]:
# Rank the encodings from lowest (best) to highest test RMSE; ascending is the default order.
res.sort_values("RMSE")

Unnamed: 0,Metodo,RMSE,Variables
2,Binary,2547.762178,44
3,Hashing,2582.649709,25
0,Escalares,2595.476363,17
1,Dummies,2648.538306,67


FIN