In [19]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [20]:
# Load the raw car listings from the CSV file next to the notebook.
cars = pd.read_csv(filepath_or_buffer='data.csv')

In [21]:
# Preview the first rows, transposed so each column is shown as a row.
cars.head().transpose()


Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [22]:
# Inspect the dtype pandas inferred for each column.
cars.dtypes


Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [23]:
# Normalise the column labels: lowercase, spaces replaced by underscores.
lowercase_labels = cars.columns.str.lower()
cars.columns = lowercase_labels.str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
is_object_column = cars.dtypes == 'object'
string_columns = cars.dtypes[is_object_column].index.tolist()
for col in string_columns:
    lowercase_values = cars[col].str.lower()
    cars[col] = lowercase_values.str.replace(' ', '_')

In [24]:
# Show the current column labels of the frame.
cars.columns


Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [25]:
# Rename the 'msrp' column to 'price'; all other labels are unchanged.
cars = cars.rename({'msrp': 'price'}, axis='columns')

In [26]:
# Narrow the frame to the subset of columns listed below (order preserved).
columns = [
    'make', 'model', 'year',
    'engine_hp', 'engine_cylinders',
    'transmission_type', 'vehicle_style',
    'highway_mpg', 'city_mpg',
    'price',
]
cars = cars[columns]

In [27]:
# Replace every missing value in the frame with 0.
cars = cars.fillna(value=0)

In [28]:
# Count how many rows fall under each transmission type.
# (The variable name is kept as-is because the next cell reads it.)
frecuencies = cars.transmission_type.value_counts()

In [29]:
# Display the per-transmission-type row counts.
frecuencies


automatic           8266
manual              2935
automated_manual     626
direct_drive          68
unknown               19
Name: transmission_type, dtype: int64

In [30]:
# Count the missing values per column.
cars.isna().sum()


make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [31]:
# Pairwise correlation matrix of the numeric columns only.
# `cars` still contains string columns (make, model, ...). Older pandas
# dropped them silently inside corr(); pandas >= 2.0 raises instead.
# Selecting the numeric dtypes explicitly works on both.
correlation = cars.select_dtypes(include='number').corr()

In [32]:
# Display the correlation matrix computed in the previous cell.
correlation

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [33]:
# Mean price over the whole data set; used as the threshold for the target.
average_price = cars.price.mean()

# Binary target: 1 when the price is strictly above the mean, otherwise 0.
is_above_average = cars['price'].gt(average_price)
cars['above_average'] = is_above_average.astype(int)

In [34]:
# Display the frame, which now includes the 'above_average' column.
cars

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135,1
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650,1
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350,0
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450,0
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120,1
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670,1
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620,1
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920,1


In [35]:
from sklearn.model_selection import train_test_split

In [43]:
# Hold out 20% of the rows as the test set; the fixed random_state makes
# the shuffle reproducible.
df_train_full, df_test = train_test_split(
    cars,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Preview the first five rows of the (shuffled) train+validation part.
df_train_full.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
3181,cadillac,ct6,2016,265.0,4.0,automatic,sedan,31,22,53495,1
5357,mercedes-benz,gls-class,2017,449.0,8.0,automatic,4dr_suv,18,14,93850,1
4874,kia,forte,2016,173.0,4.0,automatic,coupe,34,25,19890,0
8102,dodge,ram_250,1993,180.0,6.0,manual,regular_cab_pickup,16,11,2000,0
10400,hyundai,tiburon,2008,172.0,6.0,automatic,2dr_hatchback,24,17,21270,0


In [45]:
# Split the remaining rows again: 20% of df_train_full becomes validation.
# NOTE(review): 20% of the remaining 80% gives roughly 64/16/20 overall
# (train/val/test). If 60/20/20 was intended, test_size should be 0.25.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Separate the target from the feature frames. pop() removes the column
# in place and returns it, so the features no longer contain the label.
# NOTE(review): 'price' is still present in df_train / df_val, and the
# target is derived from it. Confirm it is excluded before fitting a model.
y_train = df_train.pop('above_average').values
y_val = df_val.pop('above_average').values

In [47]:
# Count the missing values per column in the train+validation part.
df_train_full.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
above_average        0
dtype: int64

In [48]:
# Class balance of the target in the train+validation part.
df_train_full['above_average'].value_counts()

0    6893
1    2638
Name: above_average, dtype: int64

In [49]:
# The target is 0/1, so its mean is the share of rows labelled 1.
global_mean = df_train_full['above_average'].mean()

In [50]:
# Display the mean of the 'above_average' target computed above.
global_mean


0.2767810303221068

In [51]:
# Preview the first rows, transposed so each column is shown as a row.
df_train_full.head().transpose()

Unnamed: 0,3181,5357,4874,8102,10400
make,cadillac,mercedes-benz,kia,dodge,hyundai
model,ct6,gls-class,forte,ram_250,tiburon
year,2016,2017,2016,1993,2008
engine_hp,265.0,449.0,173.0,180.0,172.0
engine_cylinders,4.0,8.0,4.0,6.0,6.0
transmission_type,automatic,automatic,automatic,manual,automatic
vehicle_style,sedan,4dr_suv,coupe,regular_cab_pickup,2dr_hatchback
highway_mpg,31,18,34,16,24
city_mpg,22,14,25,11,17
price,53495,93850,19890,2000,21270


In [52]:
# Column groups used for feature analysis.
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

# Fixed: the horsepower column is named 'engine_hp'. The previous entry
# 'engine' matches no column and would raise KeyError on df[numerical].
# NOTE(review): 'price' defines the target and 'above_average' is the
# target itself. Both are kept for backward compatibility, but they
# should probably not be used as model features. 'above_average' has
# also been removed from df_train / df_val already.
numerical = [
    'year',
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
    'price',
    'above_average',
]

In [53]:
# Number of distinct values in each categorical column.
df_train_full.loc[:, categorical].nunique()

make                  48
model                902
transmission_type      5
vehicle_style         16
dtype: int64