In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# One-time dataset download (car-price data from the mlbookcamp-code repository).
# Uncomment and run if `data.csv` is not already in the working directory.
# !python -m wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [3]:
# Load the raw car-price table from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


### Data preparation

In [4]:
# Columns kept for the analysis; `MSRP` is renamed to `price` further down.
features = [
    'Make', 'Model', 'Year',
    'Engine HP', 'Engine Cylinders', 'Transmission Type',
    'Vehicle Style', 'highway MPG', 'city mpg',
    'MSRP',
]

In [5]:
# Keep only the selected columns, then normalise the headers:
# spaces become underscores and everything is lower-cased.
data = data.loc[:, features]
data.columns = [name.replace(' ', '_').lower() for name in data.columns]

In [6]:
# Number of missing values in each column.
data.isna().sum(axis=0)

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [7]:
# Replace every missing value with 0.
data = data.fillna(value=0)

In [8]:
# The target column is called `price` from here on.
data = data.rename({'msrp': 'price'}, axis='columns')
data.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Question 1: most frequent value of `transmission_type`

In [9]:
# Most frequent transmission type in the dataset.
data.loc[:, 'transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

### Question 2: correlation matrix of the numerical features

In [10]:
# Names of all numeric columns, followed by their pairwise Pearson correlation matrix.
numerical = list(data.select_dtypes(include=np.number).columns)
data[numerical].corr(method='pearson')

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


### Make `price` binary

In [11]:
# Binary target: 1 when a car's price exceeds the mean price, otherwise 0.
# The raw `price` column is removed so it cannot be used as a feature.
is_above_average = data['price'] > data['price'].mean()
new_data = data.drop(columns='price')
new_data['above_average'] = np.where(is_above_average, 1, 0)
new_data.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


### Split the data

In [12]:
# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
train_data, test_data = train_test_split(
    new_data,
    test_size=0.2,
    random_state=42,
)
train_data, val_data = train_test_split(
    train_data,
    test_size=0.25,
    random_state=42,
)

tuple(len(part) for part in (train_data, val_data, test_data))

(7148, 2383, 2383)

In [13]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
train_data, val_data, test_data = (
    part.reset_index(drop=True)
    for part in (train_data, val_data, test_data)
)

In [14]:
# Pull the binary target out of each split as a plain NumPy array.
y_train, y_val, y_test = (
    part['above_average'].values
    for part in (train_data, val_data, test_data)
)

In [15]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
train_data, val_data, test_data = (
    part.drop('above_average', axis=1)
    for part in (train_data, val_data, test_data)
)

### Question 3: mutual information between categorical features and `above_average`

In [16]:
# Names of the object-dtype (string) columns, and a preview of those columns.
categorical = list(train_data.select_dtypes(include=['object']).columns)
train_data[categorical].head(n=5)

Unnamed: 0,make,model,transmission_type,vehicle_style
0,Mitsubishi,Endeavor,AUTOMATIC,4dr SUV
1,Kia,Borrego,AUTOMATIC,4dr SUV
2,Lamborghini,Gallardo,MANUAL,Convertible
3,Chevrolet,Colorado,AUTOMATIC,Crew Cab Pickup
4,Pontiac,Vibe,AUTOMATIC,4dr Hatchback


In [17]:
# Mutual information between each categorical feature and the binary target,
# computed on the training split only.
mi_by_feature = {
    col: mutual_info_score(train_data[col], y_train)
    for col in categorical
}
for col, mi in mi_by_feature.items():
    print(f'{col}: {round(mi, 2)}')

make: 0.24
model: 0.46
transmission_type: 0.02
vehicle_style: 0.08


### Question 4: logistic regression accuracy on the validation set

In [18]:
# One-hot encode the training rows: string values become indicator columns,
# numeric values pass through unchanged. The vectorizer is fitted on train only.
dv = DictVectorizer(sparse=False)
train_dicts = train_data.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_train.shape

(7148, 943)

In [19]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit) so both matrices share the same columns.
val_dicts = val_data.to_dict(orient='records')
X_val = dv.transform(val_dicts)
np.shape(X_val)

(2383, 943)

In [20]:
# Logistic regression classifier with a fixed seed for reproducibility.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)

In [21]:
# Train the classifier on the encoded training matrix.
model.fit(X=X_train, y=y_train)

In [22]:
# Fraction of validation rows classified correctly (accuracy_score normalises by default).
y_pred = model.predict(X_val)
score = accuracy_score(y_true=y_val, y_pred=y_pred)
round(score, 2)

0.95

### Question 5: feature elimination (change in validation accuracy when one feature is removed)

In [23]:
# Fresh, unfitted vectorizer and classifier for the feature-elimination experiment.
dv = DictVectorizer(sparse=False)
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)

In [24]:
# Feature elimination: retrain without one feature at a time and report how much
# validation accuracy changes relative to the model trained on all features.
#
# The baseline is computed here instead of being read from a generically named
# global (`score`) that other cells also assign, so the printed differences stay
# correct even when cells are re-run or executed out of order.
X_train_full = dv.fit_transform(train_data.to_dict(orient='records'))
X_val_full = dv.transform(val_data.to_dict(orient='records'))
model.fit(X_train_full, y_train)
baseline_accuracy = accuracy_score(y_val, model.predict(X_val_full))

for c in train_data.columns:
    # Encode both splits with column `c` removed; the vectorizer is refitted each
    # time because the set of encoded columns changes.
    train_dicts = train_data.drop(c, axis=1).to_dict(orient='records')
    val_dicts = val_data.drop(c, axis=1).to_dict(orient='records')

    X_train_ = dv.fit_transform(train_dicts)
    X_val_ = dv.transform(val_dicts)

    model.fit(X_train_, y_train)

    y_pred_ = model.predict(X_val_)
    new_score = accuracy_score(y_val, y_pred_, normalize=True)

    # Positive value: accuracy dropped when `c` was removed; negative: it improved.
    print(f'{c}: {baseline_accuracy - new_score}')

make: -0.0016785564414603105
model: 0.020981955518254325
year: -0.002937473772555599
engine_hp: 0.01720520352496857
engine_cylinders: -0.0008392782207301552
transmission_type: 0.0
vehicle_style: 0.012589173310952662
highway_mpg: -0.0016785564414603105
city_mpg: 0.012589173310952662


### Question 6: ridge regression on log price (validation RMSE for several `alpha` values)

In [25]:
# Regression target: log(1 + price).
# NOTE: this overwrites `data['price']` in place, so the cell is not idempotent.
# Re-running it without first re-running the data-preparation cells applies the log twice.
data['price'] = np.log1p(data['price'].values)

In [26]:
# Same 60/20/20 split as before, this time on the frame with the log-transformed price.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Targets are the log prices of each split.
y_train = train_data['price']
y_val = val_data['price']

# Feature records exclude the target column.
train_dicts = train_data.drop('price', axis=1).to_dict(orient='records')
val_dicts = val_data.drop('price', axis=1).to_dict(orient='records')

# Fit the encoder on train only, then apply it to validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [27]:
# Regularisation strengths to compare.
alphas = [0, 0.01, 0.1, 1, 10]

# Fit one ridge model per alpha and report its validation RMSE on the log-price scale.
for a in alphas:
    model = Ridge(solver='sag', alpha=a, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so passing it raises TypeError on current releases.
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'{a}: {round(score, 3)}')