In [1]:
# Get data
# !wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

#### Data Preparation

In [2]:
PATH = "../data/data.csv"
data = pd.read_csv(PATH)
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
data.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [4]:
select_cols = ["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"] 
data = data[select_cols]
data.columns = data.columns.str.replace(' ', '_').str.lower()
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [5]:
data.rename(columns={"msrp": "price"}, inplace=True)
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [6]:
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
price                 0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
count,11914.0,11845.0,11884.0,11914.0,11914.0,11914.0
mean,2010.384338,249.38607,5.628829,26.637485,19.733255,40594.74
std,7.57974,109.19187,1.780559,8.863001,8.987798,60109.1
min,1990.0,55.0,0.0,12.0,7.0,2000.0
25%,2007.0,170.0,4.0,22.0,16.0,21000.0
50%,2015.0,227.0,6.0,26.0,18.0,29995.0
75%,2016.0,300.0,6.0,30.0,22.0,42231.25
max,2017.0,1001.0,16.0,354.0,137.0,2065902.0


In [8]:
data.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [9]:
data.fillna(0, inplace=True)
data.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

- `AUTOMATIC`
- `MANUAL`
- `AUTOMATED_MANUAL`
- `DIRECT_DRIVE`

In [10]:
data.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

- Answer 1: 'AUTOMATIC'

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- `highway_mpg` and `city_mpg`

In [11]:
data[['engine_hp']].corrwith(data.year)

engine_hp    0.338714
dtype: float64

In [12]:
data[['engine_hp']].corrwith(data.engine_cylinders)

engine_hp    0.774851
dtype: float64

In [13]:
data[['highway_mpg']].corrwith(data.engine_cylinders)

highway_mpg   -0.614541
dtype: float64

In [14]:
data[['highway_mpg']].corrwith(data.city_mpg)

highway_mpg    0.886829
dtype: float64

In [15]:
numeric_columns = ['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']
data[numeric_columns].corr()

Unnamed: 0,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
engine_hp,1.0,0.338714,0.774851,-0.415707,-0.424918
year,0.338714,1.0,-0.040708,0.25824,0.198171
engine_cylinders,0.774851,-0.040708,1.0,-0.614541,-0.587306
highway_mpg,-0.415707,0.25824,-0.614541,1.0,0.886829
city_mpg,-0.424918,0.198171,-0.587306,0.886829,1.0


- Answer 2: hightway_mpg and city_mpg

In [16]:
# Make price binary
df = data.copy()
# df['price'] = np.where(df['price']>=df['price'].mean(),1,0)
df["above_average"] = df.price.apply(lambda x: x > df.price.mean()).astype(int)

df.drop("price", axis=1, inplace=True)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


In [17]:
# split_data
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df, test_size=0.25, random_state=42)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- `transmission_type`
- `vehicle_style`

In [20]:
category_columns = ['make', 'model', 'transmission_type', 'vehicle_style']

def calculate_mi(series):
    score = mutual_info_score(series, df_train.above_average)
    return round(score, 2)

#@ IMPLEMENTATION:
df_mi = df_train[category_columns].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
model,0.46
make,0.24
vehicle_style,0.08
transmission_type,0.02


- Answer 3: transmission_type

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.60
- 0.72
- 0.84
- 0.95

In [41]:
df_train[category_columns + numeric_columns]

Unnamed: 0,make,model,transmission_type,vehicle_style,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
0,Lotus,Evora,MANUAL,Coupe,276.0,2013,6.0,26,18
1,Maserati,GranTurismo,AUTOMATIC,Coupe,454.0,2016,8.0,21,13
2,Toyota,Prius c,AUTOMATIC,4dr Hatchback,99.0,2014,4.0,46,53
3,Alfa Romeo,4C,AUTOMATED_MANUAL,Convertible,237.0,2015,4.0,34,24
4,Volkswagen,GTI,MANUAL,4dr Hatchback,200.0,2012,4.0,31,21
...,...,...,...,...,...,...,...,...,...
8930,Toyota,Venza,AUTOMATIC,Wagon,181.0,2014,4.0,26,20
8931,Pontiac,G6,AUTOMATIC,Sedan,219.0,2009,6.0,26,17
8932,Volkswagen,Golf GTI,AUTOMATED_MANUAL,2dr Hatchback,220.0,2016,4.0,33,25
8933,Saab,9-5,AUTOMATIC,Wagon,260.0,2009,4.0,27,17


In [22]:
train_dict = df_train[category_columns + numeric_columns].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

In [23]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [24]:
val_dict = df_val[category_columns + numeric_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict(X_val)

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.93


- Answer 4: ~0.93

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `year`
- `engine_hp`
- `transmission_type`
- `city_mpg`

> **Note**: the difference doesn't have to be positive

In [31]:
category_columns, numeric_columns

(['make', 'model', 'transmission_type', 'vehicle_style'],
 ['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg'])

In [25]:
df_train[category_columns + numeric_columns]

Unnamed: 0,make,model,transmission_type,vehicle_style,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
0,Lotus,Evora,MANUAL,Coupe,276.0,2013,6.0,26,18
1,Maserati,GranTurismo,AUTOMATIC,Coupe,454.0,2016,8.0,21,13
2,Toyota,Prius c,AUTOMATIC,4dr Hatchback,99.0,2014,4.0,46,53
3,Alfa Romeo,4C,AUTOMATED_MANUAL,Convertible,237.0,2015,4.0,34,24
4,Volkswagen,GTI,MANUAL,4dr Hatchback,200.0,2012,4.0,31,21
...,...,...,...,...,...,...,...,...,...
8930,Toyota,Venza,AUTOMATIC,Wagon,181.0,2014,4.0,26,20
8931,Pontiac,G6,AUTOMATIC,Sedan,219.0,2009,6.0,26,17
8932,Volkswagen,Golf GTI,AUTOMATED_MANUAL,2dr Hatchback,220.0,2016,4.0,33,25
8933,Saab,9-5,AUTOMATIC,Wagon,260.0,2009,4.0,27,17


In [37]:
features = category_columns + numeric_columns
features


['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'engine_hp',
 'year',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [39]:
orig_score = accuracy

for col in features:
    subset = features.copy()
    subset.remove(col)
    
    train_dict = df_train[subset].to_dict(orient="records")
    
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    
    X_train = dv.transform(train_dict)
    
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    val_dict = df_val[subset].to_dict(orient="records")
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict(X_val)
    
    score = accuracy_score(y_val, y_pred)
    print(col, orig_score - score, score)

make -0.015619335347431984 0.945619335347432
model 0.009221215172876884 0.9207787848271232
transmission_type -0.00957703927492437 0.9395770392749244
vehicle_style -0.008569989929506527 0.9385699899295066
engine_hp -0.005548841893252665 0.9355488418932527
year -0.020654582074521644 0.9506545820745217
engine_cylinders -0.018640483383685735 0.9486404833836858
highway_mpg -0.014276602886874712 0.9442766028868748
city_mpg -0.012598187311178233 0.9425981873111783


- Answer 5: engine_hp

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

- 0
- 0.01
- 0.1
- 1
- 10

> **Note**: If there are multiple options, select the smallest `alpha`.


In [42]:
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [43]:
# Normalize the data
data['price'] = np.log1p(data['price'])
data.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744


In [44]:
# Split the data
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [45]:
# Prepare the data
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [48]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [49]:
del df_train['price']
del df_val['price']
del df_test['price']

In [50]:
train_dict = df_train[category_columns + numeric_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[category_columns + numeric_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [51]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 4))



0 0.4868




0.01 0.4868




0.1 0.4868




1 0.4868
10 0.487




- Answer 6: 0

## Submit the results

* Submit your results here: https://forms.gle/FFfNjEP4jU4rxnL26
* You can submit your solution multiple times. In this case, only the last submission will be used 
* If your answer doesn't match options exactly, select the closest one