In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

### Dataset

In [2]:
df_ = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv')

In [3]:
df = df_.copy()

### Data preparation

Select only the features from above and transform their names 

In [4]:
columns = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 
           'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']

df = df[columns]

In [5]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

Fill in the missing values of the selected features with 0.


In [6]:
df.fillna(0, inplace=True)

Rename MSRP variable to price.


In [7]:
df.rename(columns={'msrp': 'price'}, inplace=True)

### Q1

What is the most frequent observation (mode) for the column transmission_type?

In [8]:
df['transmission_type'].mode()[0]

'AUTOMATIC'

### Q2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

In [9]:
df[['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']].corr().stack().nlargest(10)

engine_hp         engine_hp           1.000000
year              year                1.000000
engine_cylinders  engine_cylinders    1.000000
highway_mpg       highway_mpg         1.000000
city_mpg          city_mpg            1.000000
highway_mpg       city_mpg            0.886829
city_mpg          highway_mpg         0.886829
engine_hp         engine_cylinders    0.774851
engine_cylinders  engine_hp           0.774851
engine_hp         year                0.338714
dtype: float64

### Make price binary

Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [10]:
round(df['price'].mean())

40595

In [11]:
df['above_average'] = (df['price'] > df['price'].mean()).astype(int)

In [12]:
df.drop('price', axis=1, inplace=True)

### Split the data

Split your data in train/val/test sets with 60%/20%/20% distribution.

Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.

Make sure that the target value (above_average) is not in your dataframe.

In [13]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [14]:
X_train = df_train.drop('above_average', axis=1)
y_train = df_train['above_average']

X_val = df_val.drop('above_average', axis=1)
y_val = df_val['above_average']

X_test = df_test.drop('above_average', axis=1)
y_test = df_test['above_average']

### Q3

Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.

Round the scores to 2 decimals using round(score, 2).

Which of these variables has the lowest mutual information score?

In [15]:
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'O']

In [16]:
for col in categorical_columns:
    print(f'Mutual information score for {col}: {round(mutual_info_score(X_train[col], y_train), 2)}')

Mutual information score for make: 0.24
Mutual information score for model: 0.46
Mutual information score for transmission_type: 0.02
Mutual information score for vehicle_style: 0.08


### Q4

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [17]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val = dv.transform(X_val.to_dict(orient='records'))

In [18]:
lr = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [19]:
lr.fit(X_train, y_train)

y_pred = lr.predict_proba(X_val)[:, 1]

In [20]:
decisions = y_pred >= 0.5

accuracy_score = round((y_val == decisions).mean(), 2)
accuracy_score

0.95

### Q5

Train a model with all these features (using the same parameters as in Q4).

Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

year, engine_hp, transmission_type, city_mpg

In [21]:
exclude_columns = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [22]:
X_train = df_train.drop('above_average', axis=1)
y_train = df_train['above_average']

X_val = df_val.drop('above_average', axis=1)
y_val = df_val['above_average']

X_test = df_test.drop('above_average', axis=1)
y_test = df_test['above_average']

In [23]:
dv = DictVectorizer(sparse=False)
lr = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [24]:
for col in exclude_columns:
    X_train_ = X_train.drop(col, axis=1)
    X_val_ = X_val.drop(col, axis=1)

    X_train_ = dv.fit_transform(X_train_.to_dict(orient='records'))
    X_val_ = dv.transform(X_val_.to_dict(orient='records'))

    lr.fit(X_train_, y_train)
    y_pred = lr.predict_proba(X_val_)[:, 1]
    decisions = y_pred >= 0.5
    print(f'Excluding {col} from the model: {accuracy_score - (y_val == decisions).mean()} difference in accuracy score')

Excluding year from the model: 0.0016156105749055572 difference in accuracy score
Excluding engine_hp from the model: 0.025115400755350348 difference in accuracy score
Excluding transmission_type from the model: 0.004133445237096023 difference in accuracy score
Excluding city_mpg from the model: 0.01756189676877884 difference in accuracy score


### Q6

We'll need to use the original column price. Apply the logarithmic transformation to this column.

Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.

This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].

Round your RMSE scores to 3 decimal digits.

In [4]:
alphas = [0, 0.01, 0.1, 1, 10]

In [5]:
df = df_.copy()

columns = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 
           'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']

df = df[columns]

df.columns = df.columns.str.lower().str.replace(' ', '_')

df.fillna(0, inplace=True)

df.rename(columns={'msrp': 'price'}, inplace=True)

In [6]:
df['price'] = np.log1p(df['price'])

In [8]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

X_train = df_train.drop('price', axis=1)
y_train = df_train['price']

X_val = df_val.drop('price', axis=1)
y_val = df_val['price']

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val = dv.transform(X_val.to_dict(orient='records'))

In [9]:
for i in alphas:
    rr = Ridge(alpha=i, solver='sag', random_state=42)
    rr.fit(X_train, y_train)
    y_pred = rr.predict(X_val)
    print(f'Alpha: {i}, RMSE: {round(np.sqrt(((y_val - y_pred) ** 2).mean()), 3)}')

Alpha: 0, RMSE: 0.494
Alpha: 0.01, RMSE: 0.494
Alpha: 0.1, RMSE: 0.494
Alpha: 1, RMSE: 0.494
Alpha: 10, RMSE: 0.494
