In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-09-30 16:01:40--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.1’


2023-09-30 16:01:41 (1.08 MB/s) - ‘data.csv.1’ saved [1475504/1475504]



In [4]:
df = pd.read_csv('data.csv')

features = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
            'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg']

df = df[features+['MSRP']]

df.columns = df.columns.str.replace(' ', '_').str.lower()

df[list(df.columns)] = df[list(df.columns)].fillna(0)

df.columns = ['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price']

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


# Question 1: AUTOMATIC
What is the most frequent observation (mode) for the column transmission_type?

* AUTOMATIC
* MANUAL
* AUTOMATED_MANUAL
* DIRECT_DRIVE

In [5]:
df['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

# Question 2: engine_hp, engine_cylinders
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

* engine_hp and year
* engine_hp and engine_cylinders
* highway_mpg and engine_cylinders
* highway_mpg and city_mpg

In [6]:
numerical = ['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']

df[numerical].corr()

Unnamed: 0,engine_hp,year,engine_cylinders,highway_mpg,city_mpg
engine_hp,1.0,0.338714,0.774851,-0.415707,-0.424918
year,0.338714,1.0,-0.040708,0.25824,0.198171
engine_cylinders,0.774851,-0.040708,1.0,-0.614541,-0.587306
highway_mpg,-0.415707,0.25824,-0.614541,1.0,0.886829
city_mpg,-0.424918,0.198171,-0.587306,0.886829,1.0


# Make price binary

Now we need to turn the price variable from numeric into a binary format.
Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

* Now we need to turn the price variable from numeric into a binary format.
* Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [7]:
price_mean = np.mean(df['price'].tolist())
df['above_average'] = [1 if x >= price_mean else 0 for x in df['price'].tolist()]

In [8]:
df['above_average'].value_counts()

0    8645
1    3269
Name: above_average, dtype: int64

# Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value (price) is not in your dataframe.

In [9]:
# test is 20%
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# full train = 80%, final train is 60% and validation is 20%
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [11]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

del df_train['price']
del df_val['price']
del df_test['price']

# Question 3, transmission_type

* Calculate the mutual information score between above_average and other categorical variables in our dataset. 
* Use the training set only.
* Round the scores to 2 decimals using round(score, 2).

Which of these variables has the lowest mutual information score?

* make
* model
* transmission_type
* vehicle_style

In [12]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [13]:
def mutual_info_price_score(series):
    return mutual_info_score(series, df_full_train.above_average)

In [14]:
mi = df_full_train[categorical].apply(mutual_info_price_score)
mi.sort_values(ascending=False)

model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

# Question 4, 0.95 (or 0.93)

Now let's train a logistic regression.

* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
* To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
* model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

    
What accuracy did you get?

* 0.60
* 0.72
* 0.84
* 0.95

In [15]:
# one-hot encoding for categorical 
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')

X_val = dv.transform(val_dict)

In [16]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [17]:
# accuracy, round to 2 numbers
y_pred = model.predict_proba(X_val)[:, 1] #prob of churn, soft predictions 
price_decision = (y_pred >= 0.5) # make a hard decision on val dataset
np.round((y_val == price_decision).mean(),2)

0.93

In [18]:
categorical 

['make', 'model', 'transmission_type', 'vehicle_style']

In [19]:
numerical

['engine_hp', 'year', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [20]:
X_train

array([[1.500e+01, 6.000e+00, 2.250e+02, ..., 0.000e+00, 0.000e+00,
        2.011e+03],
       [1.700e+01, 6.000e+00, 2.760e+02, ..., 0.000e+00, 0.000e+00,
        2.009e+03],
       [1.200e+01, 1.000e+01, 5.700e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       ...,
       [1.700e+01, 6.000e+00, 2.600e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       [1.900e+01, 4.000e+00, 1.360e+02, ..., 0.000e+00, 0.000e+00,
        1.993e+03],
       [1.700e+01, 6.000e+00, 3.650e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

# Question 5: engine_hp

Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of following feature has the smallest difference?

Options: 
* year
* engine_hp
* transmission_type
* city_mpg

Note: the difference doesn't have to be positive

In [28]:
global_above_average = df_full_train.above_average.mean()
global_above_average

0.2767810303221068

In [27]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).above_average.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_above_average
    df_group['risk'] = df_group['mean'] / global_above_average
    display(df_group)
    print()
    print()

make


Unnamed: 0_level_0,mean,count,diff,risk
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acura,0.391089,202,0.114308,1.412991
Alfa Romeo,1.0,5,0.723219,3.612964
Aston Martin,1.0,74,0.723219,3.612964
Audi,0.654412,272,0.377631,2.364366
BMW,0.822064,281,0.545283,2.970088
Bentley,1.0,55,0.723219,3.612964
Bugatti,1.0,3,0.723219,3.612964
Buick,0.123377,154,-0.153404,0.445755
Cadillac,0.881988,322,0.605207,3.18659
Chevrolet,0.181313,899,-0.095468,0.655076




model


Unnamed: 0_level_0,mean,count,diff,risk
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Series,0.416667,12,0.139886,1.505402
100,0.000000,11,-0.276781,0.000000
124 Spider,0.000000,2,-0.276781,0.000000
190-Class,0.000000,4,-0.276781,0.000000
2,0.000000,10,-0.276781,0.000000
...,...,...,...,...
iQ,0.000000,3,-0.276781,0.000000
tC,0.000000,13,-0.276781,0.000000
xA,0.000000,5,-0.276781,0.000000
xB,0.000000,7,-0.276781,0.000000




transmission_type


Unnamed: 0_level_0,mean,count,diff,risk
transmission_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUTOMATED_MANUAL,0.474206,504,0.197425,1.713291
AUTOMATIC,0.310017,6619,0.033236,1.120079
DIRECT_DRIVE,0.458333,48,0.181552,1.655942
MANUAL,0.138652,2344,-0.138129,0.500944
UNKNOWN,0.0,16,-0.276781,0.0




vehicle_style


Unnamed: 0_level_0,mean,count,diff,risk
vehicle_style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2dr Hatchback,0.0,421,-0.276781,0.0
2dr SUV,0.035398,113,-0.241383,0.127893
4dr Hatchback,0.046099,564,-0.230682,0.166555
4dr SUV,0.374619,1970,0.097838,1.353486
Cargo Minivan,0.0,60,-0.276781,0.0
Cargo Van,0.0,73,-0.276781,0.0
Convertible,0.55538,632,0.278599,2.006567
Convertible SUV,0.153846,26,-0.122935,0.555841
Coupe,0.496257,935,0.219476,1.792958
Crew Cab Pickup,0.337017,543,0.060236,1.217629






In [29]:
def prepare_data(features):
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[features].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # accuracy, round to 2 numbers
    y_pred = model.predict_proba(X_val)[:, 1] #prob of churn, soft predictions 
    price_decision = (y_pred >= 0.5) # make a hard decision on val dataset
    return np.round((y_val == price_decision).mean(),2)

In [40]:
features_li = categorical + numerical
full_model_accuracy = prepare_data(features_li)
full_model_accuracy

0.93

In [None]:
year:  -0.02 
engine_hp: 0.0
transmission_type -0.02
city_mpg -0.02

In [43]:
for feature in features_li: 
    
    l = features_li.copy()
    l.remove(feature)
    #print (l, 'absent feature:', feature)
    print ('absent feature:', feature, ';', 'accuracy diff:', np.round(full_model_accuracy-prepare_data(l), 4))

absent feature: make ; accuracy diff: -0.02
absent feature: model ; accuracy diff: 0.01
absent feature: transmission_type ; accuracy diff: -0.02
absent feature: vehicle_style ; accuracy diff: 0.0
absent feature: engine_hp ; accuracy diff: 0.0
absent feature: year ; accuracy diff: -0.02
absent feature: engine_cylinders ; accuracy diff: -0.02
absent feature: highway_mpg ; accuracy diff: -0.02
absent feature: city_mpg ; accuracy diff: -0.02


# Question 6
For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column price. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
* This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
* Round your RMSE scores to 3 decimal digits.
* Which of these alphas leads to the best RMSE on the validation set?

Options
* 0
* 0.01
* 0.1
* 1
* 10
    
Note: If there are multiple options, select the smallest alpha.



In [73]:
df = pd.read_csv('data.csv')

features = ['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
            'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg']

df = df[features+['MSRP']]

df.columns = df.columns.str.replace(' ', '_').str.lower()

df[list(df.columns)] = df[list(df.columns)].fillna(0)

df.columns = ['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price']


df['price'] = np.log1p(df['price'])

In [74]:
df[0:3]

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977


In [75]:
# test is 20%
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# full train = 80%, final train is 60% and validation is 20%
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

#del df_train['above_average']
#del df_val['above_average']
#del df_test['above_average']

del df_train['price']
del df_val['price']
del df_test['price']

In [76]:
df_train[0:2]

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15
1,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17


In [77]:
y_train[0:3]

array([10.42228135, 10.17526888, 12.42118806])

In [85]:
features = categorical + numerical

dv = DictVectorizer(sparse=False)

train_dict = df_full_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
y_train = df_full_train.price.values
    
val_dict = df_val[features].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [88]:
X_train[0:2]

array([[2.200e+01, 4.000e+00, 2.650e+02, ..., 1.000e+00, 0.000e+00,
        2.016e+03],
       [1.400e+01, 8.000e+00, 4.490e+02, ..., 0.000e+00, 0.000e+00,
        2.017e+03]])

In [87]:
y_train[0:2]

array([10.88736216, 11.4494637 ])

In [81]:
features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'engine_hp',
 'year',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.

This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].

In [83]:
X_train[0:2]

array([[  15.,    6.,  225., ...,    0.,    0., 2011.],
       [  17.,    6.,  276., ...,    0.,    0., 2009.]])

In [84]:
y_train[0:2]

array([10.42228135, 10.17526888])

In [90]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [None]:
for alpha in [0, 0.01, 0.1, 1, 10]:
    clf = Ridge(alpha=alpha, solver='sag', random_state=42)
    clf.fit(X_train, y_train)

    # RMSE on validation: round to 3 decimal digits. 
    y_pred = clf.predict(X_val) #prediction on val  
    #rmse = np.round(mean_squared_error(y_val, y_pred, squared=False),4)

    print ('alpha: %s, rmse: %s' % (alpha, rmse(y_val, y_pred)))



alpha: 0, rmse: 0.47666390339106063




alpha: 0.01, rmse: 0.47666418445654185




alpha: 0.1, rmse: 0.4766667139432609


In [None]:
# Which of these alphas leads to the best RMSE on the validation set? 

In [67]:
y_pred

array([10.23197934, 11.00719493,  9.22024756, ...,  9.03745341,
       11.54439939, 10.84847629])

In [68]:
y_val

array([4.45750343, 4.77959649, 4.30297994, ..., 4.34183006, 5.09235214,
       4.72402997])