In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### get the data

In [145]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-09-30 16:17:05--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 

200 OK
Length: 1475504 (1,4M) [text/plain]
Saving to: ‘data.csv.2’


2023-09-30 16:17:06 (3,27 MB/s) - ‘data.csv.2’ saved [1475504/1475504]



In [195]:
# Load the downloaded CSV from the working directory and display the resulting frame.
csv_path = "data.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


### Features

In [196]:
# The rest of the homework uses only these columns as features.
features = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
]
# MSRP is set aside before the frame is narrowed to the feature columns.
target = df["MSRP"]

In [197]:
# Keep only the selected feature columns (all rows).
df = df.loc[:, features]

In [211]:
# Normalise the column labels: spaces become underscores and letters are lower-cased.
df.columns = [label.replace(" ", "_").lower() for label in df.columns]
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18
...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16


In [212]:
# Missing values of the selected features are to be filled with 0;
# first count how many are missing in each column.
df.isna().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
dtype: int64

In [213]:
# Replace every missing value with 0 (fillna returns a new frame).
df = df.fillna(value=0)

In [217]:
# Confirm that no missing values remain after the fill.
df.isna().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [215]:
# Rename MSRP variable to price: `price` is bound to the same object as `target`
# (plain assignment, no copy), which was taken from df.MSRP earlier.
price = target

### question #1

In [154]:
# What is the most frequent observation (mode) for the column transmission_type?

In [155]:
# The `top` / `freq` rows of describe() give the most frequent value and its count.
df["transmission_type"].describe()

count         11914
unique            5
top       AUTOMATIC
freq           8266
Name: transmission_type, dtype: object

`AUTOMATIC`

### question #2

In [156]:
# Numerical columns used for the correlation matrix, i.e. the correlation
# coefficient between every pair of these features.
numerical_features = [
    "year",
    "engine_hp",
    "engine_cylinders",
    "highway_mpg",
    "city_mpg",
]

In [157]:
# Pairwise correlation coefficients between the numerical columns.
numeric_view = df[numerical_features]
correlation_matrix = numeric_view.corr()
correlation_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [158]:
# What are the two features that have the biggest correlation in this dataset?

`highway_mpg` and `city_mpg`

#### Make price binary

In [159]:
# Now we need to turn the price variable from numeric into a binary format.
# Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [160]:
# Binary target: 1 when a price is strictly above the mean price, 0 otherwise.
# The mean is hoisted out of the comprehension so it is computed once instead of
# once per row (which made the cell quadratic in the number of rows).
mean_price = np.mean(price)
above_average = [1 if x > mean_price else 0 for x in price]

#### Split the data

In [161]:
# Split your data in train/val/test sets with 60%/20%/20% distribution.
# Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
# Make sure that the target value (above_average) is not in your dataframe.

In [163]:
# First split: hold out 20% of the rows as the test set (seed 42).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df,
    above_average,
    test_size=0.2,
    random_state=42,
)

In [164]:
# Second split: 25% of the remaining 80% becomes validation, i.e. 20% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

In [165]:
# Shapes of the train / validation / test feature frames (60% / 20% / 20% of the rows).
X_train.shape, X_val.shape, X_test.shape

((7148, 9), (2383, 9), (2383, 9))

### question #3

In [166]:
# Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).

In [168]:
# Show the first row as a reminder of the available columns.
df.iloc[:1]

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19


In [169]:
# String-valued columns, treated as categorical features.
categorical = [
    "make",
    "model",
    "transmission_type",
    "vehicle_style",
]

In [170]:
# Mutual information between each categorical feature and the binary target,
# rounded to 2 decimals. The task asks for the training set only, so this uses
# X_train / y_train; X_train_full / y_train_full also contain the validation rows.
for column in categorical:
    score = mutual_info_score(X_train[column], y_train)
    print(column, round(score, 2))

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


In [171]:
# Which of these variables has the lowest mutual information score?
# "make", "model", "transmission_type", "vehicle_style"

`transmission_type`

### question #4

In [172]:
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

In [173]:
# Vectorizer that turns lists of {feature: value} dicts into a feature matrix;
# sparse=False requests a dense array instead of a SciPy sparse matrix.
dv = DictVectorizer(sparse=False)

In [174]:
# Turn each training row into a {column: value} dict, then fit the vectorizer on
# those dicts and encode them. Note that X_train is rebound from the DataFrame
# to the encoded matrix here.
train_dicts = X_train.to_dict("records")
X_train = dv.fit_transform(train_dicts)

In [175]:
# Encode the validation rows with the already-fitted vectorizer (transform only,
# no refit). X_val is rebound from the DataFrame to the encoded matrix.
val_dicts = X_val.to_dict("records")
X_val = dv.transform(val_dicts)

In [1]:
# Fit the model on the training dataset.
#     To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
#     model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [177]:
# Logistic regression with the fixed parameters prescribed by the task.
lr = LogisticRegression(
    solver="liblinear",
    C=10,
    max_iter=1000,
    random_state=42,
)

In [178]:
# Train the logistic regression on the vectorized training matrix and its binary labels.
lr.fit(X_train, y_train)

In [179]:
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
preds = lr.predict(X_val)
round((preds == y_val).mean(), 2)

0.93

In [180]:
# Same validation accuracy, unrounded.
np.mean(preds == y_val)

0.9345362987830466

In [181]:
# What accuracy did you get?

`0.93`

### question #5

In [182]:
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# Shape check of the combined train+validation frame that is re-split below.
X_train_full.shape

(9531, 9)

In [183]:
# Re-create the DataFrame splits: X_train / X_val were rebound to encoded arrays
# in Q4. Same inputs and seed as the earlier split call.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

In [184]:
# First training row, to confirm X_train is a DataFrame again.
X_train.iloc[:1]

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15


In [185]:
# Convert the training rows to dicts, refit the vectorizer on them and encode;
# then show the shape of the encoded matrix.
X_train_dict = dv.fit_transform(X_train.to_dict(orient="records"))
X_train_dict.shape

(7148, 943)

In [186]:
# Encode the validation rows with the vectorizer fitted on the training rows;
# the column count must match the training matrix.
X_val_dict = dv.transform(X_val.to_dict(orient="records"))
X_val_dict.shape

(2383, 943)

In [187]:
# Refit the Q4 model on all features and predict the validation labels
# (fit returns the estimator itself, so the calls can be chained).
y_pred = lr.fit(X_train_dict, y_train).predict(X_val_dict)

In [188]:
# Baseline validation accuracy with every feature included.
accuracy_full = np.mean(y_pred == y_val)
accuracy_full

0.9345362987830466

In [189]:
# Candidate features for the elimination experiment.
feat = ["year", "engine_hp", "transmission_type", "city_mpg"]


In [190]:
# Feature elimination: for each candidate, retrain the Q4 model without that
# column and print accuracy_full minus the reduced model's validation accuracy.
# Intermediates use loop-specific names so that the notebook-level `df` (needed
# again for the Q6 split) and `dv` are not overwritten by this loop.
for feature in feat:
    # Drop the column from both splits so train and validation share one schema.
    train_subset = X_train.drop(columns=[feature])
    val_subset = X_val.drop(columns=[feature])

    # A fresh vectorizer per iteration: the encoded column set differs each time.
    subset_dv = DictVectorizer(sparse=False)
    train_matrix = subset_dv.fit_transform(train_subset.to_dict(orient="records"))
    val_matrix = subset_dv.transform(val_subset.to_dict(orient="records"))

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(train_matrix, y_train)

    # For each feature, calculate the difference between the original accuracy
    # and the accuracy without the feature.
    acc_feature = (model.predict(val_matrix) == y_val).mean()
    print(feature, round((accuracy_full - acc_feature), 5))

year -0.01385


engine_hp -0.00042
transmission_type -0.01049
city_mpg -0.01133


In [191]:
# Which of the following features has the smallest difference?

`engine_hp`

### question #6

In [192]:
# For this question, we'll see how to use a linear regression model from Scikit-Learn.
# We'll need to use the original column price. Apply the logarithmic transformation to this column.
# np.log1p computes log(1 + x) element-wise on the original (non-binary) price.
new_price = np.log1p(price)


In [218]:
# Display `df`; the split in the next cell pairs it row-for-row with `new_price`,
# so it must still be the full feature table here.
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18
...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16


In [219]:
# Same 80/20 split and seed as before, now with the log-transformed price as target.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df,
    new_price,
    test_size=0.2,
    random_state=42,
)

In [220]:
# Inspect the 80% train+validation portion produced by the split above.
X_train_full

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3181,Cadillac,CT6,2016,265.0,4.0,AUTOMATIC,Sedan,31,22
5357,Mercedes-Benz,GLS-Class,2017,449.0,8.0,AUTOMATIC,4dr SUV,18,14
4874,Kia,Forte,2016,173.0,4.0,AUTOMATIC,Coupe,34,25
8102,Dodge,RAM 250,1993,180.0,6.0,MANUAL,Regular Cab Pickup,16,11
10400,Hyundai,Tiburon,2008,172.0,6.0,AUTOMATIC,2dr Hatchback,24,17
...,...,...,...,...,...,...,...,...,...
11284,Toyota,Venza,2014,181.0,4.0,AUTOMATIC,Wagon,26,20
5191,Pontiac,G6,2009,219.0,6.0,AUTOMATIC,Sedan,26,17
5390,Volkswagen,Golf GTI,2016,220.0,4.0,AUTOMATED_MANUAL,2dr Hatchback,33,25
860,Saab,9-5,2009,260.0,4.0,AUTOMATIC,Wagon,27,17


In [221]:
# Carve the validation set (25% of the remaining rows) out of the train+validation portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

In [222]:
# Inspect the final training frame (still a DataFrame at this point).
X_train

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15
1997,Kia,Borrego,2009,276.0,6.0,AUTOMATIC,4dr SUV,21,17
5216,Lamborghini,Gallardo,2012,570.0,10.0,MANUAL,Convertible,20,12
2805,Chevrolet,Colorado,2016,200.0,4.0,AUTOMATIC,Crew Cab Pickup,27,20
11369,Pontiac,Vibe,2009,158.0,4.0,AUTOMATIC,4dr Hatchback,26,20
...,...,...,...,...,...,...,...,...,...
9232,Toyota,Sienna,2016,266.0,6.0,AUTOMATIC,Passenger Minivan,25,18
5710,Chevrolet,HHR,2009,260.0,4.0,MANUAL,Wagon,29,21
11306,Hyundai,Veracruz,2012,260.0,6.0,AUTOMATIC,4dr SUV,22,17
4414,Mitsubishi,Expo,1993,136.0,4.0,MANUAL,2dr Hatchback,26,19


In [224]:
# Standardise the numerical columns: the scaler is fitted on the training rows
# only and the fitted scaler is then applied to the validation rows.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numerical_features].to_numpy())
X_val_num = scaler.transform(X_val[numerical_features].to_numpy())

In [225]:
# Dense one-hot encoder; handle_unknown='ignore' encodes categories that were not
# seen during fit as all-zero rows instead of raising an error.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [226]:
# One-hot encode the categorical columns: fit on training rows, reuse for validation.
train_cat_values = X_train[categorical].to_numpy()
val_cat_values = X_val[categorical].to_numpy()
X_train_cat = ohe.fit_transform(train_cat_values)
X_val_cat = ohe.transform(val_cat_values)

In [227]:
# Join the scaled numerical block and the one-hot block side by side.
# X_train / X_val are rebound from DataFrames to the combined 2-D arrays.
X_train = np.hstack((X_train_num, X_train_cat))
X_val = np.hstack((X_val_num, X_val_cat))

In [230]:
# Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
# This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
# Round your RMSE scores to 3 decimal digits.
alpha = [0, 0.01, 0.1, 1, 10]
for a in alpha:
    lr = Ridge(solver="sag", random_state=42, alpha=a, max_iter=5000)
    lr.fit(X_train, y_train)
    # predict
    pred = lr.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated since scikit-learn 1.4
    # and removed in 1.6. Arguments follow the documented (y_true, y_pred) order.
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    print(a, round(rmse, 3))

0 0.218
0.01 0.218
0.1 0.218
1 0.231
10 0.321


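`0` gives the lowest RMSE (0.218), tied with `0.01` and `0.1` after rounding; the smallest of the tied alphas is reported.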