In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np
from math import sqrt

In [33]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

--2023-10-01 04:40:45--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.1’


2023-10-01 04:40:46 (1.51 MB/s) - ‘data.csv.1’ saved [1475504/1475504]



In [34]:
df = pd.read_csv('data.csv')
columns = ['Make',
           'Model',
           'Year',
           'Engine HP',
           'Engine Cylinders',
           'Transmission Type',
           'Vehicle Style',
           'highway MPG',
           'city mpg']
target = df["MSRP"]
df = df[columns]
df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.fillna(0)
df["price"] = target
df.head()


Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [35]:
numerical = [column for column in df.columns if df[column].dtype in ('int64', 'float64') and column != "price"]
print(numerical)
categorical = [column for column in df.columns if df[column].dtype == 'O' and column != "price"]
print(categorical)

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']
['make', 'model', 'transmission_type', 'vehicle_style']


### Question 1

In [36]:
df["transmission_type"].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

### Question 2

In [37]:
df[numerical].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


highway mpg and city mpg have highest correlation

In [38]:
df["above_average"] = df["price"].apply(lambda x: 1 if x > df["price"].mean() else 0)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [39]:
# split your data in train/val/test sets with 60%/20%/20% distribution

p1 = 0.6  # Training
p2 = 0.2  # Validation
p3 = 0.2  # Test

df_full_train, df_test = train_test_split(df, test_size=p3, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=p2 / (1 - p3), random_state=42)

print(len(df_train), len(df_val), len(df_test))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train["above_average"].values
y_val = df_val["above_average"].values
y_test = df_test["above_average"].values

7148 2383 2383


### Question 3

In [40]:
def mutual_info_above_average_score(series):
    return mutual_info_score(series, y_train)

mi = df_train[categorical].apply(mutual_info_above_average_score)
mi.sort_values(ascending=False)

model                0.462344
make                 0.239769
vehicle_style        0.084143
transmission_type    0.020958
dtype: float64

### Question 4

In [41]:
dv = DictVectorizer()

train_dict = df_train[numerical + categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dict = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy_rounded = round(accuracy, 2)
accuracy_rounded

0.95

### Question 5

In [42]:
feature_and_accuracy_diff = []
features = ["year",
            "engine_hp",
            "transmission_type",
            "city_mpg"]

for feature in features:

    remaining_series = [s for s in features if s != feature]

    dv = DictVectorizer()

    train_dict = df_train[remaining_series].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[remaining_series].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    remaining_accuracy = accuracy_score(y_val, y_pred)
    accuracy_diff = accuracy_rounded - remaining_accuracy
    feature_and_accuracy_diff.append((feature, round(accuracy_diff, 5)))

feature_and_accuracy_diff

[('year', 0.06456),
 ('engine_hp', 0.20556),
 ('transmission_type', 0.06792),
 ('city_mpg', 0.07337)]

### Question 6

In [43]:
df_train["price_log"] = np.log1p(df_train["price"])
df_val["price_log"] = np.log1p(df_val["price"])
df_test["price_log"] = np.log1p(df_test["price"])

y_train = df_train["price_log"].values
y_val = df_val["price_log"].values
y_test = df_test["price_log"].values

dv = DictVectorizer()

train_dict = df_train[numerical + categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

accuracies_alpha = []

for alpha in [0, 0.01, 0.1, 1, 10]:

    model = Ridge(solver='sag', random_state=42, alpha=alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = sqrt(mean_squared_error(y_val, y_pred))
    rmse_rounded = round(rmse, 3)
    
    accuracies_alpha.append((alpha, rmse_rounded))

accuracies_alpha

[(0, 0.255), (0.01, 0.255), (0.1, 0.255), (1, 0.258), (10, 0.336)]

alpha = 0