In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
!curl -LJO https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 33 1440k   33  489k    0     0   524k      0  0:00:02 --:--:--  0:00:02  524k
100 1440k  100 1440k    0     0   772k      0  0:00:01  0:00:01 --:--:--  772k


In [88]:
df = pd.read_csv('data.csv')

In [5]:
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [89]:
features = ['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg','MSRP']

In [90]:
df = df[features]
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [91]:
for col in list(df.dtypes[df.dtypes == 'object'].index):
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [92]:
for col in list(df.dtypes[df.dtypes != 'object'].index):
    df[col] = df[col].fillna(0)

In [93]:
df.rename(columns={'MSRP':'price'}, inplace=True)
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,price
0,bmw,1_series_m,2011,335.0,6.0,manual,coupe,26,19,46135
1,bmw,1_series,2011,300.0,6.0,manual,convertible,28,19,40650
2,bmw,1_series,2011,300.0,6.0,manual,coupe,28,20,36350
3,bmw,1_series,2011,230.0,6.0,manual,coupe,28,18,29450
4,bmw,1_series,2011,230.0,6.0,manual,convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,46120
11910,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,56670
11911,acura,zdx,2012,300.0,6.0,automatic,4dr_hatchback,23,16,50620
11912,acura,zdx,2013,300.0,6.0,automatic,4dr_hatchback,23,16,50920


In [94]:
### Question 1
df['Transmission Type'].value_counts(normalize=True)

Transmission Type
automatic           0.693806
manual              0.246349
automated_manual    0.052543
direct_drive        0.005708
unknown             0.001595
Name: proportion, dtype: float64

In [95]:
df[['Year','Engine HP','Engine Cylinders','highway MPG','city mpg','price']].corr()

Unnamed: 0,Year,Engine HP,Engine Cylinders,highway MPG,city mpg,price
Year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
Engine HP,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
Engine Cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway MPG,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [96]:
### Question 2: highway_mpg and city_mpg have the biggest correlation

In [97]:
price_mean = df['price'].mean()
price_mean

40594.737032063116

In [98]:
df['above_average'] = df['price'].apply(lambda x: 1 if x>price_mean else 0)

In [99]:
df['above_average'].value_counts(normalize=True) 

above_average
0    0.725617
1    0.274383
Name: proportion, dtype: float64

In [143]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)
len(df_full_train), len(df_test)

(9531, 2383)

In [144]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [145]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [146]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [147]:
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']
del df_train['price']
del df_val['price']
del df_test['price']


In [148]:
df_train

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg
0,mitsubishi,endeavor,2011,225.0,6.0,automatic,4dr_suv,19,15
1,kia,borrego,2009,276.0,6.0,automatic,4dr_suv,21,17
2,lamborghini,gallardo,2012,570.0,10.0,manual,convertible,20,12
3,chevrolet,colorado,2016,200.0,4.0,automatic,crew_cab_pickup,27,20
4,pontiac,vibe,2009,158.0,4.0,automatic,4dr_hatchback,26,20
...,...,...,...,...,...,...,...,...,...
7143,toyota,sienna,2016,266.0,6.0,automatic,passenger_minivan,25,18
7144,chevrolet,hhr,2009,260.0,4.0,manual,wagon,29,21
7145,hyundai,veracruz,2012,260.0,6.0,automatic,4dr_suv,22,17
7146,mitsubishi,expo,1993,136.0,4.0,manual,2dr_hatchback,26,19


In [149]:
def mutual_info_churn_score(series):
    return mutual_info_score(y_train, series)

In [150]:
### Question 3
df_train[['Make','Model','Transmission Type','Vehicle Style']].apply(mutual_info_churn_score).sort_values(ascending=False).round(2)

Model                0.46
Make                 0.24
Vehicle Style        0.08
Transmission Type    0.02
dtype: float64

## Logistic Regression model

In [151]:
train_dict = df_train.to_dict(orient='records')

In [152]:
dv = DictVectorizer(sparse=False) 

In [153]:
# prepare the training set
X_train = dv.fit_transform(train_dict)
X_train

array([[   6.,  225.,    0., ..., 2011.,   15.,   19.],
       [   6.,  276.,    0., ..., 2009.,   17.,   21.],
       [  10.,  570.,    0., ..., 2012.,   12.,   20.],
       ...,
       [   6.,  260.,    0., ..., 2012.,   17.,   22.],
       [   4.,  136.,    0., ..., 1993.,   19.,   26.],
       [   6.,  365.,    0., ..., 2015.,   17.,   25.]])

In [154]:
# prepare the validation set
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [155]:
### training
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

In [156]:
y_pred = model.predict(X_val)
y_pred

array([0, 1, 0, ..., 0, 1, 1], dtype=int64)

In [161]:
acc = (y_pred==y_val).mean().round(2)
acc

0.94

In [158]:
np.sum(y_pred)

666

In [159]:
np.sum(y_val)

661

## Question 5

In [164]:
features

['Make',
 'Model',
 'Year',
 'Engine HP',
 'Engine Cylinders',
 'Transmission Type',
 'Vehicle Style',
 'highway MPG',
 'city mpg']

In [165]:
def logistic_reg(df_train, df_val,col_exclude):
    
    df_train = df_train.copy()
    df_train.drop(col_exclude, axis=1, inplace=True)
    df_val = df_val.copy()
    df_val.drop(col_exclude, axis=1, inplace=True)

    train_dict = df_train.to_dict(orient='records')
    dv = DictVectorizer(sparse=False) 

    X_train = dv.fit_transform(train_dict)

    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)

    return (y_pred==y_val).mean()
    

In [170]:
scores = []
for feature in features:
    scores.append(logistic_reg(df_train, df_val,col_exclude=feature)-acc)

In [171]:
print(list(zip(features,scores)))

[('Make', 0.002089802769618232), ('Model', -0.023927822073017135), ('Year', 0.00796475031472943), ('Engine HP', -0.015954678976080494), ('Engine Cylinders', 0.005446915652538853), ('Transmission Type', 0.004607637431808698), ('Vehicle Style', 0.0008308854385229436), ('highway MPG', 0.002509441879983254), ('city mpg', -0.007561896768778831)]


In [138]:
## Question 5: City MPG seems to affect less the accuracy

## Question 6

In [186]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [174]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state=42)
len(df_full_train), len(df_test)

(9531, 2383)

In [175]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [176]:
y_train = df_train.price.values
y_val = df_val.price.values

In [177]:
del df_train['price']
del df_val['price']

In [180]:
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

In [183]:
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer() 

X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [196]:
alphas = [0, 0.01, 0.1, 1, 10]
rmses = []

In [197]:
for alpha in alphas :
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(preds,y_val))
    rmse = (round(rmse,3))
    rmses.append(rmse)


print(list(zip(alphas,rmses)))

[(0, 7.764), (0.01, 7.764), (0.1, 7.764), (1, 7.764), (10, 7.765)]


In [None]:
### 0 is the answer for question 6