In [133]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

%matplotlib inline

In [134]:
df = pd.read_csv('data.csv')

In [135]:
len(df)

11914

In [136]:
# Columns kept for the analysis. 'MSRP' is renamed to 'price' further down
# and serves as the prediction target.
selected_columns = [
    'Make',
    'Model',
    'Year',
    'Engine HP',
    'Engine Cylinders',
    'Transmission Type',
    'Vehicle Style',
    'highway MPG',
    'city mpg',
    'MSRP',
]


In [137]:
df_sel=df[selected_columns]

In [138]:
df_sel

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [139]:
# `data` is bound to the very same object as `df_sel` (no copy is made), so
# the relabelling below also changes the column labels seen through `df_sel`.
# The later `df_sel.rename(columns={'msrp': ...})` cell depends on that.
data = df_sel
# Normalise labels: lowercase, spaces replaced with underscores.
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [140]:
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [141]:
data=data.fillna(0)

In [142]:
data.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [143]:
data.rename

<bound method DataFrame.rename of           make       model  year  engine_hp  engine_cylinders  \
0          BMW  1 Series M  2011      335.0               6.0   
1          BMW    1 Series  2011      300.0               6.0   
2          BMW    1 Series  2011      300.0               6.0   
3          BMW    1 Series  2011      230.0               6.0   
4          BMW    1 Series  2011      230.0               6.0   
...        ...         ...   ...        ...               ...   
11909    Acura         ZDX  2012      300.0               6.0   
11910    Acura         ZDX  2012      300.0               6.0   
11911    Acura         ZDX  2012      300.0               6.0   
11912    Acura         ZDX  2013      300.0               6.0   
11913  Lincoln      Zephyr  2006      221.0               6.0   

      transmission_type  vehicle_style  highway_mpg  city_mpg   msrp  
0                MANUAL          Coupe           26        19  46135  
1                MANUAL    Convertible     

In [144]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [145]:
# Rename the target column; rebinding `data` instead of mutating in place.
data = data.rename(columns={'msrp': 'price'})

In [146]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [147]:
data.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [148]:
data.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [149]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [150]:
data.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

In [151]:
# Feature groups used by the correlation, mutual-information and
# DictVectorizer steps below.
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]
numerical = [
    'year',
    'engine_hp',
    'engine_cylinders',
    'highway_mpg',
    'city_mpg',
]

In [152]:
data.price

0        46135
1        40650
2        36350
3        29450
4        34500
         ...  
11909    46120
11910    56670
11911    50620
11912    50920
11913    28995
Name: price, Length: 11914, dtype: int64

In [153]:
# Absolute correlation of every numerical feature with the price,
# stored as a one-column frame named 'correlation'.
corr_df = (
    data[numerical]
    .corrwith(data['price'])
    .abs()
    .to_frame(name='correlation')
)

In [154]:
corr_df.sort_values(by='correlation', ascending=False)

Unnamed: 0,correlation
engine_hp,0.650095
engine_cylinders,0.526274
year,0.22759
highway_mpg,0.160043
city_mpg,0.157676


In [155]:
price_mean=data.price.mean()

In [156]:
price_mean

40594.737032063116

In [157]:
# Binary target: 1 when the price is strictly above the mean price, else 0.
data['above_average'] = data['price'].gt(price_mean).astype(int)

In [158]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920,1


In [159]:
del data['price']


In [160]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1


In [161]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
df_train_full, df_test = train_test_split(data, random_state=42, test_size=0.2)

In [162]:
# 25% of the remaining 80% becomes validation, i.e. 20% of all rows (60/20/20 overall).
df_train, df_val = train_test_split(df_train_full, random_state=42, test_size=0.25)

In [163]:
df_test

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
3995,GMC,Envoy XL,2005,275.0,6.0,AUTOMATIC,4dr SUV,18,13,0
7474,Volkswagen,Passat,2016,170.0,4.0,AUTOMATIC,Sedan,38,25,0
7300,Honda,Odyssey,2016,248.0,6.0,AUTOMATIC,Passenger Minivan,28,19,0
3148,Chevrolet,Cruze,2015,138.0,4.0,MANUAL,Sedan,36,25,0
747,Volvo,740,1991,162.0,4.0,AUTOMATIC,Sedan,20,17,0
...,...,...,...,...,...,...,...,...,...,...
267,Nissan,350Z,2007,306.0,6.0,MANUAL,Convertible,24,17,0
4320,Ford,Expedition,2016,365.0,6.0,AUTOMATIC,4dr SUV,20,15,1
5799,Acura,ILX,2015,150.0,4.0,AUTOMATIC,Sedan,35,24,0
6080,Volkswagen,Jetta SportWagen,2014,170.0,5.0,MANUAL,Wagon,33,23,0


In [164]:
# Pull the binary target out of every split as a NumPy array.
y_train_full, y_train, y_val, y_test = (
    split['above_average'].values
    for split in (df_train_full, df_train, df_val, df_test)
)


In [165]:
# Remove the target column from every split so it cannot be used as a feature.
for _split in (df_train_full, df_train, df_val, df_test):
    del _split['above_average']
del _split  # do not leave the loop variable behind in the notebook namespace

In [166]:
def calculate_mi(series):
    """Return the mutual information score between ``series`` and ``y_train``.

    ``y_train`` is not a parameter: it is read from the notebook's global
    namespace at call time, so the result depends on whichever ``y_train``
    is currently bound.
    """
    return mutual_info_score(series, y_train)


# Score every categorical column against the training target, highest first.
df_mi = (
    df_train[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
)
df_mi.to_frame('MI')

Unnamed: 0,MI
model,0.462344
make,0.239769
vehicle_style,0.084143
transmission_type,0.020958


In [167]:
data.make.value_counts()

make
Chevrolet        1123
Ford              881
Volkswagen        809
Toyota            746
Dodge             626
Nissan            558
GMC               515
Honda             449
Mazda             423
Cadillac          397
Mercedes-Benz     353
Suzuki            351
BMW               334
Infiniti          330
Audi              328
Hyundai           303
Volvo             281
Subaru            256
Acura             252
Kia               231
Mitsubishi        213
Lexus             202
Buick             196
Chrysler          187
Pontiac           186
Lincoln           164
Oldsmobile        150
Land Rover        143
Porsche           136
Saab              111
Aston Martin       93
Plymouth           82
Bentley            74
Ferrari            69
FIAT               62
Scion              60
Maserati           58
Lamborghini        52
Rolls-Royce        31
Lotus              29
Tesla              18
HUMMER             17
Maybach            16
Alfa Romeo          5
McLaren             5
Spyke

In [168]:
# One dict per row (column -> value) for the train and validation splits,
# restricted to the categorical + numerical feature columns.
dict_train, dict_val = (
    split[categorical + numerical].to_dict(orient='records')
    for split in (df_train, df_val)
)

In [169]:
# Preview only the first few records; echoing the whole list writes one dict
# per training row into the notebook output.
dict_train[:3]

[{'make': 'Mitsubishi',
  'model': 'Endeavor',
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': '4dr SUV',
  'year': 2011,
  'engine_hp': 225.0,
  'engine_cylinders': 6.0,
  'highway_mpg': 19,
  'city_mpg': 15},
 {'make': 'Kia',
  'model': 'Borrego',
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': '4dr SUV',
  'year': 2009,
  'engine_hp': 276.0,
  'engine_cylinders': 6.0,
  'highway_mpg': 21,
  'city_mpg': 17},
 {'make': 'Lamborghini',
  'model': 'Gallardo',
  'transmission_type': 'MANUAL',
  'vehicle_style': 'Convertible',
  'year': 2012,
  'engine_hp': 570.0,
  'engine_cylinders': 10.0,
  'highway_mpg': 20,
  'city_mpg': 12},
 {'make': 'Chevrolet',
  'model': 'Colorado',
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': 'Crew Cab Pickup',
  'year': 2016,
  'engine_hp': 200.0,
  'engine_cylinders': 4.0,
  'highway_mpg': 27,
  'city_mpg': 20},
 {'make': 'Pontiac',
  'model': 'Vibe',
  'transmission_type': 'AUTOMATIC',
  'vehicle_style': '4dr Hatchback',
  'year': 2009

In [170]:
dv=DictVectorizer()

In [171]:
dv.fit(dict_train)

In [172]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [173]:
X_train=dv.transform(dict_train)
X_val=dv.transform(dict_val)

In [174]:
X_train.shape

(7148, 943)

In [175]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [176]:
model.fit(X_train,y_train)

In [177]:
display(model.predict(X_val))
display(y_val)

array([0, 1, 0, ..., 0, 1, 1])

array([0, 1, 0, ..., 0, 1, 1])

In [178]:
y_val_pred=model.predict_proba(X_val)[:,1]

In [179]:
y_val_pred

array([8.45426579e-04, 9.96341923e-01, 1.49816720e-04, ...,
       2.62411354e-04, 9.89700971e-01, 9.87620006e-01])

In [180]:
above_pred_in_val=y_val_pred >=0.5

In [181]:
# NOTE: despite the `train_` prefix, both values compare predictions with
# `y_val`, so they are validation accuracies. The names are kept because a
# later cell reads `train_acc`.
train_acc = np.mean(y_val == above_pred_in_val)
train_acc_round = train_acc.round(2)

In [182]:
train_acc

0.9450272765421738

In [183]:
train_acc_round

0.95

In [184]:
all_features=categorical+numerical
all_features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [185]:
all_features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [186]:
# Leave-one-feature-out check: retrain the same logistic regression with each
# feature removed in turn and compare its validation accuracy with the model
# trained on every feature.
#
# The first pass (feature=None) removes nothing and provides the baseline, so
# this cell does not depend on an accuracy computed by a different model in an
# earlier cell.
baseline_accuracy = None
for feature in [None] + all_features:
    working_features = [name for name in all_features if name != feature]

    dict_train_fe = df_train[working_features].to_dict(orient='records')
    dict_val_fe = df_val[working_features].to_dict(orient='records')

    # The vectorizer is fitted on the training records only and then applied
    # to both splits.
    dv_fe = DictVectorizer(sparse=False)
    dv_fe.fit(dict_train_fe)
    X_train_fe = dv_fe.transform(dict_train_fe)
    X_val_fe = dv_fe.transform(dict_val_fe)

    model_fe = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model_fe.fit(X_train_fe, y_train)

    y_val_pred_fe = model_fe.predict(X_val_fe)
    fe_accuracy = accuracy_score(y_val, y_val_pred_fe)
    if feature is None:
        baseline_accuracy = fe_accuracy

    print(f'EXCLUDING {str(feature).rjust(20)}: \t accuracy {fe_accuracy}\tdiff {abs(baseline_accuracy-fe_accuracy)}')

EXCLUDING                 None: 	 accuracy 0.9450272765421738	diff 0.0
EXCLUDING                 make: 	 accuracy 0.9467058329836341	diff 0.0016785564414603105
EXCLUDING                model: 	 accuracy 0.9240453210239195	diff 0.020981955518254325
EXCLUDING    transmission_type: 	 accuracy 0.9450272765421738	diff 0.0
EXCLUDING        vehicle_style: 	 accuracy 0.9324381032312211	diff 0.012589173310952662
EXCLUDING                 year: 	 accuracy 0.9479647503147294	diff 0.002937473772555599
EXCLUDING            engine_hp: 	 accuracy 0.9278220730172052	diff 0.01720520352496857
EXCLUDING     engine_cylinders: 	 accuracy 0.9458665547629039	diff 0.0008392782207301552
EXCLUDING          highway_mpg: 	 accuracy 0.9467058329836341	diff 0.0016785564414603105
EXCLUDING             city_mpg: 	 accuracy 0.9324381032312211	diff 0.012589173310952662


In [187]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,1


In [188]:
df_sel

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [189]:
# Start the regression part from the selected columns again. `df_sel` already
# carries the lowercase/underscore labels because an earlier cell relabelled
# it through the `data` alias, hence the lowercase 'msrp' key.
data = df_sel.rename(columns={'msrp': 'price'})

In [190]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [191]:
# Log-transform the target. The value is derived from the untouched
# `df_sel['msrp']` column rather than from `data.price` itself, so re-running
# this cell cannot apply log1p a second time to an already transformed column.
data['price'] = np.log1p(df_sel['msrp'])

In [192]:
data

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.739024
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.945018
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.832122
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,10.838031


In [193]:
# Replace missing values with 0 before splitting.
data = data.fillna(0)

# Same 60/20/20 split and seed as in the classification part.
df_train_full, df_test = train_test_split(data, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_train_full, random_state=42, test_size=0.25)

# Fresh 0..n-1 indices for the three splits that are used for modelling.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# Targets (log-scale price) as NumPy arrays.
y_train_full, y_train, y_val, y_test = (
    split['price'].values
    for split in (df_train_full, df_train, df_val, df_test)
)

# Drop the target from the feature frames.
df_train_full = df_train_full.drop(columns='price')
df_train = df_train.drop(columns='price')
df_val = df_val.drop(columns='price')
df_test = df_test.drop(columns='price')




In [194]:
df_train_full.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [195]:
y_train.shape, y_val.shape


((7148,), (2383,))

In [196]:
# Turn every remaining column into features: one dict per row, then a dense
# matrix from a DictVectorizer fitted on the training records only.
dict_train, dict_val = (
    split.to_dict(orient='records') for split in (df_train, df_val)
)
dv = DictVectorizer(sparse=False).fit(dict_train)
X_train, X_val = dv.transform(dict_train), dv.transform(dict_val)



In [199]:
!pip install pqdm

Collecting pqdm
  Downloading pqdm-0.2.0-py2.py3-none-any.whl (6.8 kB)
Collecting bounded-pool-executor
  Downloading bounded_pool_executor-0.0.3-py3-none-any.whl (3.4 kB)
Installing collected packages: bounded-pool-executor, pqdm
Successfully installed bounded-pool-executor-0.0.3 pqdm-0.2.0


You should consider upgrading via the 'D:\code-learnbox\zoomcamp\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [201]:
# Sweep the Ridge regularisation strength and report the validation RMSE
# (on the log-price scale) for each value.
alpha_values = [0, 0.01, 0.1, 1, 10]
for alpha in tqdm(alpha_values):
    model_rig = Ridge(alpha=alpha, solver='sag', random_state=42, max_iter=1000)
    model_rig.fit(X_train, y_train)
    y_pred = model_rig.predict(X_val)

    # Take the square root explicitly instead of passing `squared=False`:
    # that argument is deprecated in recent scikit-learn releases and was
    # later removed, while np.sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'Alpha={alpha}, rmse={rmse}')




  0%|          | 0/5 [00:00<?, ?it/s]



Alpha=0, rmse=0.48679431324238753




Alpha=0.01, rmse=0.4867945519275277




Alpha=0.1, rmse=0.4867967000189975




Alpha=1, rmse=0.48681817454327286
Alpha=10, rmse=0.48703228329751275




In [202]:
# Refit with alpha=0 so that `model_rig` / `y_pred` hold that model's results
# for the cell that follows.
# max_iter is spelled out to match the sweep; 1000 is also what the 'sag'
# solver uses when max_iter is left unset.
model_rig = Ridge(alpha=0, solver='sag', random_state=42, max_iter=1000)
model_rig.fit(X_train, y_train)
y_pred = model_rig.predict(X_val)
# np.sqrt(MSE) instead of the deprecated/removed `squared=False` argument.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Alpha={0}, rmse={rmse}')

Alpha=0, rmse=0.48679431324238753




In [132]:
# Undo the log1p transform to compare predictions and targets.
# NOTE(review): this cell's saved execution count (132) is lower than the
# first cell's (133), so the output stored with it presumably comes from an
# earlier kernel state - re-run it before relying on the numbers.
display(np.round(np.expm1(y_pred)))
display(np.expm1(y_val))

array([10., 11., 10., ..., 10., 12., 11.])

array([10.26381581, 11.00544424,  9.90802723, ...,  9.99747868,
       11.72558222, 10.87749987])