In [596]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

In [597]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


#### Features to be used

In [598]:
features = ['Make',
'Model',
'Year',
'Engine HP',
'Engine Cylinders',
'Transmission Type',
'Vehicle Style',
'highway MPG',
'city mpg',
'MSRP']

features

['Make',
 'Model',
 'Year',
 'Engine HP',
 'Engine Cylinders',
 'Transmission Type',
 'Vehicle Style',
 'highway MPG',
 'city mpg',
 'MSRP']

#### Data Preparation:
- Select only features from above and transform thier names as follows
    - data.columns = data.columns.str.replace(' ', '_').str.lower()
- Fill in the missing values of the selected features with 0
- Rename MSRP variable to price

In [599]:
df_new = df[features]
df_new.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [600]:
# Fill spaces with underscore and change to lowercase only
df_new.columns = df_new.columns.str.replace(' ', '_').str.lower()

In [601]:
# Check missing values
df_new.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [602]:
# Fill and check missing values
df_new = df_new.fillna(0)
df_new.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [603]:
# rename msrp
df_new.rename(columns={'msrp':'price'}, inplace=True)

## Question 1: 
- What is the most frequent observation (mode) for the column 'transmission_type'?

In [604]:
df_new.transmission_type.mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

In [605]:
df_new.transmission_type.describe()

count         11914
unique            5
top       AUTOMATIC
freq           8266
Name: transmission_type, dtype: object

## Question 2:
- Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

In [606]:
df_new.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [607]:
# Select numerical types
numerical = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [608]:
# Correlation between price and numerical variables
df_new[numerical].corrwith(df_new.price).to_frame('correlation')

Unnamed: 0,correlation
year,0.22759
engine_hp,0.650095
engine_cylinders,0.526274
highway_mpg,-0.160043
city_mpg,-0.157676


In [609]:
# Correlation between numerical variables
for i in numerical:
    print("Correlation between Numerical values and {0}".format(i))
    print(df_new[numerical].corrwith(df_new[i]).to_frame('correlation'))
    print()

Correlation between Numerical values and year
                  correlation
year                 1.000000
engine_hp            0.338714
engine_cylinders    -0.040708
highway_mpg          0.258240
city_mpg             0.198171

Correlation between Numerical values and engine_hp
                  correlation
year                 0.338714
engine_hp            1.000000
engine_cylinders     0.774851
highway_mpg         -0.415707
city_mpg            -0.424918

Correlation between Numerical values and engine_cylinders
                  correlation
year                -0.040708
engine_hp            0.774851
engine_cylinders     1.000000
highway_mpg         -0.614541
city_mpg            -0.587306

Correlation between Numerical values and highway_mpg
                  correlation
year                 0.258240
engine_hp           -0.415707
engine_cylinders    -0.614541
highway_mpg          1.000000
city_mpg             0.886829

Correlation between Numerical values and city_mpg
                  

#### Make price binary using
- Now we need to turn the price variable from numeric into a binary format.
- Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [610]:
mean_price = df_new.price.mean()

In [611]:
df_new['above_average'] = (df_new.price > mean_price).astype(int)
df_new.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


#### Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (price) is not in your dataframe.

In [612]:
# using 'from sklearn.model_selection import train_test_split'
df_train_full, df_test = train_test_split(df_new, test_size=0.2, random_state=42) 

In [613]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [614]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [615]:
df_new.shape

(11914, 11)

In [616]:
# shuffle indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [617]:
# Set our target variables (price/above_average in this case?)
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [618]:
del df_train_full['price']
del df_train['price']
del df_test['price']

## Question 3
- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

In [619]:
df_train_full.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'above_average'],
      dtype='object')

In [620]:
df_train_full.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
above_average          int32
dtype: object

In [621]:
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [622]:
# using 'from sklearn.metrics import mutual_info_score'
mutual_info_score(df_train_full.above_average, df_train_full.make)

0.2387236479073192

In [623]:
# Check mutual information score with the other features
for feature in categorical:
    print(feature, mutual_info_score(df_train_full.above_average, df_train_full[feature]))

make 0.2387236479073192
model 0.46099440756035703
transmission_type 0.020883575914971142
vehicle_style 0.08339022741593435


##### Alternate method for viewing the MI as a dataframe

In [624]:
# Apply mutual info comparison to every other variable
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.above_average)

df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


df_mi

Unnamed: 0,MI
model,0.460994
make,0.238724
vehicle_style,0.08339
transmission_type,0.020884


## Question 4:
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

- What accuracy did you get?

In [625]:
from sklearn.feature_extraction import DictVectorizer

In [626]:
df_train.nunique()

make                  48
model                869
year                  28
engine_hp            339
engine_cylinders       9
transmission_type      5
vehicle_style         16
highway_mpg           55
city_mpg              62
above_average          2
dtype: int64

Our feautures have different more than 2 options, eg we have 869 different models, how do we then encode this into binary?

In [627]:
# Turn features into dictionary in prep for One Hot Encoding
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict[0]

{'make': 'Mitsubishi',
 'model': 'Endeavor',
 'transmission_type': 'AUTOMATIC',
 'vehicle_style': '4dr SUV',
 'year': 2011,
 'engine_hp': 225.0,
 'engine_cylinders': 6.0,
 'highway_mpg': 19,
 'city_mpg': 15}

In [628]:
# Fit the model on training dataset
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [629]:
dv.get_feature_names_out()

array(['city_mpg', 'engine_cylinders', 'engine_hp', 'highway_mpg',
       'make=Acura', 'make=Alfa Romeo', 'make=Aston Martin', 'make=Audi',
       'make=BMW', 'make=Bentley', 'make=Bugatti', 'make=Buick',
       'make=Cadillac', 'make=Chevrolet', 'make=Chrysler', 'make=Dodge',
       'make=FIAT', 'make=Ferrari', 'make=Ford', 'make=GMC',
       'make=Genesis', 'make=HUMMER', 'make=Honda', 'make=Hyundai',
       'make=Infiniti', 'make=Kia', 'make=Lamborghini', 'make=Land Rover',
       'make=Lexus', 'make=Lincoln', 'make=Lotus', 'make=Maserati',
       'make=Maybach', 'make=Mazda', 'make=McLaren', 'make=Mercedes-Benz',
       'make=Mitsubishi', 'make=Nissan', 'make=Oldsmobile',
       'make=Plymouth', 'make=Pontiac', 'make=Porsche',
       'make=Rolls-Royce', 'make=Saab', 'make=Scion', 'make=Spyker',
       'make=Subaru', 'make=Suzuki', 'make=Tesla', 'make=Toyota',
       'make=Volkswagen', 'make=Volvo', 'model=1 Series', 'model=100',
       'model=124 Spider', 'model=190-Class', 'model

In [630]:
# Transform features into a matrix
X_train = dv.transform(train_dict)
X_train

array([[1.500e+01, 6.000e+00, 2.250e+02, ..., 0.000e+00, 0.000e+00,
        2.011e+03],
       [1.700e+01, 6.000e+00, 2.760e+02, ..., 0.000e+00, 0.000e+00,
        2.009e+03],
       [1.200e+01, 1.000e+01, 5.700e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       ...,
       [1.700e+01, 6.000e+00, 2.600e+02, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       [1.900e+01, 4.000e+00, 1.360e+02, ..., 0.000e+00, 0.000e+00,
        1.993e+03],
       [1.700e+01, 6.000e+00, 3.650e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [631]:
X_train[:5]

array([[  15.,    6.,  225., ...,    0.,    0., 2011.],
       [  17.,    6.,  276., ...,    0.,    0., 2009.],
       [  12.,   10.,  570., ...,    0.,    0., 2012.],
       [  20.,    4.,  200., ...,    0.,    0., 2016.],
       [  20.,    4.,  158., ...,    0.,    0., 2009.]])

#### Logistic Regression

In [632]:
from sklearn.linear_model import LogisticRegression

In [633]:
y_train

array([0, 0, 1, ..., 0, 0, 0])

In [634]:
# Train a model
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [635]:
model.intercept_

array([-0.3601927])

In [636]:
model.coef_[0]

array([ 7.75600465e-02, -1.16768073e-01,  3.65897177e-02, -2.68580486e-03,
        1.35354961e+00,  1.80042557e+00,  5.95702725e-01,  2.91174358e+00,
        2.39384223e+00,  1.78618275e-01,  3.30036316e-08, -4.91737424e-01,
        2.35937594e+00, -1.50178449e+00, -1.41669918e+00, -3.85113146e+00,
       -5.12531955e-01,  3.74787716e-01, -1.91311202e+00, -9.79006014e-01,
        6.20825014e-01, -2.43317314e-01, -1.31269054e+00, -2.69375207e+00,
        3.23536986e-01, -1.53257475e+00,  1.06102659e-02,  1.95644428e+00,
        1.32569107e+00,  1.20210701e+00,  4.03811516e+00,  1.05531262e+00,
        4.81391092e-03, -1.66324306e+00,  4.61690596e-04,  9.42732488e-01,
       -2.01345063e+00, -9.95222760e-01, -1.43812186e+00, -3.49639392e-01,
       -3.19551108e+00,  2.02131089e+00,  9.79245249e-01,  9.14695021e-01,
       -2.35179864e-01,  3.31607539e-01, -2.72759498e+00, -1.92603564e+00,
        3.25063673e+00, -7.66511172e-01, -7.35127369e-01,  1.18759074e+00,
       -1.34785088e+00, -

In [637]:
# hard prediction
model.predict(X_train)

array([0, 0, 1, ..., 0, 0, 0])

In [638]:
# Soft Prediction
model.predict_proba(X_train)

array([[9.89321040e-01, 1.06789599e-02],
       [9.57872508e-01, 4.21274917e-02],
       [6.47335952e-05, 9.99935266e-01],
       ...,
       [9.81311814e-01, 1.86881860e-02],
       [9.99976106e-01, 2.38942282e-05],
       [6.38045123e-01, 3.61954877e-01]])

In [639]:
# Use trained model on validation dataset
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
X_val

array([[2.300e+01, 4.000e+00, 2.100e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.700e+01, 6.000e+00, 3.540e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [2.200e+01, 4.000e+00, 1.400e+02, ..., 1.000e+00, 0.000e+00,
        2.005e+03],
       ...,
       [1.200e+01, 6.000e+00, 1.900e+02, ..., 0.000e+00, 0.000e+00,
        2.003e+03],
       [1.400e+01, 8.000e+00, 4.300e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.800e+01, 6.000e+00, 3.210e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [640]:
y_pred = model.predict(X_val)
y_pred

array([0, 1, 0, ..., 0, 1, 1])

In [641]:
y_val

array([0, 1, 0, ..., 0, 1, 1])

In [642]:
(y_val == y_pred)

array([ True,  True,  True, ...,  True,  True,  True])

In [643]:
accuracy = (y_val == y_pred).mean().round(2)
accuracy

0.95

0.95 accuracy!

## Question 5:
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [644]:
features = categorical + numerical
features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'highway_mpg',
 'city_mpg']

In [645]:
new_feat = []
for i in features:
    new_feat.append(i)
    
    train_set = df_train[new_feat].to_dict(orient='records')

    #   Fit the model on training dataset
    dv = DictVectorizer(sparse=False)
    dv.fit(train_set)
    
    X_train = dv.transform(train_set)
    
#   Train model with features
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    model.predict(X_train)
    
#     print('with features: {0}\n {1}\n'.format(new_feat, model.predict(X_train)[:15]))

#   Use trained model on validation dataset
    val_set = df_val[new_feat].to_dict(orient='records')
    X_val = dv.transform(val_set)
    
    predict = model.predict(X_val)
    
    acc = ((y_val == predict).mean().round(2))
    
    print('with features: {0}\n  accuracy is: {1}\n Difference: {2}\n'.format(new_feat, acc, (accuracy-acc).round(2)))

with features: ['make']
  accuracy is: 0.86
 Difference: 0.09

with features: ['make', 'model']
  accuracy is: 0.92
 Difference: 0.03

with features: ['make', 'model', 'transmission_type']
  accuracy is: 0.92
 Difference: 0.03

with features: ['make', 'model', 'transmission_type', 'vehicle_style']
  accuracy is: 0.93
 Difference: 0.02

with features: ['make', 'model', 'transmission_type', 'vehicle_style', 'year']
  accuracy is: 0.9
 Difference: 0.05

with features: ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp']
  accuracy is: 0.94
 Difference: 0.01

with features: ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp', 'engine_cylinders']
  accuracy is: 0.92
 Difference: 0.03

with features: ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp', 'engine_cylinders', 'highway_mpg']
  accuracy is: 0.93
 Difference: 0.02

with features: ['make', 'model', 'transmission_type', 'vehicle_style', 'year', 'engine_hp', 'engi

In [650]:
def log_reg(feats):
    train_set = df_train[feats].to_dict(orient='records')

    #   Fit the model on training dataset
    dv = DictVectorizer(sparse=False)
    dv.fit(train_set)
    
    X_train = dv.transform(train_set)
    
#   Train model with features
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    model.predict(X_train)   

#   Use trained model on validation dataset
    val_set = df_val[feats].to_dict(orient='records')
    X_val = dv.transform(val_set)
    
    predict = model.predict(X_val)
    
    acc = ((y_val == predict).mean().round(6))
    
    return(acc)

In [651]:
features = categorical + numerical

In [652]:
original_acc = log_reg(features)
print(original_acc)

0.947125


In [654]:
for i in range(9):
    features = categorical + numerical
    print('Accuracy with "{0}" out:'.format(features[i]))
    del features[i]
    acc = log_reg(features)
    print(acc)
    print('Difference: {0}\n'.format((original_acc - acc).round(6)))
#     print(features)
#     print(len(features))

Accuracy with "make" out:
0.947545
Difference: -0.00042

Accuracy with "model" out:
0.920269
Difference: 0.026856

Accuracy with "transmission_type" out:
0.945867
Difference: 0.001258

Accuracy with "vehicle_style" out:
0.932018
Difference: 0.015107

Accuracy with "year" out:
0.948384
Difference: -0.001259

Accuracy with "engine_hp" out:
0.924885
Difference: 0.02224

Accuracy with "engine_cylinders" out:
0.947125
Difference: 0.0

Accuracy with "highway_mpg" out:
0.944188
Difference: 0.002937

Accuracy with "city_mpg" out:
0.932438
Difference: 0.014687



## Question 6:
- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?
- 0
- 0.01
- 0.1
- 1
- 10

In [655]:
df_new.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,0


In [656]:
# Apply log transormation
df_new.price = np.log1p(df_new.price.values)
df_new.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,10.739349,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,10.612779,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,10.500977,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,10.290483,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,10.448744,0


In [657]:
from sklearn.linear_model import Ridge

In [658]:
df_train_full, df_test = train_test_split(df_new, test_size=0.2, random_state=42) 

In [659]:
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [660]:
# shuffle indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [661]:
# Set our target variables (price in this case)
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [662]:
new_feats = df_new.columns

In [663]:
train_set = df_train[new_feats].to_dict(orient='records')

#   Fit the model on training dataset
dv = DictVectorizer(sparse=False)
dv.fit(train_set)
    
X_train = dv.transform(train_set)
X_train

array([[0.000e+00, 1.500e+01, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        2.011e+03],
       [0.000e+00, 1.700e+01, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        2.009e+03],
       [1.000e+00, 1.200e+01, 1.000e+01, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       ...,
       [0.000e+00, 1.700e+01, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        2.012e+03],
       [0.000e+00, 1.900e+01, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        1.993e+03],
       [0.000e+00, 1.700e+01, 6.000e+00, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [664]:
y_train

array([10.42228135, 10.17526888, 12.42118806, ..., 10.25224121,
        7.60140233, 10.60214453])

In [665]:
# define rmse functin
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [666]:
alphas = [0, 0.01, 0.1, 1, 10]

In [667]:
model = Ridge(solver='sag', alpha = 0, random_state=42)
model.fit(X_train, y_train)

model.predict(X_train)   

#   Use trained model on validation dataset
val_set = df_val[new_feats].to_dict(orient='records')
X_val = dv.transform(val_set)
    
y_pred = model.predict(X_val)
y_pred 



array([10.24864769, 11.02132209,  9.87811981, ...,  9.9169917 ,
       11.7683166 , 10.89407268])

In [668]:
rmse(y_val, y_pred)

0.03378039425719436

In [669]:
# Iterate through the various alphas
for i in alphas:
    model = Ridge(solver='sag', alpha = i, random_state=42)
    model.fit(X_train, y_train)

    model.predict(X_train)   

    #   Use trained model on validation dataset
    val_set = df_val[new_feats].to_dict(orient='records')
    X_val = dv.transform(val_set)

    y_pred = model.predict(X_val)
    
    print('RMSE with alpha {0}:\n {1}'.format(i, rmse(y_val, y_pred)))



RMSE with alpha 0:
 0.03378039425719436




RMSE with alpha 0.01:
 0.033781720212342534




RMSE with alpha 0.1:
 0.033793657608226814




RMSE with alpha 1:
 0.03391339954053769
RMSE with alpha 10:
 0.03514574950716563


