## Data understanding

### Import data

In [281]:
import numpy as np 
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
import seaborn as sns 

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_squared_error

In [282]:
df = pd.read_csv('./New_cars_cleaned.csv')
df.shape

(32316, 56)

In [283]:
df.head(8)

Unnamed: 0,MSRP,EPA Fuel Economy Est - City (MPG),Engine,Drivetrain,Passenger Capacity,Passenger Doors,Base Curb Weight (lbs),Passenger Volume (ft³),Wheelbase (in),"Track Width, Front (in)","Height, Overall (in)","Fuel Tank Capacity, Approx (gal)",SAE Net Torque @ RPM,Fuel System,SAE Net Horsepower @ RPM,Displacement,Trans Description Cont.,Trans Type,Suspension Type - Front,Suspension Type - Rear,Air Bag-Frontal-Driver,Air Bag-Frontal-Passenger,Air Bag-Passenger Switch (On/Off),Air Bag-Side Body-Front,Air Bag-Side Body-Rear,Air Bag-Side Head-Front,Air Bag-Side Head-Rear,Brakes-ABS,Child Safety Rear Door Locks,Daytime Running Lights,Traction Control,Night Vision,Rollover Protection Bars,Fog Lamps,Parking Aid,Tire Pressure Monitor,Back-Up Camera,Stability Control,Basic Miles/km,Basic Years,Corrosion Miles/km,Corrosion Years,Drivetrain Miles/km,Drivetrain Years,Turning Diameter - Curb to Curb (ft),Front Wheel Material,Stabilizer Bar Diameter - Front (in),Roadside Assistance Years,Roadside Assistance Miles/km,Manufacturer,Model year,Category,Front tire width,Front tire aspect ratio,Front tire speed ratings/cons.type,Front tire rim size
0,40600.0,22.0,l4,FWD,5,4,3790.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,235,55.0,H,19.0
1,45500.0,22.0,l4,FWD,5,4,3829.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,235,55.0,H,19.0
2,43600.0,22.0,l4,FWD,5,4,3821.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,255,45.0,V,20.0
3,37400.0,22.0,l4,FWD,5,4,3783.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,235,55.0,H,19.0
4,42600.0,21.0,l4,AWD,5,4,4026.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,235,55.0,H,19.0
5,47500.0,21.0,l4,AWD,5,4,4068.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,235,55.0,H,19.0
6,45600.0,21.0,l4,AWD,5,4,4015.0,104.0,108.3,64.2,65.7,17.1,280.0,DI,272.0,2.0,Automatic,10,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,39.0,Aluminum,,,50000.0,Acura,2019,SUV,255,45.0,V,20.0
7,37500.0,19.0,V6,AWD,5,4,3902.0,103.5,105.7,63.1,65.0,16.0,252.0,SFI,279.0,3.5,Automatic,6,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1,1,50000.0,4.0,150000.0,5.0,70000.0,6.0,38.9,Aluminum,,,50000.0,Acura,2018,SUV,235,60.0,V,18.0


In [284]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32316 entries, 0 to 32315
Data columns (total 56 columns):
MSRP                                    32262 non-null float64
EPA Fuel Economy Est - City (MPG)       27027 non-null float64
Engine                                  30341 non-null object
Drivetrain                              30600 non-null object
Passenger Capacity                      32316 non-null int64
Passenger Doors                         32316 non-null int64
Base Curb Weight (lbs)                  19456 non-null float64
Passenger Volume (ft³)                  16756 non-null float64
Wheelbase (in)                          30301 non-null float64
Track Width, Front (in)                 20130 non-null float64
Height, Overall (in)                    15628 non-null float64
Fuel Tank Capacity, Approx (gal)        15615 non-null float64
SAE Net Torque @ RPM                    30249 non-null float64
Fuel System                             29486 non-null object
SAE Net Horsepow

## Data preparation

### Drop and summirize data

#### Drop Electric engine entries
Remove Electrical cars from data as technically different factors have influence. Also there are proportionally few entries for that type.

In [285]:
print('Number of entries with electric engines: ', df[df['Engine'] == 'Electric'].shape[0])
df = df[df['Engine'] != 'Electric']
df.shape

Number of entries with electric engines:  174


(32142, 56)

#### Drop Front tire width because many values inside are technically wrong/inconsistent 

In [286]:
df = df.drop('Front tire width', axis=1)
df.shape

(32142, 55)

#### Drop columns with missing values count above threshold

In [287]:
def drop_missing_col(df, thres):
    df_ret = df
    for col in df.keys():
        perc_missing = df_ret[col].isna().mean()
        if perc_missing >= thres:
            #print('In col: {} missing {:.4f}'.format(col, perc_missing))
            df_ret = df_ret.drop([col], axis=1)
    return df_ret

In [288]:
# Display percentage of missing values
100*df.isna().mean()

MSRP                                     0.155560
EPA Fuel Economy Est - City (MPG)       15.913758
Engine                                   6.144608
Drivetrain                               5.338809
Passenger Capacity                       0.000000
Passenger Doors                          0.000000
Base Curb Weight (lbs)                  39.938398
Passenger Volume (ft³)                  48.254620
Wheelbase (in)                           6.269056
Track Width, Front (in)                 37.900566
Height, Overall (in)                    51.477817
Fuel Tank Capacity, Approx (gal)        51.440483
SAE Net Torque @ RPM                     6.325058
Fuel System                              8.297555
SAE Net Horsepower @ RPM                 6.172609
Displacement                             6.160164
Trans Description Cont.                  6.306390
Trans Type                               6.153942
Suspension Type - Front                  0.000000
Suspension Type - Rear                   0.000000


In [289]:
# Threshold missing values in column: 0.1
miss_thres_col = 0.1
df = drop_missing_col(df, miss_thres_col)
df.shape

(32142, 25)

#### Drop columns which have constant values

In [290]:
def drop_const_col(df):
    df_ret = df
    for col in df.keys():
        num_unique = df_ret[col].nunique()
        if num_unique <= 1:
            #print('In col: {} num unique {}'.format(col, num_unique))
            df_ret = df_ret.drop([col], axis=1)
    return df_ret

In [291]:
df = drop_const_col(df)
df.shape

(32142, 25)

#### Drop possible duplicates

In [292]:
df = df.drop_duplicates()
df.shape

(28707, 25)

#### Drop rows with missing values

In [293]:
df = df.dropna()
df.shape

(28687, 25)

#### Remap some of the categorical values

In [294]:
# Map same types of engines
repl_dict_engine = {
    "I6" : "l6"
}
df = df.replace(repl_dict_engine)

In [295]:
# Map same types of drivetrains
repl_dict_drtrain = {
    "AWD" : "4WD"
}
df = df.replace(repl_dict_drtrain)

In [296]:
# Map same types of front wheel material
repl_dict_wheel = {
    "Alloy" : "Aluminum"
}
df = df.replace(repl_dict_wheel)

#### Display dataset again

In [297]:
df.head()

Unnamed: 0,MSRP,Passenger Capacity,Passenger Doors,Suspension Type - Front,Suspension Type - Rear,Air Bag-Frontal-Driver,Air Bag-Frontal-Passenger,Air Bag-Passenger Switch (On/Off),Air Bag-Side Body-Front,Air Bag-Side Body-Rear,Air Bag-Side Head-Front,Air Bag-Side Head-Rear,Brakes-ABS,Child Safety Rear Door Locks,Daytime Running Lights,Traction Control,Night Vision,Rollover Protection Bars,Fog Lamps,Parking Aid,Tire Pressure Monitor,Back-Up Camera,Stability Control,Manufacturer,Model year
0,40600.0,5,4,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,Acura,2019
1,45500.0,5,4,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,Acura,2019
2,43600.0,5,4,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,Acura,2019
3,37400.0,5,4,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1,1,Acura,2019
4,42600.0,5,4,MacPherson Strut,Link type,1,1,0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,Acura,2019


#### Create name lists with the data types

In [298]:
def fill_type_lists(df, list_labeled, list_numerical, list_categorical):
    for col in df.columns:
        data_type = df[col].dtypes
        # Object data
        if data_type == 'object':
            list_categorical.append(col)
        elif col in list_labeled:
            pass
        # Discrete/labeled data. Threshold 0.1 => about 278 labels
        elif df[col].nunique() <= 3 and data_type == 'int' and col != 'Model year':
            list_labeled.append(col)
        # Floats and countable integers
        else:
            list_numerical.append(col)

In [299]:
global list_labeled
global list_numerical
global list_categorical
list_labeled = []
list_numerical = []
list_categorical = []


In [300]:
list_labeled.append('Passenger Capacity')
list_labeled.append('Passenger Doors')
fill_type_lists(df, list_labeled, list_numerical, list_categorical)

### Outlier detection and removal

In [301]:
def detect_outliers(df):
    for col in df.keys():
        if col in list_numerical:
            q3 = df[col].quantile(0.75)
            q1 = df[col].quantile(0.25)
            iqr = q3 - q1
            upper_bound = q3 + iqr*1.5
            lower_bound = q1 - iqr*1.5
            df_temp = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
            print('Col: {}. Lower: {}. Upper: {}. Outlier proportion {}\n'.\
                format(col, lower_bound, upper_bound, (df.shape[0] - df_temp.shape[0])/df.shape[0]))

In [302]:
detect_outliers(df)

Col: MSRP. Lower: -3975.0. Upper: 67985.0. Outlier proportion 0.07916477847108447

Col: Model year. Lower: 1990.0. Upper: 2030.0. Outlier proportion 0.007529542998570781



In [303]:
list_labeled

['Passenger Capacity',
 'Passenger Doors',
 'Air Bag-Frontal-Driver',
 'Air Bag-Frontal-Passenger',
 'Air Bag-Passenger Switch (On/Off)',
 'Air Bag-Side Body-Front',
 'Air Bag-Side Body-Rear',
 'Air Bag-Side Head-Front',
 'Air Bag-Side Head-Rear',
 'Brakes-ABS',
 'Child Safety Rear Door Locks',
 'Daytime Running Lights',
 'Traction Control',
 'Night Vision',
 'Rollover Protection Bars',
 'Fog Lamps',
 'Parking Aid',
 'Tire Pressure Monitor',
 'Back-Up Camera',
 'Stability Control']

### Use Inter quartile range of the target variable

In [304]:
# Calculate some descriptive statistics 
mean = np.mean(df['MSRP'])
median = np.percentile(df['MSRP'], 50)
stdev= np.std(df['MSRP'])
q25, q75 = np.percentile(df['MSRP'], 25), np.percentile(df['MSRP'], 75)
iqr = q75 - q25
cut_off = iqr * 1.5
price_lower_bound, price_upper_bound = q25 - cut_off, q75 + cut_off
print('Lower bound: %.3f, upper bound: %.3f' % (price_lower_bound, price_upper_bound))
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
print('Mean:', mean)
print('StDev:', stdev)
print('median:', median)

Lower bound: -3975.000, upper bound: 67985.000
Percentiles: 25th=23010.000, 75th=41000.000, IQR=17990.000
Mean: 38091.75846202113
StDev: 33741.803367816676
median: 30520.0


In [305]:
# Cut entries outside the IQR
df = df[df['MSRP'] <= price_upper_bound]

In [306]:
df.shape

(26416, 25)

### Encode the categorical features

In [307]:
def one_hot_encoder(df, col_to_encode, dummy_na=False):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    '''
    df_ret = df
    
    for col in col_to_encode:
        df_ret = pd.concat([df_ret.drop(col, axis=1),pd.get_dummies(df_ret[col], prefix=col,\
            dummy_na=dummy_na, drop_first=True)],axis=1)
    return df_ret

In [308]:
df = one_hot_encoder(df, list_categorical)
df = one_hot_encoder(df, list_labeled)

In [309]:
#plt.figure(figsize=(10,5))
c = df.corr()['MSRP'].sort_values()
#sns.heatmap(c,cmap='BrBG', annot= True)
c

Suspension Type - Rear_Others              -0.208658
Suspension Type - Rear_Torsion Bar         -0.204699
Suspension Type - Front_Others             -0.196590
Manufacturer_Hyundai                       -0.167721
Passenger Capacity_3                       -0.148975
Manufacturer_Subaru                        -0.142770
Suspension Type - Front_MacPherson Strut   -0.130451
Manufacturer_Volkswagen                    -0.124310
Passenger Doors_2                          -0.123218
Manufacturer_Honda                         -0.120307
Manufacturer_Kia                           -0.115339
Manufacturer_Mitsubishi                    -0.109548
Manufacturer_Toyota                        -0.107726
Manufacturer_Mazda                         -0.084114
Air Bag-Passenger Switch (On/Off)_1        -0.082628
Suspension Type - Rear_MacPherson Strut    -0.079339
Passenger Capacity_5                       -0.074086
Manufacturer_Nissan                        -0.072411
Manufacturer_Ford                          -0.

### Split into feature set and target vector

In [310]:
X = df.drop('MSRP', axis=1)
y = df['MSRP']

### Train test split

In [311]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42) 

### Feature scaling

In [312]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [313]:
X_train.head()

Unnamed: 0,Model year,Suspension Type - Front_Axle,Suspension Type - Front_Coil Spring,Suspension Type - Front_Double Wishbone,Suspension Type - Front_Independent,Suspension Type - Front_Leaf type,Suspension Type - Front_Link type,Suspension Type - Front_MacPherson Strut,Suspension Type - Front_Others,Suspension Type - Front_Torsion Bar,Suspension Type - Rear_Axle,Suspension Type - Rear_Coil Spring,Suspension Type - Rear_Double Wishbone,Suspension Type - Rear_Independent,Suspension Type - Rear_Leaf type,Suspension Type - Rear_Link type,Suspension Type - Rear_MacPherson Strut,Suspension Type - Rear_Others,Suspension Type - Rear_Torsion Bar,Manufacturer_Alfa Romeo,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,Manufacturer_Chevrolet,Manufacturer_Chrysler,Manufacturer_Dodge,Manufacturer_Fiat,Manufacturer_Ford,Manufacturer_GMC,Manufacturer_Genesis,Manufacturer_Honda,Manufacturer_Hyundai,Manufacturer_Infiniti,Manufacturer_Jaguar,Manufacturer_Jeep,Manufacturer_Kia,Manufacturer_Land Rover,Manufacturer_Lexus,Manufacturer_Lincoln,Manufacturer_Lotus,Manufacturer_Maserati,Manufacturer_Mazda,Manufacturer_Mercedes-Benz,Manufacturer_Mini,Manufacturer_Mitsubishi,Manufacturer_Nissan,Manufacturer_Porsche,Manufacturer_Ram,Manufacturer_Smart,Manufacturer_Subaru,Manufacturer_Toyota,Manufacturer_Volkswagen,Manufacturer_Volvo,Passenger Capacity_2,Passenger Capacity_3,Passenger Capacity_4,Passenger Capacity_5,Passenger Capacity_6,Passenger Capacity_7,Passenger Capacity_8,Passenger Capacity_9,Passenger Capacity_12,Passenger Capacity_15,Passenger Doors_2,Passenger Doors_3,Passenger Doors_4,Air Bag-Frontal-Driver_1,Air Bag-Frontal-Passenger_1,Air Bag-Passenger Switch (On/Off)_1,Air Bag-Side Body-Front_1,Air Bag-Side Body-Rear_1,Air Bag-Side Head-Front_1,Air Bag-Side Head-Rear_1,Brakes-ABS_1,Child Safety Rear Door Locks_1,Daytime Running Lights_1,Traction Control_1,Rollover Protection Bars_1,Fog Lamps_1,Parking Aid_1,Tire Pressure Monitor_1,Back-Up Camera_1,Stability Control_1
25206,0.394465,-0.074477,-0.305875,-0.440682,2.221308,-0.065086,-0.20432,-0.803914,-0.296584,-0.251741,-0.137069,6.657408,-0.263791,-0.293877,-0.337415,-1.088779,-0.155762,-0.304393,-0.254884,-0.025483,-0.178343,-0.176885,-0.116594,-0.120811,-0.334624,-0.079798,-0.122868,-0.061201,-0.420026,-0.285321,-0.026524,-0.247092,-0.182498,-0.086078,-0.087658,-0.153747,-0.16585,-0.074477,-0.084469,-0.096041,-0.012738,0.0,-0.162199,-0.142695,-0.061201,-0.106142,-0.281759,-0.055607,5.919868,-0.034514,-0.222602,-0.330713,-0.21815,-0.121271,-0.203744,-0.253678,-0.272482,-1.034391,2.582524,-0.265896,-0.197028,-0.072241,-0.084144,-0.016446,-0.563964,-0.148886,0.689994,0.183445,0.240494,-0.356879,0.718351,-0.213345,0.785347,0.932762,0.316538,0.843596,-1.072318,0.701293,-0.158114,1.053852,-0.414487,0.707652,-0.656315,0.756282
11156,-0.835814,-0.074477,-0.305875,-0.440682,-0.450185,15.364378,-0.20432,-0.803914,-0.296584,-0.251741,-0.137069,-0.150209,-0.263791,-0.293877,-0.337415,0.91846,-0.155762,-0.304393,-0.254884,-0.025483,-0.178343,-0.176885,-0.116594,-0.120811,-0.334624,-0.079798,-0.122868,-0.061201,2.380804,-0.285321,-0.026524,-0.247092,-0.182498,-0.086078,-0.087658,-0.153747,-0.16585,-0.074477,-0.084469,-0.096041,-0.012738,0.0,-0.162199,-0.142695,-0.061201,-0.106142,-0.281759,-0.055607,-0.168923,-0.034514,-0.222602,-0.330713,-0.21815,-0.121271,-0.203744,-0.253678,-0.272482,-1.034391,2.582524,-0.265896,-0.197028,-0.072241,-0.084144,-0.016446,-0.563964,-0.148886,0.689994,0.183445,0.240494,-0.356879,-1.392077,-0.213345,-1.273322,-1.072085,0.316538,-1.185402,-1.072318,-1.425938,-0.158114,1.053852,-0.414487,-1.413124,-0.656315,-1.322258
9212,-1.519303,-0.074477,-0.305875,-0.440682,-0.450185,-0.065086,-0.20432,-0.803914,-0.296584,3.972341,-0.137069,-0.150209,-0.263791,-0.293877,-0.337415,0.91846,-0.155762,-0.304393,-0.254884,-0.025483,-0.178343,-0.176885,-0.116594,-0.120811,-0.334624,-0.079798,-0.122868,-0.061201,2.380804,-0.285321,-0.026524,-0.247092,-0.182498,-0.086078,-0.087658,-0.153747,-0.16585,-0.074477,-0.084469,-0.096041,-0.012738,0.0,-0.162199,-0.142695,-0.061201,-0.106142,-0.281759,-0.055607,-0.168923,-0.034514,-0.222602,-0.330713,-0.21815,-0.121271,-0.203744,3.941999,-0.272482,-1.034391,-0.387218,-0.265896,-0.197028,-0.072241,-0.084144,-0.016446,1.773163,-0.148886,-1.449288,0.183445,0.240494,2.802071,-1.392077,-0.213345,-1.273322,-1.072085,0.316538,-1.185402,-1.072318,-1.425938,-0.158114,1.053852,-0.414487,-1.413124,-0.656315,-1.322258
1686,1.351349,-0.074477,-0.305875,-0.440682,-0.450185,-0.065086,-0.20432,1.243913,-0.296584,-0.251741,-0.137069,-0.150209,-0.263791,-0.293877,-0.337415,0.91846,-0.155762,-0.304393,-0.254884,-0.025483,-0.178343,5.653387,-0.116594,-0.120811,-0.334624,-0.079798,-0.122868,-0.061201,-0.420026,-0.285321,-0.026524,-0.247092,-0.182498,-0.086078,-0.087658,-0.153747,-0.16585,-0.074477,-0.084469,-0.096041,-0.012738,0.0,-0.162199,-0.142695,-0.061201,-0.106142,-0.281759,-0.055607,-0.168923,-0.034514,-0.222602,-0.330713,-0.21815,-0.121271,-0.203744,-0.253678,-0.272482,0.966752,-0.387218,-0.265896,-0.197028,-0.072241,-0.084144,-0.016446,-0.563964,-0.148886,0.689994,0.183445,0.240494,-0.356879,0.718351,-0.213345,0.785347,0.932762,0.316538,0.843596,0.932559,0.701293,-0.158114,1.053852,2.412619,0.707652,1.523659,0.756282
7679,-0.015628,-0.074477,-0.305875,2.269211,-0.450185,-0.065086,-0.20432,-0.803914,-0.296584,-0.251741,-0.137069,-0.150209,-0.263791,-0.293877,-0.337415,0.91846,-0.155762,-0.304393,-0.254884,-0.025483,-0.178343,-0.176885,-0.116594,-0.120811,-0.334624,-0.079798,-0.122868,-0.061201,2.380804,-0.285321,-0.026524,-0.247092,-0.182498,-0.086078,-0.087658,-0.153747,-0.16585,-0.074477,-0.084469,-0.096041,-0.012738,0.0,-0.162199,-0.142695,-0.061201,-0.106142,-0.281759,-0.055607,-0.168923,-0.034514,-0.222602,-0.330713,-0.21815,-0.121271,-0.203744,3.941999,-0.272482,-1.034391,-0.387218,-0.265896,-0.197028,-0.072241,-0.084144,-0.016446,1.773163,-0.148886,-1.449288,0.183445,0.240494,-0.356879,-1.392077,-0.213345,-1.273322,-1.072085,0.316538,-1.185402,-1.072318,-1.425938,-0.158114,1.053852,-0.414487,0.707652,-0.656315,-1.322258


### Peincipal component analysis (PCA)

In [314]:
#X_train.isna().mean()

In [315]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [316]:
model = LinearRegression(normalize=True)
rfe = RFE(model, 20)
fit = rfe.fit(X, y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 20
Selected Features: [False False False False False False False False False False  True  True
  True  True  True  True  True  True  True False False False False False
 False False False  True False False False False  True False False False
  True False False False  True  True False False  True  True False  True
 False  True False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False  True False False False]
Feature Ranking: [64 30 28 34 32 31 35 29 33 27  1  1  1  1  1  1  1  1  1 47 46 22 10 26
 12 25 11  1  7 13 42  5  1 56 15  4  1 21 20 61  1  1  2 19  1  1  8  1
 39  1  6  9  3 57 41 18 55 63 44 23 17 16 37  1 49 38 59 48 24 58 51 50
 53 62 14 60 54 40 45 43  1 65 36 52]


In [317]:
mask = fit.support_

new_features = [] # The list of your K best features

for bool, feature in zip(mask, X.columns):

    if bool:

        new_features.append(feature)
new_features

['Suspension Type - Rear_Axle',
 'Suspension Type - Rear_Coil Spring',
 'Suspension Type - Rear_Double Wishbone',
 'Suspension Type - Rear_Independent',
 'Suspension Type - Rear_Leaf type',
 'Suspension Type - Rear_Link type',
 'Suspension Type - Rear_MacPherson Strut',
 'Suspension Type - Rear_Others',
 'Suspension Type - Rear_Torsion Bar',
 'Manufacturer_Fiat',
 'Manufacturer_Hyundai',
 'Manufacturer_Kia',
 'Manufacturer_Lotus',
 'Manufacturer_Maserati',
 'Manufacturer_Mini',
 'Manufacturer_Mitsubishi',
 'Manufacturer_Porsche',
 'Manufacturer_Smart',
 'Passenger Capacity_15',
 'Parking Aid_1']

In [318]:
n_components = 10

pca = PCA(n_components=n_components)
pca.fit(X_train)

X_train_reduced = pd.DataFrame(pca.transform(X_train), index=X_train.index, columns=["PC" + str(i) for i in range(1, n_components + 1)])
X_test_reduced = pd.DataFrame(pca.transform(X_test), index=X_test.index, columns=["PC" + str(i) for i in range(1, n_components + 1)])
print(pca.explained_variance_ratio_.sum())
print(pca.noise_variance_)

0.35439677251509166
0.7241617018637871


In [319]:
loadings = pd.DataFrame(pca.components_, columns=X_train.columns, index = ["PC" + str(i) for i in range(1, n_components + 1)])
#loadings

In [320]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor()
}

#for name, model in models.items():
#    model.fit(X_train_reduced, y_train)
#    print(name + " trained.")

In [321]:
#for name, model in models.items():
#    print(name + " R^2 Score: {:.5f}".format(model.score(X_test_reduced, y_test)))