Step1: Importing Libraries

In [26]:
import numpy as np
import pandas as pd

Step2: Importing data    

In [27]:
# Column names for the CSV, listed in file order; the file is read with header=None.
column_names = [
    'symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
    'num_of_doors', 'bodystyle', 'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'engine_type', 'num_of_cylinders', 'engine_size', 'fuel_system',
    'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg', 'price',
]

# '?' entries in the file are parsed as NaN.
prices_df = pd.read_csv(
    'automobile.csv',
    header=None,
    names=column_names,
    na_values='?',
)

Step3: Feature Engineering

In [28]:
# Count missing values per column and show only the columns that have any.
null_counts = prices_df.isnull().sum()
null_counts[null_counts > 0]

normalized_losses    41
num_of_doors          2
bore                  4
stroke                4
horsepower            2
peak_rpm              2
price                 4
dtype: int64

---- Dropping rows where price (the target variable) is missing

In [29]:
# Keep only the rows that have a price value.
prices_df = prices_df.dropna(subset=['price'])

In [30]:
# Re-check which columns still contain missing values after removing price-less rows.
null_counts = prices_df.isnull().sum()
null_counts[null_counts > 0]

normalized_losses    37
num_of_doors          2
bore                  4
stroke                4
horsepower            2
peak_rpm              2
dtype: int64

--- Dropping normalized_losses because about 20% of its values are missing
--- Dropping the make feature: it does not make sense to use it on such a small dataset containing a wide variety of car makes
--- Dropping symboling: it is a symbolic representation of a car's safety and does not have a role in deciding price

In [31]:
# Remove the three columns that are not used as model features.
prices_df = prices_df.drop(columns=['symboling', 'normalized_losses', 'make'])

In [33]:
# Display the frame as it stands after the row and column drops above.
prices_df

Unnamed: 0,fuel_type,aspiration,num_of_doors,bodystyle,drive_wheels,engine_location,wheel_base,length,width,height,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,gas,turbo,four,sedan,rwd,front,109.1,188.8,68.8,55.5,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,gas,std,four,sedan,rwd,front,109.1,188.8,68.9,55.5,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,diesel,turbo,four,sedan,rwd,front,109.1,188.8,68.9,55.5,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


Step4: Identifying Input & Target Variables

In [35]:
# Features are every column except the last one; the target is the last column (price).
# .copy() detaches both from prices_df: Input_cols receives column assignments in later
# cells, and assigning into a bare iloc slice triggers SettingWithCopyWarning and leaves
# it unclear whether prices_df is modified as well.
Input_cols = prices_df.iloc[:, :-1].copy()
Target_col = prices_df.iloc[:, -1].copy()

--- Forming Numerical & Categorical Columns

In [41]:
# Numeric feature names. The target is removed by name rather than by position, so a
# change in column order cannot silently drop a real feature or keep price as an input.
# Index.drop raises KeyError if 'price' is unexpectedly absent.
numeric_cols = prices_df.select_dtypes(include=np.number).columns.drop('price').tolist()
# Categorical feature names: every object-dtype column.
categorical_cols = prices_df.select_dtypes(include='object').columns.tolist()

Step5: Imputing missing values

--- Replacing missing values with the mean of each affected feature: bore, stroke, horsepower, peak_rpm

In [46]:
from sklearn.impute import SimpleImputer

# Fill gaps in the numeric features with each column's mean.
# NOTE(review): the means are learned from all rows, before the train/test split,
# so rows that later land in the test set contribute to the imputation statistics.
imputer = SimpleImputer(strategy='mean')
Input_cols[numeric_cols] = imputer.fit_transform(Input_cols[numeric_cols])

In [52]:
# Show the rows whose num_of_doors value is missing.
Input_cols.loc[Input_cols['num_of_doors'].isna()]

Unnamed: 0,fuel_type,aspiration,num_of_doors,bodystyle,drive_wheels,engine_location,wheel_base,length,width,height,...,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
27,gas,turbo,,sedan,fwd,front,93.7,157.3,63.8,50.6,...,four,98.0,mpfi,3.03,3.39,7.6,102.0,5500.0,24.0,30.0
63,diesel,std,,sedan,fwd,front,98.8,177.8,66.5,55.5,...,four,122.0,idi,3.39,3.39,22.7,64.0,4650.0,36.0,42.0


In [53]:
# Pairwise correlations between the numeric columns (including price).
# The numeric columns are selected explicitly because prices_df still contains
# object columns: pandas >= 2.0 raises on them instead of silently skipping them.
prices_df.select_dtypes(include=np.number).corr()

Unnamed: 0,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
wheel_base,1.0,0.876024,0.814507,0.590742,0.782097,0.572027,0.494884,0.158502,0.250313,0.371621,-0.360593,-0.470606,-0.543304,0.584642
length,0.876024,1.0,0.85717,0.492063,0.880665,0.685025,0.610051,0.124139,0.159733,0.580309,-0.286321,-0.665192,-0.698142,0.690628
width,0.814507,0.85717,1.0,0.306002,0.866201,0.729436,0.544924,0.188829,0.189867,0.615315,-0.245975,-0.633531,-0.680635,0.751265
height,0.590742,0.492063,0.306002,1.0,0.307581,0.074694,0.185907,-0.062704,0.259737,-0.087407,-0.311574,-0.0498,-0.104812,0.135486
curb_weight,0.782097,0.880665,0.866201,0.307581,1.0,0.849072,0.644617,0.167562,0.156433,0.758063,-0.279375,-0.749543,-0.794889,0.834415
engine_size,0.572027,0.685025,0.729436,0.074694,0.849072,1.0,0.582857,0.209523,0.028889,0.822713,-0.256773,-0.650546,-0.679571,0.872335
bore,0.494884,0.610051,0.544924,0.185907,0.644617,0.582857,1.0,-0.05539,0.00125,0.568527,-0.277662,-0.592655,-0.599218,0.543436
stroke,0.158502,0.124139,0.188829,-0.062704,0.167562,0.209523,-0.05539,1.0,0.187923,0.10004,-0.0683,-0.034696,-0.035201,0.08231
compression_ratio,0.250313,0.159733,0.189867,0.259737,0.156433,0.028889,0.00125,0.187923,1.0,-0.214576,-0.436015,0.331425,0.268465,0.071107
horsepower,0.371621,0.580309,0.615315,-0.087407,0.758063,0.822713,0.568527,0.10004,-0.214576,1.0,0.107882,-0.822617,-0.804596,0.810533


Encoding Categorical Variables

In [55]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; handle_unknown='ignore' encodes categories
# not seen during fit as all zeros instead of raising.
# The dense-output keyword differs between scikit-learn releases (`sparse` was replaced
# by `sparse_output`), so the default sparse output is kept and densified explicitly.
encoder = OneHotEncoder(handle_unknown='ignore').fit(Input_cols[categorical_cols])

# get_feature_names() was removed in scikit-learn 1.2. Prefer get_feature_names_out()
# and fall back to the old method only on releases that do not provide it.
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(feature_namer(categorical_cols))

# Append the one-hot columns to the feature frame.
Input_cols[encoded_cols] = encoder.transform(Input_cols[categorical_cols]).toarray()

In [57]:
# Model matrix: the numeric features followed by the one-hot columns, as an independent copy.
Inputs = Input_cols.loc[:, numeric_cols + encoded_cols].copy()
# Regression target.
target = prices_df.loc[:, 'price']

0      13495.0
1      16500.0
2      16500.0
3      13950.0
4      17450.0
        ...   
200    16845.0
201    19045.0
202    21485.0
203    22470.0
204    22625.0
Name: price, Length: 201, dtype: float64

Splitting data to training & test set

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    Inputs,
    target,
    test_size=0.2,
    random_state=0,
)
x_train, x_test, y_train, y_test = split_parts

In [59]:
# Independent copies of the split frames, restricted to the model columns.
train_inputs = x_train.loc[:, numeric_cols + encoded_cols].copy()
test_inputs = x_test.loc[:, numeric_cols + encoded_cols].copy()
train_targets = y_train.copy()

In [60]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply the same
# transformation to the numeric columns of both splits.
scaler = StandardScaler().fit(train_inputs[numeric_cols])
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [61]:
# Show the minimum and maximum of every training feature after scaling.
summary_stats = train_inputs.describe()
summary_stats.loc[['min', 'max'], :]

Unnamed: 0,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,horsepower,...,num_of_cylinders_twelve,num_of_cylinders_two,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
min,-2.042542,-2.731825,-2.607875,-2.394867,-2.059335,-1.578442,-2.481807,-3.563465,-0.809902,-1.514523,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.767883,2.846833,2.893525,2.508252,2.767993,4.869764,1.767077,2.174294,3.61164,4.390565,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Training Linear Regression Model

In [62]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split; fit() returns the
# estimator itself, which is then shown as the cell output.
regressor = LinearRegression().fit(train_inputs, train_targets)
regressor

LinearRegression()

In [63]:
# Predict prices for both splits with the fitted linear model.
train_pred, test_pred = (
    regressor.predict(features) for features in (train_inputs, test_inputs)
)

RMSE & R2 Values

In [64]:
from sklearn.metrics import mean_squared_error, r2_score

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on every release.
# The test-set predictions already stored in test_pred are reused instead of calling
# predict() two more times.
train_rmse = np.sqrt(mean_squared_error(train_targets, train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
train_r2 = r2_score(train_targets, train_pred)
test_r2 = r2_score(y_test, test_pred)
print("train_rmse: ", train_rmse)
print("test_rmse: ", test_rmse)
print("train_r2: ", train_r2)
print("test_r2: ", test_r2)

train_rmse:  1931.2162103115368
test_rmse:  2839.4283130347726
train_r2:  0.9369143086254367
test_r2:  0.893594798870135


In [80]:
# Training residuals: actual target minus the linear model's prediction.
residual_train = train_targets.sub(train_pred)

## Applying k-Fold Cross Validation

In [67]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the linear model on the training split.
# For a regressor the per-fold score is R^2, not classification accuracy, so the metric
# is requested explicitly and the printout is labelled accordingly.
# The variable keeps the name `accuracies` so existing references still resolve.
accuracies = cross_val_score(estimator = regressor, X = train_inputs, y = train_targets, cv = 3, scoring = 'r2')
print("Mean CV R2: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 79.10 %
Standard Deviation: 4.59 %


# Training Ridge Regression

In [68]:
from sklearn.linear_model import Ridge

# Fit a ridge regression with its default settings on the training split; fit() returns
# the estimator itself, which is then shown as the cell output.
regressor_R = Ridge().fit(train_inputs, train_targets)
regressor_R

Ridge()

In [70]:
# Predict prices for both splits with the fitted ridge model.
train_pred_R, test_pred_R = (
    regressor_R.predict(features) for features in (train_inputs, test_inputs)
)

In [71]:
from sklearn.metrics import mean_squared_error, r2_score

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on every release.
# The ridge predictions already stored in test_pred_R are reused instead of calling
# predict() two more times.
train_rmse = np.sqrt(mean_squared_error(train_targets, train_pred_R))
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred_R))
train_r2 = r2_score(train_targets, train_pred_R)
test_r2 = r2_score(y_test, test_pred_R)
print("train_rmse: ", train_rmse)
print("test_rmse: ", test_rmse)
print("train_r2: ", train_r2)
print("test_r2: ", test_r2)

train_rmse:  2044.8531836240368
test_rmse:  2635.8854561513617
train_r2:  0.9292716812775341
test_r2:  0.9083032153164604


## Applying k-Fold Cross Validation

In [73]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of the ridge model on the training split.
# For a regressor the per-fold score is R^2, not classification accuracy, so the metric
# is requested explicitly and the printout is labelled accordingly.
# The variable keeps the name `accuracies` so existing references still resolve.
accuracies = cross_val_score(estimator = regressor_R, X = train_inputs, y = train_targets, cv = 3, scoring = 'r2')
print("Mean CV R2: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 84.73 %
Standard Deviation: 1.18 %
