In [1]:
#libraries
#the data set can be downloaded from this website 
#"https://www.kaggle.com/ahmedsmara/advanced-automobile-regression/data?select=imports-85.csv"
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### EXPLORING DATA

In [2]:
df=pd.read_csv("Automobile.csv")
df[0:5]

Unnamed: 0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,...,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [3]:
df.columns

Index(['3', '?', 'alfa-romero', 'gas', 'std', 'two', 'convertible', 'rwd',
       'front', '88.60', '168.80', '64.10', '48.80', '2548', 'dohc', 'four',
       '130', 'mpfi', '3.47', '2.68', '9.00', '111', '5000', '21', '27',
       '13495'],
      dtype='object')

In [4]:
df.gas.isnull().any()

False

In [5]:
df.isnull().any()

3              False
?              False
alfa-romero    False
gas            False
std            False
two            False
convertible    False
rwd            False
front          False
88.60          False
168.80         False
64.10          False
48.80          False
2548           False
dohc           False
four           False
130            False
mpfi           False
3.47           False
2.68           False
9.00           False
111            False
5000           False
21             False
27             False
13495          False
dtype: bool

### PREPROCESSING

In [6]:
col=['symboling','normalized_losses','make','fuel_type','aspiration','num_of_doors','body_style','drive_wheels','engine_locatio','wheel_base'
     ,'length','width','height','curb_weight','engine_type','num_of_cylinders','engine_size','fuel_system'
     ,'bore','stroke','compression_ratio','horsepower','peak_rpm','city_mpg','highway_mpg','price']


In [7]:
df=pd.read_csv("Automobile.csv",header=None,na_values="?",names=col)
df[0:5]

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_locatio,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [8]:
df.drop(['make','symboling','normalized_losses'],axis=1,inplace=True)
df.head()

Unnamed: 0,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_locatio,wheel_base,length,width,height,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [9]:
df[df.num_of_doors.isnull()]

Unnamed: 0,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_locatio,wheel_base,length,width,height,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
27,gas,turbo,,sedan,fwd,front,93.7,157.3,63.8,50.6,...,98,mpfi,3.03,3.39,7.6,102.0,5500.0,24,30,8558.0
63,diesel,std,,sedan,fwd,front,98.8,177.8,66.5,55.5,...,122,idi,3.39,3.39,22.7,64.0,4650.0,36,42,10795.0


In [10]:
df.num_of_doors[df.body_style=='sedan'].value_counts()

four    79
two     15
Name: num_of_doors, dtype: int64

In [11]:
df.loc[27,'num_of_doors']='four'
df.loc[63,'num_of_doors']='four'

In [12]:
df.isnull().any()

fuel_type            False
aspiration           False
num_of_doors         False
body_style           False
drive_wheels         False
engine_locatio       False
wheel_base           False
length               False
width                False
height               False
curb_weight          False
engine_type          False
num_of_cylinders     False
engine_size          False
fuel_system          False
bore                  True
stroke                True
compression_ratio    False
horsepower            True
peak_rpm              True
city_mpg             False
highway_mpg          False
price                 True
dtype: bool

In [13]:
df[df.bore.isnull()]

Unnamed: 0,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_locatio,wheel_base,length,width,height,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
55,gas,std,two,hatchback,rwd,front,95.3,169.0,65.7,49.6,...,70,4bbl,,,9.4,101.0,6000.0,17,23,10945.0
56,gas,std,two,hatchback,rwd,front,95.3,169.0,65.7,49.6,...,70,4bbl,,,9.4,101.0,6000.0,17,23,11845.0
57,gas,std,two,hatchback,rwd,front,95.3,169.0,65.7,49.6,...,70,4bbl,,,9.4,101.0,6000.0,17,23,13645.0
58,gas,std,two,hatchback,rwd,front,95.3,169.0,65.7,49.6,...,80,mpfi,,,9.4,135.0,6000.0,16,23,15645.0


In [14]:
df.bore.fillna(df.bore.mean(),inplace=True)

In [15]:
df.stroke.fillna(df.stroke.mean(),inplace=True)
df.horsepower.fillna(df.horsepower.mean(),inplace=True)
df.peak_rpm.fillna(df.peak_rpm.mean(),inplace=True)

In [16]:
df.drop(df[df.price.isnull()].index,axis=0,inplace=True)

In [17]:
df.isnull().any()

fuel_type            False
aspiration           False
num_of_doors         False
body_style           False
drive_wheels         False
engine_locatio       False
wheel_base           False
length               False
width                False
height               False
curb_weight          False
engine_type          False
num_of_cylinders     False
engine_size          False
fuel_system          False
bore                 False
stroke               False
compression_ratio    False
horsepower           False
peak_rpm             False
city_mpg             False
highway_mpg          False
price                False
dtype: bool

In [18]:
df.num_of_cylinders.value_counts()

four      157
six        24
five       10
eight       4
two         4
three       1
twelve      1
Name: num_of_cylinders, dtype: int64

In [19]:
df.loc[df.num_of_cylinders=='two','num_of_cylinders']=2
df.loc[df.num_of_cylinders=='four','num_of_cylinders']=4
df.loc[df.num_of_cylinders=='five','num_of_cylinders']=5
df.loc[df.num_of_cylinders=='six','num_of_cylinders']=6
df.loc[df.num_of_cylinders=='eight','num_of_cylinders']=8
df.loc[df.num_of_cylinders=='twelve','num_of_cylinders']=12
df.loc[df.num_of_cylinders=='three','num_of_cylinders']=3


In [20]:
df[0:5]

Unnamed: 0,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_locatio,wheel_base,length,width,height,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [21]:
df.fuel_type.value_counts()

gas       181
diesel     20
Name: fuel_type, dtype: int64

In [22]:
pd.set_option('display.max_columns', None)

In [23]:
df.num_of_doors.value_counts()

four    115
two      86
Name: num_of_doors, dtype: int64

In [24]:
df.dtypes

fuel_type             object
aspiration            object
num_of_doors          object
body_style            object
drive_wheels          object
engine_locatio        object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_of_cylinders       int64
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [25]:
col=['fuel_type', 'aspiration', 'num_of_doors', 'body_style', 'drive_wheels',
       'engine_locatio', 'engine_type','fuel_system']
df=pd.get_dummies(df,columns=col,drop_first=True)

In [26]:
df[0:5]

Unnamed: 0,wheel_base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price,fuel_type_gas,aspiration_turbo,num_of_doors_two,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_fwd,drive_wheels_rwd,engine_locatio_rear,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_rotor,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,94.5,171.2,65.5,52.4,2823,6,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
3,99.8,176.6,66.2,54.3,2337,4,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0
4,99.4,176.6,66.4,54.3,2824,5,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


### training

In [27]:
x_data=df.drop('price',axis=1)
y_data=df['price']

In [28]:
X_train,X_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.2,random_state=1)

In [29]:
lre=LinearRegression()

In [30]:
lre.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
yhat=lre.predict(X_test)

### ACCURACY

In [32]:
from sklearn.metrics import r2_score

In [51]:
accuracy=r2_score(y_test,yhat)*100
print("TEST ACCURACY ", accuracy)

TEST ACCURACY  86.65904289543191


In [53]:
train_accu=r2_score(y_train,lre.predict(X_train))
print("TRAIN ACCURACY " , train_accu)

TRAIN ACCURACY  0.9288328123786973


In [54]:
X1_train,X1_test,y1_train,y1_test=train_test_split(x_data,y_data,test_size=0.15,random_state=1)
lre1=LinearRegression()
lre1.fit(X1_train,y1_train)
yhat1=lre.predict(X1_test)
accuracy1=r2_score(y1_test,yhat1)*100
print("TEST ACCURACY"  ,  accuracy1)

TEST ACCURACY 90.00635977903364


In [55]:
train_accu1=r2_score(y1_train,lre1.predict(X1_train))
print("TRAIN ACCURACY " , train_accu1)

TRAIN ACCURACY  0.9228057841073096
