In [10]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression


In [11]:
df= pd.read_csv('autos_dataset.csv')
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [13]:
df.replace({'?':np.nan},inplace=True)

df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [14]:
# converting all object datatype to float which include 'NaN values'

df['normalized-losses']=df['normalized-losses'].astype(float)
df['num-of-doors'].replace({'two':2,'four':4}, inplace=True)
df['bore']=df['bore'].astype(float)
df['stroke']=df['stroke'].astype(float)
df['horsepower']=df['horsepower'].astype(float)
df['peak-rpm']=df['peak-rpm'].astype(float)
df['price']=df['price'].astype(float)

In [15]:
# Filling NaN values and changing datatype to int

df['normalized-losses']=df['normalized-losses'].fillna(df['normalized-losses'].median()).astype(int)
df['num-of-doors']=df['num-of-doors'].fillna(df['num-of-doors'].median()).astype(int)
df['bore']=df['bore'].fillna(df['bore'].median())
df['stroke']=df['stroke'].fillna(df['stroke'].median())
df['horsepower']=df['horsepower'].fillna(df['horsepower'].median()).astype(int)
df['peak-rpm']=df['peak-rpm'].fillna(df['peak-rpm'].median()).astype(int)
df['price']=df['price'].fillna(df['price'].median()).astype(int)

In [16]:
# One Hot Encoding

df=pd.get_dummies(data=df, columns=['fuel-type','aspiration','num-of-doors','body-style','drive-wheels',
                     'engine-location','engine-type','num-of-cylinders','fuel-system'])

In [17]:
df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,115,115,115,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
wheel-base,88.6,88.6,94.5,99.8,99.4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
curb-weight,2548,2548,2823,2337,2824
engine-size,130,130,152,109,136
bore,3.47,3.47,2.68,3.19,3.19


In [22]:
x=df.drop(['price','make'],axis=1)
y=df['price']

In [23]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=101)

In [24]:
lm=LinearRegression()

In [25]:
lm.fit(x_train,y_train)

LinearRegression()

In [28]:
pred=lm.predict(x_test)
pred[10:15]

array([11776.08791619, 16473.97639343,  6667.49692173,  5550.06643043,
       10860.98648248])

In [29]:
y_test[10:15]

145    11259
65     18280
78      6669
152     6488
149    11694
Name: price, dtype: int32

In [31]:
#Evaluation on Test-Prediction Data

mse=metrics.mean_squared_error(y_test,pred)
rmse=np.sqrt(mse)
r2=metrics.r2_score(y_test,pred)
print('MSE:',mse,'\nRMSE:',rmse,'\nR2 Score:',r2)

MSE: 7314510.101409043 
RMSE: 2704.5350989419685 
R2 Score: 0.781684961559015


In [32]:
#Evaluation on Train-Prediction Data

train_pred=lm.predict(x_train)
mse=metrics.mean_squared_error(y_train,train_pred)
rmse=np.sqrt(mse)
r2=metrics.r2_score(y_train,train_pred)
print('MSE:',mse,'\nRMSE:',rmse,'\nR2 Score:',r2)

MSE: 4091761.877306736 
RMSE: 2022.8103908440692 
R2 Score: 0.9390391560103913


In [36]:
x.columns

Index(['symboling', 'normalized-losses', 'wheel-base', 'length', 'width',
       'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'fuel-type_diesel', 'fuel-type_gas', 'aspiration_std',
       'aspiration_turbo', 'num-of-doors_2', 'num-of-doors_4',
       'body-style_convertible', 'body-style_hardtop', 'body-style_hatchback',
       'body-style_sedan', 'body-style_wagon', 'drive-wheels_4wd',
       'drive-wheels_fwd', 'drive-wheels_rwd', 'engine-location_front',
       'engine-location_rear', 'engine-type_dohc', 'engine-type_dohcv',
       'engine-type_l', 'engine-type_ohc', 'engine-type_ohcf',
       'engine-type_ohcv', 'engine-type_rotor', 'num-of-cylinders_eight',
       'num-of-cylinders_five', 'num-of-cylinders_four',
       'num-of-cylinders_six', 'num-of-cylinders_three',
       'num-of-cylinders_twelve', 'num-of-cylinders_two', 'fuel-system_1bbl',
       'fuel-system_2bbl', 'fuel-sys

In [35]:
json_data={'columns':list(x.columns)}

import json
import pickle

with open ('Json_data.json','w') as f:
    json.dump(json_data,f)

with open ('Autos_Lin_Model.pkl','wb') as f:
    pickle.dump(lm,f)

In [38]:
38-14


24

In [42]:
arr=np.zeros(10)
arr


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [45]:
arr[1,2]=1

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [44]:
arr

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])