In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("auto-mpg.csv")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   year          398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [4]:
# Seems to be issue with horsepower column. Its dtype is object. Need to delete this record
df = df[df['horsepower'] != '?']

In [5]:
df['horsepower'] = pd.to_numeric(df['horsepower'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int64  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 30.6+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['horsepower'] = pd.to_numeric(df['horsepower'])


In [6]:
X = df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']]
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1
394,4,97.0,52,2130,24.6,82,2
395,4,135.0,84,2295,11.6,82,1
396,4,120.0,79,2625,18.6,82,1


In [7]:
y = df[['mpg']]
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [8]:
# Train-test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [9]:
#Import the LR library
from sklearn.linear_model import LinearRegression
 

In [10]:
#Train the model
m1 = LinearRegression()
m1.fit(X_train, y_train)

In [11]:
m1.intercept_


array([-20.45844584])

In [12]:
m1.coef_

array([[-0.20597755,  0.01963867, -0.00161601, -0.00728288,  0.28186615,
         0.74279752,  1.38090372]])

In [13]:
# Predict on the test data
predictions = m1.predict(X_test)
predictions

array([[24.37783225],
       [29.05265074],
       [32.64731909],
       [21.38058906],
       [35.48548907],
       [28.45411854],
       [25.86605236],
       [12.73105982],
       [26.2265555 ],
       [32.38307681],
       [23.42608986],
       [31.23396896],
       [20.10984428],
       [32.18208498],
       [29.64401274],
       [14.81605147],
       [29.9272603 ],
       [32.26787362],
       [10.65804552],
       [23.07853076],
       [26.19213616],
       [18.1648049 ],
       [17.84757962],
       [15.94894931],
       [10.57917449],
       [19.75288455],
       [32.19150445],
       [18.79554932],
       [21.85050615],
       [29.04703301],
       [21.74509523],
       [24.67196122],
       [10.39679109],
       [28.46764258],
       [21.06853621],
       [14.45773381],
       [17.45135227],
       [21.65850935],
       [35.36573601],
       [25.19345377],
       [15.95879525],
       [11.77570195],
       [22.90125435],
       [22.43041065],
       [22.22142926],
       [18

In [14]:
# Merge X_test, y_test and the predictions
df1 = X_test.copy()
df1['mpg'] = y_test
df1['predicted mpg'] = predictions
df1.head(10)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg,predicted mpg
82,4,120.0,97,2506,14.5,72,3,23.0,24.377832
167,4,97.0,75,2171,16.0,75,3,29.0,29.052651
356,4,108.0,75,2350,16.8,81,3,32.4,32.647319
120,4,121.0,112,2868,15.5,73,2,19.0,21.380589
385,4,91.0,67,1995,16.2,82,3,38.0,35.485489
238,4,98.0,83,2075,15.9,77,1,33.5,28.454119
79,4,96.0,69,2189,18.0,72,2,26.0,25.866052
93,8,318.0,150,4237,14.5,73,1,14.0,12.73106
81,4,97.0,92,2288,17.0,72,3,28.0,26.226555
337,4,107.0,72,2290,17.0,80,3,32.4,32.383077


In [15]:
df1['error'] = df1['mpg'] - df1['predicted mpg']
df1['sqerror'] = df1['error']**2
df1

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg,predicted mpg,error,sqerror
82,4,120.0,97,2506,14.5,72,3,23.0,24.377832,-1.377832,1.898422
167,4,97.0,75,2171,16.0,75,3,29.0,29.052651,-0.052651,0.002772
356,4,108.0,75,2350,16.8,81,3,32.4,32.647319,-0.247319,0.061167
120,4,121.0,112,2868,15.5,73,2,19.0,21.380589,-2.380589,5.667204
385,4,91.0,67,1995,16.2,82,3,38.0,35.485489,2.514511,6.322765
...,...,...,...,...,...,...,...,...,...,...,...
60,4,140.0,90,2408,19.5,72,1,20.0,24.143163,-4.143163,17.165800
12,8,400.0,150,3761,9.5,70,1,15.0,14.170357,0.829643,0.688307
305,4,151.0,90,2670,16.0,79,1,28.4,26.664126,1.735874,3.013260
165,8,262.0,110,3221,13.5,75,1,20.0,20.299067,-0.299067,0.089441


In [16]:
MSE = np.mean(df1['sqerror'])
MSE

11.814843625934342

In [18]:
RMSE = MSE**0.5
RMSE

3.4372727017119753