In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## MPG Cylinders

In [2]:
# Load the 'mpg' example dataset through seaborn into a DataFrame.
df = sns.load_dataset(name="mpg")

In [3]:
# Preview the first five rows to check the columns and their values.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
# Discard every row that has at least one missing value. The result is
# rebound to `df` instead of mutating the frame in place.
df = df.dropna()

In [5]:
# Summarise row count, per-column non-null counts and dtypes after the
# missing-value rows were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null int64
acceleration    392 non-null float64
model_year      392 non-null int64
origin          392 non-null object
name            392 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB


## Modelling


### Feature Engineering

In [6]:
# One-hot encode the categorical 'origin' column. drop_first=True leaves out
# the first category's indicator, so that category is implied when all the
# remaining indicators are 0.
origin_values = df['origin']
df_one_hot = pd.get_dummies(origin_values, drop_first=True)

In [7]:
# Attach the indicator columns to the right of the existing columns, aligned
# on the row index. NOTE: `df` is rebuilt from itself, so running this cell a
# second time appends the indicator columns again.
df = pd.concat(
    [df, df_one_hot],
    axis="columns",
)

In [8]:
# Remove the two object-typed columns: 'origin' has been replaced by its
# indicator columns, and 'name' is not used as a feature.
df = df.drop(columns=['origin', 'name'])

### Target and Features

In [9]:
# Features: every column except the target.
x = df.drop(columns=['mpg'])
# Target: the 'mpg' column.
y = df['mpg']

### Split into Train and Test Data

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [12]:
# Ordinary least-squares regressor with an intercept term (the default).
lr = LinearRegression(fit_intercept=True)

In [13]:
# Fit the regressor on the training split only; the cell displays the
# fitted estimator that fit() returns.
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

### Evaluating the Model

In [14]:
from sklearn import metrics

In [15]:
# Predict mpg for the held-out rows with the fitted model.
prediction = lr.predict(X=x_test)

In [16]:
# Compare the held-out targets with the predictions. Each line prints a
# label immediately followed by the metric value (sep='' avoids an extra
# space between the two).
print('MAE : ', metrics.mean_absolute_error(y_test, prediction), sep='')
print('MSE : ', metrics.mean_squared_error(y_test, prediction), sep='')
# RMSE is the square root of the MSE of the same target/prediction pair.
print('RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, prediction)), sep='')
print('R2 Score : ', metrics.r2_score(y_test, prediction), sep='')

MAE : 2.677653847546169
MSE : 11.329545943381232
RMSE : 3.3659390878893265
R2 Score : 0.792128421393254


### Overfitting and Underfitting