In [1]:
import pandas as pd
from pandas import DataFrame as df
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = pd.read_csv('Auto MPG.csv')

In [3]:
dataset.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,26.0,4,97.0,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan
1,26.0,4,97.0,46,1950,21.0,73,2,volkswagen super beetle
2,43.1,4,90.0,48,1985,21.5,78,2,volkswagen rabbit custom diesel
3,44.3,4,90.0,48,2085,21.7,80,2,vw rabbit c (diesel)
4,43.4,4,90.0,48,2335,23.7,80,2,vw dasher (diesel)


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
mpg             397 non-null float64
cylinders       397 non-null int64
displacement    397 non-null float64
horsepower      397 non-null object
weight          397 non-null int64
acceleration    397 non-null float64
year            397 non-null int64
origin          397 non-null int64
name            397 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


In [5]:
dataset.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [6]:
dataset.horsepower[dataset.horsepower=='?']= None

In [7]:
dataset.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

In [8]:
dataset = dataset.dropna()

In [9]:
dataset.drop('name', axis=1, inplace=True)

In [10]:
numeric_features = dataset.select_dtypes(include=[np.number])
numeric_features.dtypes
corr =numeric_features.corr()

print(corr['mpg'].sort_values(ascending=False))

mpg             1.000000
year            0.580541
origin          0.565209
acceleration    0.423329
cylinders      -0.777618
displacement   -0.805127
weight         -0.832244
Name: mpg, dtype: float64


In [11]:
column = dataset.columns
x = dataset[column]
x.drop('mpg', axis=1,inplace=True)
y = dataset['mpg']

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,
                    test_size=0.2,random_state=42)

In [13]:
x_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
258,6,258.0,110,2962,13.5,71,1
182,4,151.0,90,2950,17.3,82,1
172,6,232.0,90,3085,17.6,76,1
63,4,98.0,70,2125,17.3,82,1
340,8,318.0,150,4457,13.5,74,1


In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,2))
x_train = df(scaler.fit_transform(x_train))
x_test = df(scaler.transform(x_test))

In [15]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.2,0.976623,0.695652,0.764956,0.613497,0.166667,0.0
1,0.4,0.420779,0.478261,0.758151,1.079755,2.0,0.0
2,1.2,0.841558,0.478261,0.834704,1.116564,1.0,0.0
3,0.4,0.145455,0.26087,0.290332,1.079755,2.0,0.0
4,2.0,1.288312,1.130435,1.612702,0.613497,0.666667,0.0


In [16]:
from sklearn.linear_model import LinearRegression 
lr = LinearRegression(normalize=True)
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [17]:
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = lr.predict(x_test)
print('MSE : ',mean_absolute_error(y_test, y_pred))
print('R2 : ',r2_score(y_test, y_pred))

MSE :  2.9187002446972525
R2 :  0.7684826732587055


In [18]:
from sklearn.linear_model import Ridge
rr = Ridge(normalize=True, random_state=42)
rr.fit(x_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=42, solver='auto', tol=0.001)

In [19]:
y_pred_r = rr.predict(x_test)
print('MSE : ',mean_absolute_error(y_test, y_pred_r))
print('R2 : ',r2_score(y_test, y_pred_r))

MSE :  3.2936806893755883
R2 :  0.6936005757669431


In [20]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [21]:
y_pred_f = rf.predict(x_test)
print('MSE : ',mean_absolute_error(y_test, y_pred_f))
print('R2 : ',r2_score(y_test, y_pred_f))

MSE :  2.2367088607594936
R2 :  0.8259358253738212


In [22]:
d_range = range(1, 100)
dscores = []

for k in d_range:
    dt = RandomForestRegressor(n_estimators=35, max_depth=k, random_state=42)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    dscores.append(r2_score(y_test, y_pred))
max = np.array(dscores).max()
m=[]
m = df(dscores)
m[1] = d_range
print('Best Parameter: ')
print(m[m[0] == max].min())

Best Parameter: 
0    0.845862
1    7.000000
dtype: float64


In [23]:
rf = RandomForestRegressor(random_state=42, n_estimators=35, max_depth=7)
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=35,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [24]:
y_pred_rf = rf.predict(x_test)
df({'Predicted':y_pred_rf,'Actual':y_test}).head(10)

Unnamed: 0,Predicted,Actual
78,18.05034,15.0
274,24.612961,21.5
246,17.476967,18.0
55,35.708092,31.0
387,13.623194,14.0
203,20.030366,20.5
42,30.683433,31.0
233,17.862667,15.0
150,18.345211,18.0
116,32.658182,35.7
