In [1]:
import pandas as pd

In [2]:
auto = pd.read_csv('dataset/clean_car.csv')
auto.drop(['origin'], inplace = True, axis=1)
auto.drop(['car name'], inplace = True, axis=1)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
0,18.0,8,307,130,3504,12.0,70
1,18.0,8,318,150,3436,11.0,70
2,16.0,8,304,150,3433,12.0,70
3,17.0,8,302,140,3449,10.5,70
4,15.0,8,429,198,4341,10.0,70


In [3]:
auto.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,23.523037,5.463351,193.560209,104.091623,2975.874346,15.593455,76.10733
std,7.846198,1.706085,104.326412,38.362882,848.963973,2.747996,3.638739
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.125,4.0,105.0,75.0,2220.75,13.9,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,265.75,125.0,3612.0,17.2,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [4]:
# standardizing values

from sklearn import preprocessing

auto[['cylinders']] = preprocessing.scale(auto[['cylinders']].astype('float64'))
auto[['displacement']] = preprocessing.scale(auto[['displacement']].astype('float64'))
auto[['horsepower']] = preprocessing.scale(auto[['horsepower']].astype('float64'))
auto[['weight']] = preprocessing.scale(auto[['weight']].astype('float64'))
auto[['acceleration']] = preprocessing.scale(auto[['acceleration']].astype('float64'))
auto[['model year']] = preprocessing.scale(auto[['model year']].astype('float64'))

In [5]:
auto.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,23.523037,1.488048e-16,-7.440238e-17,1.302042e-16,-1.488048e-16,2.604083e-16,1.26484e-15
std,7.846198,1.001311,1.001311,1.001311,1.001311,1.001311,1.001311
min,9.0,-1.445755,-1.205111,-1.516252,-1.607444,-2.766895,-1.680621
25%,17.125,-0.8588492,-0.8499895,-0.7593219,-0.8906322,-0.6170594,-0.8550779
50%,23.0,-0.8588492,-0.4324821,-0.2764525,-0.2033071,-0.03405321,-0.02953513
75%,29.0,1.488774,0.6928683,0.5457306,0.7502791,0.5853909,0.7960077
max,46.6,1.488774,2.509265,3.286341,2.55248,3.35467,1.62155


In [6]:
auto.shape

(382, 7)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [8]:
X = auto.drop(['mpg'], axis=1)
Y = auto['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)

In [9]:
linear_model = LinearRegression(normalize = True).fit(x_train, y_train)

In [10]:
print('training score is ', linear_model.score(x_train, y_train))

training score is  0.8132028552978101


In [11]:
y_pred = linear_model.predict(x_test)
print("testing score is", r2_score(y_test, y_pred))

testing score is 0.7891319894249172


In [12]:
def adjusted_r2(r_square, labels, features):
    
    adj_r = 1-((1-r_square)*(len(labels)-1))/ (len(labels)-features.shape[1]-1)
    
    return adj_r

In [13]:
print("adjusted r2 score is ", adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

adjusted r2 score is  0.7710575885184816


In [14]:
fe_corr = X.corr()
fe_corr

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year
cylinders,1.0,0.95008,0.84321,0.897583,-0.500514,-0.347108
displacement,0.95008,1.0,0.897861,0.933813,-0.537683,-0.369146
horsepower,0.84321,0.897861,1.0,0.866438,-0.68231,-0.415697
weight,0.897583,0.933813,0.866438,1.0,-0.413322,-0.315549
acceleration,-0.500514,-0.537683,-0.68231,-0.413322,1.0,0.271797
model year,-0.347108,-0.369146,-0.415697,-0.315549,0.271797,1.0


In [15]:
abs(fe_corr)>0.8

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year
cylinders,True,True,True,True,False,False
displacement,True,True,True,True,False,False
horsepower,True,True,True,True,False,False
weight,True,True,True,True,False,False
acceleration,False,False,False,False,True,False
model year,False,False,False,False,False,True


In [16]:
trimmed_x = X.drop(['cylinders', 'displacement', 'weight'], axis=1)
trim_x_fe_corr = trimmed_x.corr()
trim_x_fe_corr


Unnamed: 0,horsepower,acceleration,model year
horsepower,1.0,-0.68231,-0.415697
acceleration,-0.68231,1.0,0.271797
model year,-0.415697,0.271797,1.0


In [17]:
abs(trim_x_fe_corr) > 0.8

Unnamed: 0,horsepower,acceleration,model year
horsepower,True,False,False
acceleration,False,True,False
model year,False,False,True


In [18]:
#using variation inflation factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [19]:
vif = pd.DataFrame()
vif["VIF Factor"]= [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [20]:
vif["features"] = X.columns
vif.round(2)

Unnamed: 0,VIF Factor,features
0,10.49,cylinders
1,19.46,displacement
2,9.4,horsepower
3,10.78,weight
4,2.57,acceleration
5,1.24,model year


In [21]:
X = X.drop(['displacement', 'weight'], axis=1)
vif = pd.DataFrame()
vif["VIF Factor"]= [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif.round(2)

Unnamed: 0,VIF Factor,features
0,3.59,cylinders
1,5.28,horsepower
2,1.94,acceleration
3,1.21,model year


In [22]:
X = auto.drop(['mpg','displacement', 'weight'], axis=1)
Y = auto['mpg']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)
linear_model = LinearRegression(normalize = True).fit(x_train, y_train)
print('training score is ', linear_model.score(x_train, y_train))
y_pred = linear_model.predict(x_test)
print("testing score is", r2_score(y_test, y_pred))
print("adjusted r2 score is ", adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))


training score is  0.7337724218557966
testing score is 0.8090916187725985
adjusted r2 score is  0.7984855975932984
