In [1]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt   
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.decomposition import PCA

In [2]:
# Load the car mileage data set; the path is relative to the notebook's
# working directory.
mpg_df = pd.read_csv("car-mpg.csv")
# Print column names, non-null counts and dtypes. In the captured output 'hp'
# shows up as object dtype rather than a numeric type.
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


In [3]:
# 'car_name' is a text column that is never used as a feature; remove it.
mpg_df = mpg_df.drop('car_name', axis=1)

# Missing values are marked with '?' in the raw data; turn them into NaN.
mpg_df = mpg_df.replace('?', np.nan)

# 'hp' is still an object (string) column at this point, so convert it to
# numbers BEFORE taking the median: recent pandas versions refuse to compute
# a median over strings. No errors='coerce' here on purpose, so any unexpected
# non-numeric text still raises instead of silently becoming NaN.
mpg_df['hp'] = pd.to_numeric(mpg_df['hp']).astype('float64')

# Impute missing horsepower with the column median. Assign the result back
# instead of calling fillna(..., inplace=True) on a column selection: that
# chained in-place form does not update the parent frame under pandas
# Copy-on-Write.
mpg_df['hp'] = mpg_df['hp'].fillna(mpg_df['hp'].median())

In [4]:
# Deliberately naive encoding: all three origin dummy columns are kept here.
# The "Iteration 2" section further down shows why one of them has to go.
mpg_df = pd.get_dummies(
    mpg_df.assign(
        origin=mpg_df['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
    ),
    columns=['origin'],
)
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cyl             398 non-null    int64  
 2   disp            398 non-null    float64
 3   hp              398 non-null    float64
 4   wt              398 non-null    int64  
 5   acc             398 non-null    float64
 6   yr              398 non-null    int64  
 7   car_type        398 non-null    int64  
 8   origin_america  398 non-null    uint8  
 9   origin_asia     398 non-null    uint8  
 10  origin_europe   398 non-null    uint8  
dtypes: float64(4), int64(4), uint8(3)
memory usage: 26.2 KB


# separate independent and dependent variables

In [5]:
# Dependent variable: 'mpg', kept as a one-column DataFrame (list selector)
# rather than a Series.
y = mpg_df.loc[:, ['mpg']]

# Independent variables: every remaining column.
X = mpg_df.drop(columns=['mpg'])



In [6]:
# Train test split
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [7]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# transform() returns a bare NumPy array, so rebuild DataFrames. Each split
# keeps its OWN column labels and row index: without index=..., the frames
# would be renumbered 0..n-1 and would no longer be label-aligned with
# y_train / y_test, which still carry the row labels produced by the split.
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# fit a simple linear model

In [8]:
# Fit a linear regression on the training split (fit() returns the estimator).
regression_model = LinearRegression().fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is 2-D here; row 0 holds one weight per feature column, in column order.
for col_name, weight in zip(X.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

Intercept:  [23.60071942]
The coefficient for cyl is 2.4658549637051106
The coefficient for disp is 2.4865323491071822
The coefficient for hp is -1.7590552182475436
The coefficient for wt is -5.543498107147843
The coefficient for acc is 0.11566636397208395
The coefficient for yr is 2.929128835283019
The coefficient for car_type is 2.9755683516309017
The coefficient for origin_america is -0.5809539682908239
The coefficient for origin_asia is 0.34798063885651054
The coefficient for origin_europe is 0.3755473767896586


In [9]:
# score() value on the training split, then on the held-out test split,
# one per line.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.8343770256960538
0.8513421387780066


## Iteration 2 - Linear regression 
### Understand Rule for Dummy Variable Regression
### Remove 1 Dummy variable


In [10]:

# Remove one of the three origin dummy columns from both splits.
# 'origin_america' is the one dropped here; dropping 'origin_europe' instead
# is the alternative that was considered.
X_train = X_train.drop(columns=['origin_america'])
X_test = X_test.drop(columns=['origin_america'])

In [11]:
# Refit on the reduced feature set (fit() returns the estimator).
regression_model = LinearRegression().fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is 2-D here; row 0 holds one weight per remaining feature column.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

Intercept:  [23.60071942]
The coefficient for cyl is 2.4658549637051084
The coefficient for disp is 2.4865323491071853
The coefficient for hp is -1.759055218247548
The coefficient for wt is -5.543498107147846
The coefficient for acc is 0.11566636397207916
The coefficient for yr is 2.92912883528302
The coefficient for car_type is 2.9755683516309004
The coefficient for origin_asia is 0.8272038690068866
The coefficient for origin_europe is 0.8302093396819834


In [12]:
# score() value on the training split, then on the held-out test split,
# one per line.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)

0.8343770256960538
0.8513421387780066


## IMPORTANT NOTE
#### The results above are identical with one dummy column removed, which shows that a categorical feature with k levels needs only k − 1 dummy variables
#### This can be achieved directly with the following code (`drop_first=True`):
**----------------------------------------------------------------------------------------------**

mpg_df = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)

**----------------------------------------------------------------------------------------------**

# PCA

In [13]:
# Fit PCA on the training features only. No n_components is given, so every
# component is kept and the full variance spectrum can be inspected.
pca = PCA()
X_train_proj = pca.fit_transform(X_train)
# Project the test features with the components learned from the training split.
X_test_proj = pca.transform(X_test)
# NOTE: despite the variable name, this holds the per-component *ratio* of
# explained variance, not the raw explained variance.
explained_variance = pca.explained_variance_ratio_
# Bare last expression so the notebook displays the array.
explained_variance

array([0.58373985, 0.14523146, 0.10539213, 0.07724226, 0.04388056,
       0.027316  , 0.00729541, 0.00708765, 0.0028147 ])

### Decide how many components to choose

In [20]:
# Refit PCA keeping 8 components. The value 8 is hard-coded; per the variance
# ratios printed by the previous PCA cell this discards only the last
# component (ratio ~0.0028).
pca = PCA(n_components=8)
X_train_proj = pca.fit_transform(X_train)
# Project the test features with the components learned from the training split.
X_test_proj = pca.transform(X_test)
# NOTE: despite the variable name, this holds the per-component *ratio* of
# explained variance, not the raw explained variance.
explained_variance = pca.explained_variance_ratio_
# Bare last expression so the notebook displays the array.
explained_variance

array([0.58373985, 0.14523146, 0.10539213, 0.07724226, 0.04388056,
       0.027316  , 0.00729541, 0.00708765])

In [21]:
# Wrap BOTH projected splits in DataFrames (default integer column labels
# 0..k-1 name the principal components). Wrapping only the training projection
# would fit the model on a DataFrame and score it on a bare ndarray; keeping
# the two containers the same avoids that mismatch.
X_train_proj = pd.DataFrame(X_train_proj)
X_test_proj = pd.DataFrame(X_test_proj)

# Linear regression on the principal components instead of the raw features.
regression_model = LinearRegression()
regression_model.fit(X_train_proj, y_train)

print("Intercept: ", regression_model.intercept_)
# coef_ is 2-D here; row 0 holds one weight per principal component.
for col_name, weight in zip(X_train_proj.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

Intercept:  [23.60071942]
The coefficient for 0 is -2.877393333401684
The coefficient for 1 is -0.9607430919968013
The coefficient for 2 is 0.5038016060763665
The coefficient for 3 is -2.1219835609352575
The coefficient for 4 is -0.010789581022519947
The coefficient for 5 is -1.5357757951067734
The coefficient for 6 is -5.581870942657736
The coefficient for 7 is 3.8519463441109245


In [22]:
# score() value on the projected training split, then on the projected test
# split, one per line.
print(
    regression_model.score(X_train_proj, y_train),
    regression_model.score(X_test_proj, y_test),
    sep="\n",
)


0.8333159173599629
0.8476989039964882
