In [1]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt   
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


In [2]:
# Load the raw car-mpg data from the working directory and summarise
# column dtypes / non-null counts.
mpg_df = pd.read_csv(filepath_or_buffer="car-mpg.csv")
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mpg       398 non-null    float64
 1   cyl       398 non-null    int64  
 2   disp      398 non-null    float64
 3   hp        398 non-null    object 
 4   wt        398 non-null    int64  
 5   acc       398 non-null    float64
 6   yr        398 non-null    int64  
 7   origin    398 non-null    int64  
 8   car_type  398 non-null    int64  
 9   car_name  398 non-null    object 
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB


In [3]:
# Data cleaning
# car_name is an object (string) column that is not used as a predictor below.
mpg_df = mpg_df.drop(columns='car_name')

# '?' is treated as the missing-value marker; turn it into a real NaN.
mpg_df = mpg_df.replace('?', np.nan)

# `hp` was read as object dtype, so it must be converted to numbers *before*
# the median is taken: a median over an object column of numeric strings
# raises TypeError on current pandas. to_numeric still raises on any other
# non-numeric text, matching the strictness of the former astype('float64').
mpg_df['hp'] = pd.to_numeric(mpg_df['hp']).astype('float64')

# Impute missing horsepower with the column median. Assign the result back
# instead of calling fillna(inplace=True) on the selected column, which is a
# chained in-place update that pandas deprecates and that does not modify the
# frame under Copy-on-Write.
mpg_df['hp'] = mpg_df['hp'].fillna(mpg_df['hp'].median())

In [4]:
# One-hot encode the categorical `origin` code.
# First translate the integer codes into readable labels so the generated
# dummy columns get meaningful names.
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
mpg_df['origin'] = mpg_df['origin'].replace(origin_labels)

# drop_first=True keeps k-1 indicator columns; the dropped level ('america',
# first alphabetically) becomes the implicit baseline category.
mpg_df = pd.get_dummies(data=mpg_df, columns=['origin'], drop_first=True)
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mpg            398 non-null    float64
 1   cyl            398 non-null    int64  
 2   disp           398 non-null    float64
 3   hp             398 non-null    float64
 4   wt             398 non-null    int64  
 5   acc            398 non-null    float64
 6   yr             398 non-null    int64  
 7   car_type       398 non-null    int64  
 8   origin_asia    398 non-null    uint8  
 9   origin_europe  398 non-null    uint8  
dtypes: float64(4), int64(4), uint8(2)
memory usage: 25.8 KB


# separate independent and dependent variables

In [5]:
# Predictors: every column except the target.
X = mpg_df.drop(columns='mpg')

# Target: 'mpg', selected with a list so that y stays a one-column DataFrame
# (2-D) rather than a Series.
y = mpg_df.loc[:, ['mpg']]


In [6]:
# Standardise every predictor, then wrap the result back into a DataFrame
# so the original column names are preserved.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split below, so test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [8]:
# Ordinary least squares baseline on the scaled predictors.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)
# y is a one-column DataFrame, so coef_ is 2-D (one row per target);
# row 0 holds the coefficients in the same order as the training columns.
for col_name, coef in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

Intercept:  [23.66510774]
The coefficient for cyl is 2.5059518049385003
The coefficient for disp is 2.535708286056051
The coefficient for hp is -1.788933573632526
The coefficient for wt is -5.551819873098725
The coefficient for acc is 0.11485734803440664
The coefficient for yr is 2.9318465482116074
The coefficient for car_type is 2.977869737601942
The coefficient for origin_asia is 0.8282270142957202
The coefficient for origin_europe is 0.8362781383948806


In [9]:
# R^2 of the linear model: training set first, then the held-out test set.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)


0.8343770256960538
0.8513421387780067


# Create a regularized RIDGE model and note the coefficients

In [10]:
# L2-regularised linear model; fit() returns the estimator itself, so
# construction and fitting can be chained.
ridge = Ridge(alpha=1).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 2.38873168  2.24584874 -1.77595406 -5.30121587  0.0716231   2.90348182
   2.87200551  0.8120093   0.81048147]]


In [11]:
# R^2 of the ridge model: training set first, then the held-out test set.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)

0.8342199644938272
0.8529735352611671


# Create a regularized LASSO model and note the coefficients

In [12]:
# L1-regularised linear model; fit() returns the estimator itself.
lasso = Lasso(alpha=1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe that many of the coefficients are exactly 0: the L1 penalty has
# effectively dropped those dimensions from the model.

Lasso model: [-0.         -0.         -0.05656463 -4.05009448  0.          2.03034296
  0.95988039  0.          0.        ]


In [13]:
# R^2 of the lasso model: training set first, then the held-out test set.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)

0.7821044353977104
0.8251153919895945


# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 expansion: original features, their squares and all pairwise
# products. include_bias=False omits the constant column (the linear models
# fit their own intercept).
poly = PolynomialFeatures(
    degree=2,
    include_bias=False,
)


In [None]:
# Expand the scaled predictors into polynomial features and split again.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier
# split, so every model cell below now trains on the polynomial features.
# The same test_size and random_state are used as for the first split.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

In [None]:
# The output can be labelled with the generated feature names using the following command
# (scikit-learn >= 1.0; older releases used poly.get_feature_names instead):
#poly.get_feature_names_out(X_scaled.columns)

# Fit a simple non-regularized linear model on the polynomial features

In [None]:
# Refit the existing LinearRegression estimator on the polynomial training
# features (fit() returns the estimator) and show its coefficient row.
print(regression_model.fit(X_train, y_train).coef_[0])


In [None]:
# R^2 of the current linear model: training set first, then the test set.
print(
    regression_model.score(X_train, y_train),
    regression_model.score(X_test, y_test),
    sep="\n",
)

## Ridge - modify ALPHA values to get better score

In [None]:
# Fresh ridge estimator trained on the current X_train / y_train.
# alpha is the regularisation strength to tune for a better score.
ridge = Ridge(alpha=1).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

In [None]:
# R^2 of the current ridge model: training set first, then the test set.
print(
    ridge.score(X_train, y_train),
    ridge.score(X_test, y_test),
    sep="\n",
)

## LASSO - modify ALPHA values to get better score

In [None]:
# Fresh lasso estimator trained on the current X_train / y_train.
# alpha is the regularisation strength to tune for a better score.
lasso = Lasso(alpha=1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

In [None]:
# R^2 of the current lasso model: training set first, then the test set.
print(
    lasso.score(X_train, y_train),
    lasso.score(X_test, y_test),
    sep="\n",
)