In [None]:
import numpy as np   
import pandas as pd    
import matplotlib.pyplot as plt   
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


In [None]:
# Read the car mileage data set from a CSV file (path is relative to the
# notebook's working directory) and print a summary of the resulting frame.
DATA_FILE = "car-mpg.csv"
mpg_df = pd.read_csv(DATA_FILE)
mpg_df.info()

In [None]:
# Data cleaning
# 'car_name' is removed so that it is not used as a predictor.
mpg_df = mpg_df.drop(columns='car_name')

# '?' is treated as the missing-value marker and turned into NaN.
mpg_df = mpg_df.replace('?', np.nan)

# 'hp' must be numeric *before* its median is taken: presumably the '?'
# entries caused it to be read as strings, and recent pandas versions refuse
# to compute a median over strings. Any other unparseable value still raises
# here, just as the final float cast always did.
mpg_df['hp'] = pd.to_numeric(mpg_df['hp'])

# Impute missing horsepower with the column median. The result is assigned
# back instead of using a chained `inplace=True` call, which is deprecated and
# does not update the parent frame once copy-on-write is enabled.
mpg_df['hp'] = mpg_df['hp'].fillna(mpg_df['hp'].median()).astype('float64')

In [None]:
# Map the numeric 'origin' codes to readable labels, then one-hot encode them.
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
mpg_df['origin'] = mpg_df['origin'].replace(origin_labels)

# drop_first=True emits one indicator column fewer than there are origin
# levels; the dropped level acts as the baseline for the remaining dummies.
mpg_df = pd.get_dummies(mpg_df, columns=['origin'], drop_first=True)
mpg_df.info()

# separate independent and dependent variables

In [None]:
# Dependent variable: 'mpg'. Selecting with a list keeps y as a one-column
# DataFrame rather than a Series.
y = mpg_df[['mpg']]

# Independent variables: every remaining column.
X = mpg_df.drop(columns='mpg')


In [None]:
# Standardise every predictor and wrap the result back into a DataFrame so the
# column names are kept for later reporting.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the scaling statistics - consider fitting on
# the training rows only.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

# fit a simple linear model

In [None]:
# Ordinary least-squares fit on the scaled training predictors.
regression_model = LinearRegression().fit(X_train, y_train)

print("Intercept: ", regression_model.intercept_)

# y_train is a one-column frame, so coef_ is 2-D; row 0 holds one weight per
# predictor, in the same order as the training columns.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

In [None]:
# Report the model's score (R^2) on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regression_model.score(features, target))


# Create a regularized RIDGE model and note the coefficients

In [None]:
# Ridge regression with penalty strength alpha=1, fitted on the training split;
# the learned coefficients are printed for comparison with the plain model.
ridge = Ridge(alpha=1).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

In [None]:
# Ridge score (R^2) on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

# Create a regularized LASSO model and note the coefficients

In [None]:
# Lasso regression with penalty strength alpha=1, fitted on the training split;
# the learned coefficients are printed so zeroed-out predictors are visible.
lasso = Lasso(alpha=1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe that many of the coefficients have become 0, indicating that Lasso has dropped those dimensions from the model

In [None]:
# Lasso score (R^2) on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 polynomial expansion of the predictors; include_bias=False leaves
# out the constant column, since the linear models fit their own intercept.
poly = PolynomialFeatures(
    degree=2,
    include_bias=False,
)


In [None]:
# Expand the scaled predictors into polynomial features and redo the 70/30
# split with the same seed as before.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so every cell below
# works on the polynomial features; the earlier linear-feature split is no
# longer available under these names.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

In [None]:
# Formatting of output can be done by getting the column names using the following command
#poly.get_feature_names_out(X_scaled.columns)

# Fit a simple non-regularized linear model on the polynomial features

In [None]:
# Refit the existing LinearRegression estimator on the polynomial training
# features (this replaces its earlier fit) and print the first row of coef_.
poly_weights = regression_model.fit(X_train, y_train).coef_[0]
print(poly_weights)


In [None]:
# Score (R^2) of the refitted linear model on the polynomial training split,
# then on the polynomial test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regression_model.score(features, target))

## Ridge - modify ALPHA values to get better score

In [None]:
# Ridge regression (alpha=1) on the polynomial training features; alpha is the
# value to tune when looking for a better score.
ridge = Ridge(alpha=1).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

In [None]:
# Ridge score (R^2) on the polynomial training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

## LASSO - modify ALPHA values to get better score

In [None]:
# Lasso regression (alpha=1) on the polynomial training features; alpha is the
# value to tune when looking for a better score.
lasso = Lasso(alpha=1).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

In [None]:
# Lasso score (R^2) on the polynomial training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))