<a href="https://colab.research.google.com/github/surajkonline/R-D-AIML-workshop/blob/main/Auto_Mpg_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing numpy, pandas, matplotlib & seaborn packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the auto-mpg CSV from the workshop's GitHub repository into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/surajkonline/R-D-AIML-workshop/surajkonline-Regression_auto_mpg/auto-mpg.csv'
)

In [None]:
# Model building using scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import r2_score

In [None]:
# Keep only rows whose horsepower is not the "?" placeholder, then convert the
# column from object (string) dtype to a numeric dtype.
df = (
    df.loc[df['horsepower'] != "?"]
      .assign(horsepower=lambda frame: pd.to_numeric(frame['horsepower']))
)

In [None]:
# Remove the 'car name' column from the frame.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the column has already been dropped.
df = df.drop(columns=['car name'], errors='ignore')

In [None]:
# Single-feature setup: weight is the predictor, mpg is the target.
# Both are turned into (n_rows, 1) column arrays.
X = df['weight'].to_numpy()[:, np.newaxis]
y = df['mpg'].to_numpy()[:, np.newaxis]

In [None]:
# Scatter of mpg against weight with a second-order (order=2) regression fit overlaid
sns.lmplot(
    data=df,
    x="weight",
    y='mpg',
    order=2,
);

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
lr  # last expression of the cell: display the fitted estimator

LinearRegression()

In [None]:
# Print the fitted parameters, one per line: the intercept first, then the slope
print(lr.intercept_, lr.coef_, sep='\n')

In [None]:
# Predict the target for the held-out test rows with the fitted model
y_pred = lr.predict(X=X_test)

In [None]:
# Draw all observations in gray and overlay the model's test-set predictions
# in red, using the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(X, y, color='gray')
ax.plot(X_test, y_pred, color='red', linewidth=2)
ax.set_title('MPG Regression Model')
ax.set_xlabel('Weight')
ax.set_ylabel('Total MPG')
plt.show()

In [None]:
# Model performance: score on both splits, then error metrics on the test split
train_r2 = round(lr.score(X_train, y_train), 2)
test_r2 = round(lr.score(X_test, y_test), 2)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE

print('R square on training data: {}'.format(train_r2))
print('R square on test data: {}'.format(test_r2))
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Report skewness and kurtosis for every column (inspection only: nothing is
# modified in this cell).
features = list(df.columns)
for feature in features:
    column = df[feature]
    print(f'{feature} Skewness: {column.skew():.2f}, Kurtosis: {column.kurtosis():.2f}')

In [None]:
# Apply log1p to the selected columns, then re-print the per-column statistics.
# NOTE: df is overwritten here, so running this cell again applies log1p a
# second time to the already-transformed values.
skew_cols = ['cylinders', 'displacement', 'horsepower', 'weight']
for skewed in skew_cols:
    df[skewed] = np.log1p(df[skewed])

for feature in features:
    column = df[feature]
    print(f'{feature} skewness: {column.skew():.2f}, Kurtosis: {column.kurtosis():.2f}')

In [None]:
# Feature matrix: every column of df except the target "mpg".
# This rebinds X, which until now held only the weight column.
X = df.drop(columns=['mpg'])

In [None]:
# Standardise every feature column (centre on the mean, scale to unit variance).
# NOTE(review): the scaling statistics are computed on the full dataset, i.e.
# before the train/test split that follows - confirm this is intended.
X_scaled = preprocessing.scale(X, axis=0, with_mean=True, with_std=True)

In [None]:
# Wrap the scaled values in a DataFrame again, reusing the column labels of X
X_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)

In [None]:
# Split the scaled features and the mpg target into training and test sets
# (30% held out; fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=.3, random_state=0
)

# Fit a linear regression on all scaled features using the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the target for the held-out test split
linear_pred = model.predict(X_test)

# Score (R^2) of the model on the training split
print('Train score: {}\n'.format(model.score(X_train, y_train)))
# Score (R^2) of the model on the test split
print('Test score: {}\n'.format(model.score(X_test, y_test)))
# r2_score on the test predictions. This is the same quantity as the test
# score above, computed from the predictions: an R^2 on held-out data, not an
# "overall accuracy", so the label says what is actually measured.
print('Test R^2 (r2_score): {}\n'.format(r2_score(y_test, linear_pred)))