In [9]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
# Load the fuel-consumption ratings CSV (default integer index) and preview it
csv_path = "../data/fuel_consumption/Fuel_Consumption_Ratings.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Model Year,Make,Model,Vehicle Class,Engine Size,Cylinders,Transmission,Fuel,Fuel Consumption City,Fuel Consumption Hwy,Fuel Consumption,Fuel Consumption Comb_mpg,CO2 Emissions,CO2 Rating,Smog Rating
0,2022,Acura,ILX,Compact,2.4,4,AM8,Z,9.9,7.0,8.6,33,200,6,3
1,2022,Acura,MDX SH-AWD,SUV: Small,3.5,6,AS10,Z,12.6,9.4,11.2,25,263,4,5
2,2022,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,Z,11.0,8.6,9.9,29,232,5,6
3,2022,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,Z,11.3,9.1,10.3,27,242,5,6
4,2022,Acura,TLX SH-AWD,Compact,2.0,4,AS10,Z,11.2,8.0,9.8,29,230,5,7


In [20]:
# Scatter of CO2 emissions (y) against fuel consumption (x).
# Title and axis labels now describe the columns that are actually plotted;
# previously the two axis titles were swapped and the title named Engine Size.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['Fuel Consumption'], y=df['CO2 Emissions'], mode='markers'))
fig.update_layout(
    title="CO2 Emissions by Fuel Consumption",
    xaxis_title='Fuel Consumption',
    yaxis_title='CO2 Emissions',
)
fig.show()

In [74]:
# Predictors and target for the CO2 regression.
# The target is selected with a list, so y_df stays a one-column DataFrame.
feature_cols = ['Fuel Consumption', 'Engine Size', 'Cylinders']
target_cols = ['CO2 Emissions']
X_df = df[feature_cols]
y_df = df[target_cols]

In [75]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=1,
)

In [76]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so the trailing expression displays the
# same repr the original fit() call produced.
linReg = LinearRegression().fit(X_train, y_train)
linReg

LinearRegression()

In [77]:
# Show the fitted parameters. The target was a one-column frame, so the
# intercept prints as a 1-element array and coef_ has a single row.
fitted_intercept = linReg.intercept_
fitted_coefs = linReg.coef_[0]
print('Intercept: ', fitted_intercept)
print('Coefficients: ', fitted_coefs)

Intercept:  [21.96963797]
Coefficients:  [19.4271568   1.12060648  3.17912023]


In [56]:
# End points of the regression line drawn over the Fuel Consumption scatter.
# linReg is fitted on every column of X_df, so intercept + coef[0] * x on its
# own is not a model prediction: it drops the other features' contributions.
# Instead, hold the remaining features at their mean and let the model predict.
fuel = X_df['Fuel Consumption']
X_min = fuel.min()
X_max = fuel.max()

# Two rows (min and max fuel consumption), same column order as the training data
line_features = pd.DataFrame(
    {col: [X_df[col].mean()] * 2 for col in X_df.columns}
)
line_features['Fuel Consumption'] = [X_min, X_max]

# ravel() flattens the prediction whether the model was fit on a 1-D or 2-D target
y_min, y_max = np.ravel(linReg.predict(line_features))

X_lin = [X_min, X_max]
y_lin = [y_min, y_max]

In [59]:
# Data scatter with the regression line overlaid.
# x is Fuel Consumption and y is CO2 Emissions, so the axis titles and the
# figure title are set accordingly (they were previously swapped / mislabeled).
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['Fuel Consumption'], y=df['CO2 Emissions'], mode='markers', name='Data'))
fig.add_trace(go.Scatter(x=X_lin, y=y_lin, name='Reg'))
fig.update_layout(
    title="CO2 Emissions by Fuel Consumption",
    xaxis_title='Fuel Consumption',
    yaxis_title='CO2 Emissions',
)
fig.show()

In [78]:
# Root mean squared error on the training split
y_pred_train = linReg.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
np.sqrt(mse_train)

15.446625976286821

In [79]:
# Root mean squared error on the held-out test split.
# Scale reference noted by the author: y_test.mean() ~ 260.11, y_test.std() ~ 64.78
y_pred_test = linReg.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred_test)
np.sqrt(mse_test)

11.09092568138768

In [80]:
# R squared on the training split
r2_train = linReg.score(X_train, y_train)
r2_train

0.9423805674075392

In [81]:
# R squared on the held-out test split
r2_test = linReg.score(X_test, y_test)
r2_test

0.9720156170952511

In [90]:
# Pairwise correlations between the three predictors and the target
corr_cols = ['Fuel Consumption', 'Engine Size', 'Cylinders', 'CO2 Emissions']
df[corr_cols].corr()

Unnamed: 0,Fuel Consumption,Engine Size,Cylinders,CO2 Emissions
Fuel Consumption,1.0,0.815231,0.822364,0.972181
Engine Size,0.815231,1.0,0.919808,0.820121
Cylinders,0.822364,0.919808,1.0,0.833554
CO2 Emissions,0.972181,0.820121,0.833554,1.0


In [86]:
# Standardise the predictors and the target column-wise.
# StandardScaler is imported in the notebook's import cell rather than here,
# so all dependencies are visible in one place.
df1 = df[['Fuel Consumption','Engine Size','Cylinders','CO2 Emissions']]
scaling = StandardScaler()
# fit_transform learns the per-column statistics and applies them in one call
Scaled_data = scaling.fit_transform(df1)

In [89]:
# Covariance of the standardised columns, labelled with the original column
# names so the table can be compared directly with the correlation matrix.
# The diagonal is slightly above 1 because StandardScaler divides by the
# population standard deviation (ddof=0) while DataFrame.cov defaults to the
# sample estimator (ddof=1).
pd.DataFrame(Scaled_data, columns=df1.columns).cov()

Unnamed: 0,0,1,2,3
0,1.001048,0.816086,0.823226,0.9732
1,0.816086,1.001048,0.920772,0.820981
2,0.823226,0.920772,1.001048,0.834428
3,0.9732,0.820981,0.834428,1.001048


In [2]:
# Load the diabetes dataset bundled with scikit-learn and report the shape of
# its feature matrix (diabetes.data), its target vector (diabetes.target),
# and the feature names.
diabetes = datasets.load_diabetes()
for label, value in (
    ('Dataset shape:', diabetes.data.shape),
    ('Diabetes labels shape:', diabetes.target.shape),
    ('Diabetes feature names:', diabetes.feature_names),
):
    print(label, value)

Dataset shape: (442, 10)
Diabetes labels shape: (442,)
Diabetes feature names: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [3]:
# 80/20 split of the diabetes data with a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# fuel-consumption split created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Train a linear regression on the diabetes training split.
# fit() returns the estimator itself, so the trailing expression displays the
# same repr the original fit() call produced.
linReg = LinearRegression().fit(X_train, y_train)
linReg

LinearRegression()

In [5]:
# Use the trained model to predict on the held-out test split
y_preds = linReg.predict(X_test)  # predicted target values

# Evaluate with the mean squared error. mean_squared_error is already imported
# in the notebook's import cell, so it is not re-imported here.
test_mse = mean_squared_error(y_test, y_preds)
print(f'Mean squared error: {test_mse:.2f}')

Mean squared error: 2992.56


In [6]:
# Side-by-side table of the true test targets and the model's predictions
df_result = pd.DataFrame(dict(Actual=y_test, Predicted=y_preds))
df_result

Unnamed: 0,Actual,Predicted
0,78.0,119.138000
1,152.0,110.773913
2,200.0,185.037535
3,59.0,68.010043
4,311.0,171.055675
...,...,...
84,64.0,113.351181
85,107.0,111.447774
86,49.0,98.357841
87,60.0,73.104863


In [7]:
# One row per feature name, holding that feature's fitted coefficient
cdf = pd.DataFrame(
    data=linReg.coef_,
    index=diabetes.feature_names,
    columns=['Coefficients'],
)
print(cdf)

     Coefficients
age    -30.621682
sex   -272.254517
bmi    528.844443
bp     327.702690
s1    -581.014130
s2     332.962863
s3     -27.976062
s4     139.284490
s5     665.075210
s6      61.905964


In [13]:
# Predicted vs. actual target values on the diabetes test split.
# X_test is a 2-D feature matrix (one column per feature), which is not a
# valid single x axis for a scatter trace, so the predictions are plotted
# against the true targets instead.
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_preds, mode='markers', name='Predictions'))
fig.update_layout(
    title="Diabetes: predicted vs. actual",
    xaxis_title='Actual',
    yaxis_title='Predicted',
)
fig.show()