In [None]:
# Multiple Linear Regression

In this example we will consider sales (no of items sold) based on:

- Social media advertising expense (in USD 000)

- Print media advertising expense (in USD 000)

- Website Maintenance expense (in USD 000)

- Call center average call waiting time 


Build a linear regression model to predict 'Sales' (number of items sold) using the above factors.

In [None]:
import pandas as pd
#import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# To visualise in the notebook
%matplotlib inline

In [None]:
# Load the sales dataset; the relative path means the CSV must sit in the
# notebook's working directory.
multi_df = pd.read_csv("MultiSales.csv")

Now, let's check the structure of the advertising dataset.

In [None]:
# Preview the first 5 rows to confirm the file loaded with the expected columns
multi_df.head()

In [None]:
# List the columns with their non-null counts and dtypes
multi_df.info()

In [None]:
# Shape of the DataFrame as (number of rows, number of columns)
multi_df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) using
# pandas' default `describe` behaviour.
multi_df.describe()

In [None]:
# Visualising Data Using Seaborn

In [None]:


# Alternative kept for reference only: pandas' scatter matrix.
# NOTE(review): `mpg_df_attr` is not defined anywhere in this notebook, so the
# three lines below would need `multi_df` substituted before being enabled.
#axes = pd.plotting.scatter_matrix(mpg_df_attr)
#plt.tight_layout()
#plt.savefig('d:\mpg_pairpanel.png')

# Pairwise scatter plots of the DataFrame's columns, with each column's
# own distribution on the diagonal.
sns.pairplot(multi_df)
#sns.pairplot(multi_df, diag_kind='kde')   # to plot density curve instead of histogram


In [None]:
# Correlation heatmap of the predictors and the response.
# The `id` column is excluded: the notebook drops it from the feature set
# (presumably a row identifier), so its correlations carry no modelling
# information and only clutter the matrix.
corr_matrix = multi_df.drop(columns=["id"]).corr()

# Explicit figure/axes so the title and size belong to this plot only
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Performing Multiple Linear Regression

In [None]:
# Feature matrix X: every column except the `id` column and the target `Sales`
X = multi_df.drop(columns=["id", "Sales"])
print(X.shape)

# Show the first 5 feature rows as the cell output
X.head()

In [None]:
# Response y, selected with a one-element column list so it stays a
# one-column DataFrame (2-D) rather than a Series.
y = multi_df.loc[:, ['Sales']]

# Show the first 5 target rows as the cell output
y.head()

In [None]:
## Splitting Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [None]:
# Print the shape of each partition (train features, train target,
# test features, test target) to confirm the row counts line up.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

In [None]:
## Performing Linear Regression

In [None]:
# Bring in scikit-learn's linear regression estimator
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training split in one step;
# fit() returns the estimator itself, so lr_model is the fitted model.
lr_model = LinearRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output
lr_model

In [None]:
## Coefficients Calculation

In [None]:
# Report the fitted intercept, then one coefficient per feature column.
# The model was fitted against a one-column DataFrame target, so coef_ is
# 2-D and row 0 holds the per-feature coefficients.
print("Intercept: ", lr_model.intercept_)
for col_name, coef in zip(X_train.columns, lr_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))


In [None]:
## Predictions

In [None]:
# Predict Sales for the held-out test features with the fitted model
y_pred = lr_model.predict(X_test)

In [None]:
# Residual plot: residual = actual - predicted, plotted against the prediction.
# y_pred is a 2-D array and y_test a one-column DataFrame, so both are
# flattened to 1-D first; this keeps the subtraction purely positional and
# lets the axis limits be plain scalars.
predicted = y_pred.ravel()
residuals = y_test.to_numpy().ravel() - predicted

fig, ax = plt.subplots()
ax.scatter(predicted, residuals)
ax.set_xlabel('Y Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs predicted Sales')
# Reference line at zero residual error, spanning the range of predictions
ax.hlines(y=0, xmin=predicted.min(), xmax=predicted.max(), linewidth=2)
plt.show()

In [None]:
# Model score (R^2 for a regressor) on the training and the test partition
for label, features, target in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(label)
    print(lr_model.score(features, target))

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-set mean squared error between actual and predicted Sales
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# R^2 on the test set; this duplicates lr_model.score(X_test, y_test) and is
# computed here only to demonstrate the metrics API.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the test-set metrics computed in the previous cell
print('Mean_Squared_Error :' ,mse)
print('r_square_value :',r_squared)

In [None]:
### 
### for additional study reference: 
### Code for VIF BEGIN

In [None]:
#### Define function to compute VIF value

#import statsmodels.formula.api as smf
#def calculate_vif(variables):
#    Xs = variables
#    X_column_names = variables.columns
#    for i in range(0, X_column_names.shape[0]):
#        y = Xs[X_column_names[i]] 
#        x = Xs[X_column_names.drop(X_column_names[i])]
#        r_square = smf.ols(formula="y~x", data=Xs).fit().rsquared  
#        vif = round(1 / (1 - r_square),2)
#        print (X_column_names[i], " VIF = " , vif)

In [None]:
#### Compute VIF score for each predictor variable

#calculate_vif(variables=multi_df[['Social','Print','Maint','Waiting']])

In [None]:
### 
### Code for VIF END
###

In [None]:

#extra code to view actual and predicted y in the dataframe

#temp_df = pd.concat([X_test, y_test], axis=1)
#temp_df.head()

In [None]:
#extra code to view actual and predicted y in the dataframe

#temp_df['Predicted Sales'] = y_pred
#temp_df


#### STATSMODELS
Use OLS from Statsmodels to determine which features can be dropped

In [None]:
# Another way to code. This is NOT needed. Only for reference IF INTERESTED

### import statsmodels.api as sm
### X_train_sm = X_train
#Unlike scikit-learn, the statsmodels array API (sm.OLS) does not add an intercept automatically,
#so you need to call sm.add_constant(X) to add a constant column.
### X_train_sm = sm.add_constant(X_train_sm)
### X_train_sm.shape

In [None]:
# The formula interface takes a single DataFrame, so place the training
# features and the training target side by side (column-wise concat).
data_train = pd.concat(objs=[X_train, y_train], axis="columns")

# Preview the combined training frame
data_train.head()

In [None]:
import statsmodels.formula.api as smf

# Specify the OLS model with all four predictors, then fit it
ols_spec = smf.ols(formula='Sales ~ Social+Print+Maint+Waiting', data=data_train)
lm1 = ols_spec.fit()

# Show the estimated parameters as the cell output
lm1.params

In [None]:
# Print the full statsmodels regression summary for the fitted OLS model
print(lm1.summary())

# Iteration 2


In [None]:
# Iteration 2 feature matrix: as before, but additionally leaving out `Maint`
X = multi_df.drop(columns=["id", "Sales", "Maint"])
print(X.shape)

In [None]:
# Response y as a one-column DataFrame (double brackets keep it 2-D)
y = multi_df[['Sales']]


In [None]:
# Same 70/30 split and seed as the first iteration, applied to the
# reduced feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)


In [None]:
# Print the shape of each partition (train features, train target,
# test features, test target) for the reduced feature set.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

In [None]:
# Create a fresh estimator for the reduced feature set instead of re-fitting
# the Iteration 1 object in place. This keeps the cell from mutating an
# estimator that an earlier cell already displayed, and matches Iteration 3,
# which also instantiates its own LinearRegression.
lr_model = LinearRegression()

# Fit on the Iteration 2 training split; the fitted estimator is the cell output
lr_model.fit(X_train, y_train)

In [None]:
# Report the fitted intercept, then one coefficient per remaining feature.
# coef_ is indexed at row 0 because the target was a one-column DataFrame.
print("Intercept: ", lr_model.intercept_)
for col_name, coef in zip(X_train.columns, lr_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

In [None]:
# Predict Sales for the held-out test features with the Iteration 2 model
y_pred = lr_model.predict(X_test)

In [None]:
# Residual plot: residual = actual - predicted, plotted against the prediction.
# y_pred is a 2-D array and y_test a one-column DataFrame, so both are
# flattened to 1-D first; this keeps the subtraction purely positional and
# lets the axis limits be plain scalars.
predicted = y_pred.ravel()
residuals = y_test.to_numpy().ravel() - predicted

fig, ax = plt.subplots()
ax.scatter(predicted, residuals)
ax.set_xlabel('Y Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs predicted Sales')
# Reference line at zero residual error, spanning the range of predictions
ax.hlines(y=0, xmin=predicted.min(), xmax=predicted.max(), linewidth=2)
plt.show()

In [None]:
# Model score (R^2 for a regressor) on the training and the test partition
for label, features, target in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(label)
    print(lr_model.score(features, target))

In [None]:
# This is NOT  needed. This is another way of coding for reference, IF INTERESTED

### X_train_sm = X_train
#Unlike scikit-learn, the statsmodels array API (sm.OLS) does not add an intercept automatically,
#so you need to call sm.add_constant(X) to add a constant column.
### X_train_sm = sm.add_constant(X_train_sm)
### X_train_sm.shape

In [None]:
# Rebuild the combined training frame from the Iteration 2 split so the
# formula interface sees the reduced feature set next to the target.
data_train = pd.concat(objs=[X_train, y_train], axis="columns")

# Preview the combined training frame
data_train.head()


In [None]:
# Specify the OLS model without `Maint`, then fit it
ols_spec = smf.ols(formula='Sales ~ Social+Print+Waiting', data=data_train)
lm1 = ols_spec.fit()

# Show the estimated parameters as the cell output
lm1.params

In [None]:
# Print the full statsmodels regression summary for the reduced model
print(lm1.summary())


### Iteration 3 - Let's try scaling the data using StandardScaler

In [None]:
# Iteration 3 - StandardScaler
# ----------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the scaling statistics from the training features only, then apply
# that same fitted transform to both partitions so no test-set information
# feeds into the scaling.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fresh estimator trained on the standardised training features
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Intercept first, then one coefficient per feature. Column names are taken
# from X_train because the scaled data is a plain array in the same column order.
print("Intercept: ", lr_model.intercept_)
for col_name, coef in zip(X_train.columns, lr_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coef))

In [None]:
# Predict Sales for the test set; the model was fitted on scaled features,
# so it must be given the scaled test features as well.
y_pred = lr_model.predict(X_test_scaled)

In [None]:
#plt.scatter(y_test,y_pred)
#y_cordinate = y_test
#plt.plot(y_test, y_cordinate, 'r')
#plt.xlabel('Y Test')
#plt.ylabel('Predicted Y')

In [None]:
# Model score (R^2 for a regressor) on the scaled training and test features
for label, features, target in (
    ("Train", X_train_scaled, y_train),
    ("Test", X_test_scaled, y_test),
):
    print(label)
    print(lr_model.score(features, target))

In [None]:
# Residual plot: residual = actual - predicted, plotted against the prediction.
# y_pred is a 2-D array and y_test a one-column DataFrame, so both are
# flattened to 1-D first; this keeps the subtraction purely positional and
# lets the axis limits be plain scalars.
predicted = y_pred.ravel()
residuals = y_test.to_numpy().ravel() - predicted

fig, ax = plt.subplots()
ax.scatter(predicted, residuals)
ax.set_xlabel('Y Predicted')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs predicted Sales')
# Reference line at zero residual error, spanning the range of predictions
ax.hlines(y=0, xmin=predicted.min(), xmax=predicted.max(), linewidth=2)
plt.show()