# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

#Linear Regression model

## Data preparation

In [None]:
# First example: load the advertising dataset.
# Source: https://www.kaggle.com/bumba5341/advertisingcsv
dataset_path = os.path.join('week3', 'Advertising dataset.csv')
df = pd.read_csv(dataset_path)
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Drop the 'Unnamed: 0' column from the dataframe
# (presumably a row index saved into the CSV -- confirm against df.head()).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
df.head()

In [None]:
# Show statistical values (summary statistics from DataFrame.describe)
df.describe()

In [None]:
# Display the TV column; the double brackets select it as a one-column DataFrame
df[['TV']]

In [None]:
# Distribution of the TV column: histogram first, then a density plot.
tv_column = df[['TV']]
plot_size = (10, 5)
tv_column.plot.hist(figsize=plot_size)
tv_column.plot.density(figsize=plot_size)

In [None]:
# Distribution of the Sales column: histogram first, then a density plot.
sales_column = df[['Sales']]
sales_column.plot(kind='hist', figsize=(10, 5))
sales_column.plot(kind='density', figsize=(10, 5))

In [None]:
# Plot the dataset: TV (x axis) against Sales (y axis) as red circle markers.
plt.figure(figsize=(10, 5))
plt.plot(df['TV'], df['Sales'], 'or')
# Labels are set after plotting; they apply to the same current axes.
plt.xlabel('TV')
plt.ylabel('Sales')
plt.title('TV vs Sales')
plt.show()

## Model building

In [None]:
#import machine learning library (Linear regression model)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Data preparation: split the data into training and testing sets.
# Both columns are reshaped into column vectors of shape (n_samples, 1).
x = df['TV'].to_numpy().reshape(-1, 1)
y = df['Sales'].to_numpy().reshape(-1, 1)
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the number of samples in each training and testing array
for message, split in (
    ('Size of x training:', x_train),
    ('Size of y training:', y_train),
    ('Size of x testing:', x_test),
    ('Size of y testing:', y_test),
):
    print(message, len(split))

In [None]:
# Build the linear regression model and fit it on the training set
# (fit returns the estimator itself, so construction and fitting are chained).
model_linear = LinearRegression().fit(x_train, y_train)

# Fitted line: y = theta0 + theta1*x
# intercept_ is indexed as 1-D and coef_ as 2-D, matching the 2-D target used for fitting.
theta0 = model_linear.intercept_[0]
theta1 = model_linear.coef_[0][0]
print("Theta0: ", theta0)
print("Theta1: ", theta1)
print("y = {:.4f}+{:.4f}x".format(theta0, theta1))

## Results

In [None]:
# Predict on the training set and display the raw prediction array
y_pred = model_linear.predict(x_train)
y_pred

In [None]:
# Predict on the training set
y_pred = model_linear.predict(x_train)

# Plot the linear regression line against the actual data (training set)
plt.figure(figsize=(10, 5))
plt.title('The results on training dataset (TV vs Sales)')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.plot(x_train, y_train, 'or', label='Actual data')
# Legend label spelling corrected ("prdiction" -> "prediction").
plt.plot(x_train, y_pred, '-b', label="Linear regression prediction")
plt.legend(loc="lower right")
plt.show()


In [None]:
# Model measurement on training dataset
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(actual - predicted) / actual|) * 100`` over all
    elements. Inputs are converted to float NumPy arrays first, so plain
    lists and tuples are accepted as well as arrays, and values are
    compared element-wise by position.

    Args:
        Y_actual: Observed values (array-like). A zero entry makes the
            corresponding ratio infinite or NaN, so the result is only
            meaningful when every actual value is non-zero.
        Y_Predicted: Predicted values (array-like) with a shape compatible
            with ``Y_actual``.

    Returns:
        The MAPE as a percentage (e.g. ``10.0`` means 10%).
    """
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

# Error metrics on the training set (y_pred holds the training-set predictions).
mse = metrics.mean_squared_error(y_train, y_pred)  # computed once, reused for RMSE
print("MAE = ", round(metrics.mean_absolute_error(y_train, y_pred), 4))  # mean absolute error
print("MSE = ", round(mse, 4))  # mean square error
print("RMSE = ", round(np.sqrt(mse), 4))  # root mean square error
print("MAPE = ", round(MAPE(y_train, y_pred), 4))  # mean absolute percentage error
# r2_score already is R-Square; taking its square root would report R instead
# and would produce NaN whenever the score is negative.
print("R-Square = ", round(metrics.r2_score(y_train, y_pred), 4))  # R-Square

In [None]:
# Predict on the testing set
y_pred = model_linear.predict(x_test)

# Plot the linear regression line against the actual data (testing set)
plt.figure(figsize=(10, 5))
plt.title('The results on testing dataset (TV vs Sales)')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.plot(x_test, y_test, 'or', label='Actual data')
# Legend label spelling corrected ("prdiction" -> "prediction").
plt.plot(x_test, y_pred, '-b', label="Linear regression prediction")
plt.legend(loc="lower right")
plt.show()


In [None]:
# Error metrics on the testing set (y_pred holds the testing-set predictions).
mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print("MAE = ", round(metrics.mean_absolute_error(y_test, y_pred), 4))  # mean absolute error
print("MSE = ", round(mse, 4))  # mean square error
print("RMSE = ", round(np.sqrt(mse), 4))  # root mean square error
print("MAPE = ", round(MAPE(y_test, y_pred), 4))  # mean absolute percentage error
# r2_score already is R-Square; taking its square root would report R instead
# and would produce NaN whenever the score is negative (possible on held-out data).
print("R-Square = ", round(metrics.r2_score(y_test, y_pred), 4))  # R-Square

#Multiple Linear Regression

## Data preparation

In [None]:
# Statistical analysis: seaborn pair plot of the dataframe's columns
import seaborn as sns
g = sns.pairplot(df)
# Resize the whole pair-plot figure to 8 x 8 inches
g.fig.set_size_inches(8,8)

In [None]:
#import machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Data preparation: two features (TV and Radio) and the Sales target.
feature_columns = ["TV", "Radio"]
x = df[feature_columns].to_numpy()
# Target reshaped into a column vector of shape (n_samples, 1).
y = df["Sales"].to_numpy().reshape(-1, 1)
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the number of samples in each training and testing array
for message, split in (
    ('Size of x training:', x_train),
    ('Size of y training:', y_train),
    ('Size of x testing:', x_test),
    ('Size of y testing:', y_test),
):
    print(message, len(split))

## Model building

In [None]:
# Build the multiple linear regression model and fit it on the training set
model_MultiLinear = LinearRegression()
model_MultiLinear.fit(x_train, y_train)

# y = theta0 + theta1*x1 + theta2*x2
# Theta0 is read from model_MultiLinear; the previous code printed the
# intercept of the simple model (model_linear), which did not match the
# equation printed on the last line.
print("Theta0: ", model_MultiLinear.intercept_[0])
print("Theta1: ", model_MultiLinear.coef_[0][0])
print("Theta2: ", model_MultiLinear.coef_[0][1])
print("y = {:.4f}+{:.4f}x1+{:.4f}x2".format(model_MultiLinear.intercept_[0],model_MultiLinear.coef_[0][0],model_MultiLinear.coef_[0][1]))

In [None]:
# Display the fitted coefficient array of the multiple regression model
model_MultiLinear.coef_

## Results

In [None]:
# Predict on the training set with the multiple regression model
y_pred = model_MultiLinear.predict(x_train)

In [None]:
# Model measurement on training dataset
def MAPE(Y_actual, Y_Predicted):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(actual - predicted) / actual|) * 100`` over all
    elements. Inputs are converted to float NumPy arrays first, so plain
    lists and tuples are accepted as well as arrays, and values are
    compared element-wise by position.

    Args:
        Y_actual: Observed values (array-like). A zero entry makes the
            corresponding ratio infinite or NaN, so the result is only
            meaningful when every actual value is non-zero.
        Y_Predicted: Predicted values (array-like) with a shape compatible
            with ``Y_actual``.

    Returns:
        The MAPE as a percentage (e.g. ``10.0`` means 10%).
    """
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

# Error metrics on the training set (y_pred holds the training-set predictions).
mse = metrics.mean_squared_error(y_train, y_pred)  # computed once, reused for RMSE
print("MAE = ", round(metrics.mean_absolute_error(y_train, y_pred), 4))  # mean absolute error
print("MSE = ", round(mse, 4))  # mean square error
print("RMSE = ", round(np.sqrt(mse), 4))  # root mean square error
print("MAPE = ", round(MAPE(y_train, y_pred), 4))  # mean absolute percentage error
# r2_score already is R-Square; taking its square root would report R instead
# and would produce NaN whenever the score is negative.
print("R-Square = ", round(metrics.r2_score(y_train, y_pred), 4))  # R-Square


In [None]:
# Predict on the testing set with the multiple regression model
y_pred = model_MultiLinear.predict(x_test)

In [None]:
# Error metrics on the testing set (y_pred holds the testing-set predictions).
mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print("MAE = ", round(metrics.mean_absolute_error(y_test, y_pred), 4))  # mean absolute error
print("MSE = ", round(mse, 4))  # mean square error
print("RMSE = ", round(np.sqrt(mse), 4))  # root mean square error
print("MAPE = ", round(MAPE(y_test, y_pred), 4))  # mean absolute percentage error
# r2_score already is R-Square; taking its square root would report R instead
# and would produce NaN whenever the score is negative (possible on held-out data).
print("R-Square = ", round(metrics.r2_score(y_test, y_pred), 4))  # R-Square