## Import packages

In [3]:
import pandas as pd
import numpy as np
# run this cell
import pandas.util.testing as tm
# LinearRegression()  model can be used from linear_model module
from sklearn import linear_model

# OSL() model can be used from statmodels package
import statsmodels.api as sm


# we will perform sampling using train_test_split module for training and testin set
# and we will use GridSearchCV to find the best parameters for SGDRegressor()
from sklearn.model_selection import train_test_split

# we will evaluate our models using RMSE(mean_squared_error) and Determination coefficient(r2_score)
from sklearn.metrics import mean_squared_error, r2_score

# in case you will use polinominal features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

## Load data

In [11]:
data = pd.read_csv("Lemonade_new.csv")

In [12]:
data.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Season,Revenue
0,2017-01-01,Sunday,27.0,2.0,15,0.3,10,Winter,3.0
1,2017-01-02,Monday,28.9,1.33,15,0.3,13,Winter,3.9
2,2017-01-03,Tuesday,34.5,1.33,27,0.3,15,Winter,4.5
3,2017-01-04,Wednesday,44.1,1.05,28,0.3,17,Winter,5.1
4,2017-01-05,Thursday,42.4,1.0,33,0.3,18,Winter,5.4


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         365 non-null    object 
 1   Day          365 non-null    object 
 2   Temperature  365 non-null    float64
 3   Rainfall     365 non-null    float64
 4   Flyers       365 non-null    int64  
 5   Price        365 non-null    float64
 6   Sales        365 non-null    int64  
 7   Season       365 non-null    object 
 8   Revenue      365 non-null    float64
dtypes: float64(4), int64(2), object(3)
memory usage: 25.8+ KB


In [15]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Temperature,365.0,60.731233,16.196266,15.1,49.7,61.1,71.3,102.9
Rainfall,365.0,0.826603,0.273171,0.47,0.65,0.74,0.91,2.5
Flyers,365.0,40.284932,13.178651,9.0,31.0,39.0,49.0,80.0
Price,365.0,0.333973,0.075206,0.3,0.3,0.3,0.3,0.5
Sales,365.0,25.323288,6.893589,7.0,20.0,25.0,30.0,43.0
Revenue,365.0,8.722466,4.017536,2.1,6.0,7.5,9.3,21.5


## Divide data into train and test sets

In [17]:
X = data[['Temperature', 'Rainfall', 'Flyers', 'Price']]
y = data[['Sales']]

In [18]:
X.head(), y.head()

(   Temperature  Rainfall  Flyers  Price
 0         27.0      2.00      15    0.3
 1         28.9      1.33      15    0.3
 2         34.5      1.33      27    0.3
 3         44.1      1.05      28    0.3
 4         42.4      1.00      33    0.3,
    Sales
 0     10
 1     13
 2     15
 3     17
 4     18)

In [19]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 17)

In [20]:
x_train.shape, y_train.shape

((255, 4), (255, 1))

In [21]:
x_test.shape, y_test.shape

((110, 4), (110, 1))

## Train the model

In [23]:
OLS_model = sm.OLS(y_train, x_train).fit()

In [24]:
OLS_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared (uncentered):,0.999
Model:,OLS,Adj. R-squared (uncentered):,0.999
Method:,Least Squares,F-statistic:,48980.0
Date:,"Mon, 19 Apr 2021",Prob (F-statistic):,0.0
Time:,15:16:11,Log-Likelihood:,-342.96
No. Observations:,255,AIC:,693.9
Df Residuals:,251,BIC:,708.1
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Temperature,0.3946,0.007,58.041,0.000,0.381,0.408
Rainfall,-0.5431,0.165,-3.288,0.001,-0.868,-0.218
Flyers,0.0202,0.007,2.700,0.007,0.005,0.035
Price,2.8773,0.965,2.983,0.003,0.978,4.777

0,1,2,3
Omnibus:,63.338,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12.157
Skew:,0.035,Prob(JB):,0.00229
Kurtosis:,1.933,Cond. No.,1240.0


In [25]:
LR_model = linear_model.LinearRegression()

In [26]:
LR_model.fit(x_train, y_train)

LinearRegression()

In [27]:
LR_model.coef_

array([[ 0.35393314, -3.09002359,  0.01807888,  3.08526455]])

In [28]:
LR_model.intercept_

array([4.59888881])

In [30]:
y_pred = LR_model.predict(x_test)

In [32]:
data.corr()

Unnamed: 0,Temperature,Rainfall,Flyers,Price,Sales,Revenue
Temperature,1.0,-0.902859,0.797719,0.504709,0.989832,0.839314
Rainfall,-0.902859,1.0,-0.747389,-0.383775,-0.909214,-0.712513
Flyers,0.797719,-0.747389,1.0,0.405437,0.805183,0.6787
Price,0.504709,-0.383775,0.405437,1.0,0.512908,0.884179
Sales,0.989832,-0.909214,0.805183,0.512908,1.0,0.849568
Revenue,0.839314,-0.712513,0.6787,0.884179,0.849568,1.0


In [33]:
data["Rainfall^2"] = data.Rainfall**2