In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.formula.api as sm

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the delivery data from a CSV in the working directory and display it.
df = pd.read_csv('delivery_time.csv')
df

Unnamed: 0,Delivery Time,Sorting Time
0,21.0,10
1,13.5,4
2,19.75,6
3,24.0,9
4,29.0,10
5,15.35,6
6,19.0,7
7,9.5,3
8,17.9,10
9,18.75,9


In [6]:
# Replace the original headers ("Delivery Time", "Sorting Time") with
# space-free names that can be written directly in the formula strings below.
df.columns=["delivery_time","sorting_time"]
df.head(1)

Unnamed: 0,delivery_time,sorting_time
0,21.0,10


In [7]:
# (rows, columns) of the loaded frame.
df.shape

(21, 2)

In [8]:
# Delivery time against sorting time, with an OLS trendline requested from plotly.
plot = py.scatter(
    df,
    x="sorting_time",
    y="delivery_time",
    trendline='ols',
    labels={
        'sorting_time': 'Sorting Time',
        'delivery_time': 'Delivery Time',
    },
)
plot.show()

In [9]:
# Distribution plot of the sorting_time column, labelled 'Sorting Time'.
dist = ff.create_distplot([df['sorting_time']], ['Sorting Time'])
dist.show()

In [10]:
# Box plot over all columns currently in df.
df.iplot(kind='box')

In [11]:
# Bar chart over all columns currently in df.
df.iplot(kind='bar')

In [12]:
# Histogram of the sorting_time column only.
df['sorting_time'].iplot(kind='hist')

In [13]:
# Select predictor and response by column name rather than by position, so the
# split stays the same if more columns are added to df later in the notebook
# and this cell is re-run. Double brackets keep both as single-column DataFrames.
x = df[['sorting_time']]
y = df[['delivery_time']]

In [14]:
# Sanity check: first row of the predictor frame.
x.head(1)

Unnamed: 0,sorting_time
0,10


In [15]:
# Sanity check: first row of the response frame.
y.head(1)

Unnamed: 0,delivery_time
0,21.0


In [16]:
# Comparison table for the fitted models: one dict per model label.
r2socreList = []

def getRScore():
    """Return the list of model-summary dicts recorded so far.

    The list object itself is returned (not a copy), ordered by when each
    model label was first added.
    """
    return r2socreList

def addRScore(modelNum, value, intercept, mse):
    """Record the summary metrics for one model.

    Parameters
    ----------
    modelNum : label identifying the model (e.g. 'model1').
    value : number stored under the key 'rsquared'.
    intercept : number stored under the key 'intercept'.
    mse : number stored under the key 'mse'.

    All three numbers are rounded to 3 decimal places. If an entry with the
    same label already exists it is replaced in place instead of being
    appended again, so re-running a model cell does not duplicate its row.
    """
    obj = {
        'model': modelNum,
        'rsquared': round(value, 3),
        'intercept': round(intercept, 3),
        'mse': round(mse, 3)
    }
    # Upsert: keep the original position of a label that was already recorded.
    for idx, existing in enumerate(r2socreList):
        if existing['model'] == modelNum:
            r2socreList[idx] = obj
            return
    r2socreList.append(obj)

### Using OLS Model
#### OLS (y ~ x)

In [17]:
# Baseline: OLS of delivery_time on the untransformed sorting_time.
model1 = sm.ols("delivery_time~sorting_time",data=df).fit()
# predict() is called without new data, so this is presumably the in-sample error.
mse = mean_squared_error(y, model1.predict())
# NOTE(review): the third argument is the intercept's p-value (pvalues.Intercept),
# but addRScore stores it under the key 'intercept' — confirm the p-value is
# intended here rather than the coefficient (params.Intercept).
addRScore('model1', model1.rsquared, model1.pvalues.Intercept, mse)
model1.summary()

0,1,2,3
Dep. Variable:,delivery_time,R-squared:,0.682
Model:,OLS,Adj. R-squared:,0.666
Method:,Least Squares,F-statistic:,40.8
Date:,"Fri, 18 Aug 2023",Prob (F-statistic):,3.98e-06
Time:,01:10:32,Log-Likelihood:,-51.357
No. Observations:,21,AIC:,106.7
Df Residuals:,19,BIC:,108.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5827,1.722,3.823,0.001,2.979,10.186
sorting_time,1.6490,0.258,6.387,0.000,1.109,2.189

0,1,2,3
Omnibus:,3.649,Durbin-Watson:,1.248
Prob(Omnibus):,0.161,Jarque-Bera (JB):,2.086
Skew:,0.75,Prob(JB):,0.352
Kurtosis:,3.367,Cond. No.,18.3


In [18]:
# Observed data with model1's fitted values overlaid as a second marker series.
plot = py.scatter(
    df,
    x="sorting_time",
    y="delivery_time",
    labels={'sorting_time': 'Sorting Time', 'delivery_time': 'Delivery Time'},
)
# add_trace is the single-trace API (add_traces is meant for a list of traces);
# this also matches how the model4 overlay later in the notebook is built.
plot.add_trace(
    go.Scatter(
        x=df['sorting_time'],
        y=model1.predict(),
        name='Regression Fit',
        mode="markers",
    )
)
plot.show()

#### Using Log function

In [19]:
# OLS of delivery_time on the natural log of sorting_time.
model2 = sm.ols("delivery_time~np.log(sorting_time)",data=df).fit()
mse = mean_squared_error(y, model2.predict())
# NOTE(review): pvalues.Intercept (a p-value) is stored under the key
# 'intercept' — confirm this is intended rather than params.Intercept.
addRScore('model2', model2.rsquared, model2.pvalues.Intercept, mse)

#### Using Sqrt function

In [20]:
# OLS of delivery_time on the square root of sorting_time.
model3 = sm.ols("delivery_time~np.sqrt(sorting_time)",data=df).fit()
mse = mean_squared_error(y, model3.predict())
# NOTE(review): pvalues.Intercept (a p-value) is stored under the key
# 'intercept' — confirm this is intended rather than params.Intercept.
addRScore('model3', model3.rsquared, model3.pvalues.Intercept, mse)

#### Using Cbrt function

In [21]:
# OLS of delivery_time on the cube root of sorting_time.
model4 = sm.ols("delivery_time~np.cbrt(sorting_time)",data=df).fit()
mse = mean_squared_error(y, model4.predict())
# NOTE(review): pvalues.Intercept (a p-value) is stored under the key
# 'intercept' — confirm this is intended rather than params.Intercept.
addRScore('model4', model4.rsquared, model4.pvalues.Intercept, mse)

#### Using a Squared (Power) Term

In [22]:
# OLS of delivery_time on sorting_time plus its square (quadratic fit).
model5=sm.ols("delivery_time~sorting_time+pow(sorting_time,2)",data=df).fit()
mse = mean_squared_error(y, model5.predict())
# NOTE(review): pvalues.Intercept (a p-value) is stored under the key
# 'intercept' — confirm this is intended rather than params.Intercept.
addRScore('model5', model5.rsquared, model5.pvalues.Intercept, mse)

In [23]:
# Show the comparison entries collected by addRScore so far.
getRScore()

[{'model': 'model1', 'rsquared': 0.682, 'intercept': 0.001, 'mse': 7.793},
 {'model': 'model2', 'rsquared': 0.695, 'intercept': 0.642, 'mse': 7.47},
 {'model': 'model3', 'rsquared': 0.696, 'intercept': 0.411, 'mse': 7.461},
 {'model': 'model4', 'rsquared': 0.697, 'intercept': 0.015, 'mse': 7.422},
 {'model': 'model5', 'rsquared': 0.693, 'intercept': 0.408, 'mse': 7.519}]

### Model 4 (cube-root transform) looks best: highest R-squared (0.697) and lowest MSE (7.422)

In [24]:
# Full regression summary for the selected model.
model4.summary()

0,1,2,3
Dep. Variable:,delivery_time,R-squared:,0.697
Model:,OLS,Adj. R-squared:,0.681
Method:,Least Squares,F-statistic:,43.79
Date:,"Fri, 18 Aug 2023",Prob (F-statistic):,2.48e-06
Time:,01:10:32,Log-Likelihood:,-50.844
No. Observations:,21,AIC:,105.7
Df Residuals:,19,BIC:,107.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-11.5792,4.332,-2.673,0.015,-20.647,-2.511
np.cbrt(sorting_time),15.7673,2.383,6.618,0.000,10.780,20.754

0,1,2,3
Omnibus:,4.963,Durbin-Watson:,1.351
Prob(Omnibus):,0.084,Jarque-Bera (JB):,3.05
Skew:,0.896,Prob(JB):,0.218
Kurtosis:,3.525,Cond. No.,16.4


In [25]:
# Observed data with model4's fitted values overlaid as a second marker series.
plot = py.scatter(
    df,
    x="sorting_time",
    y="delivery_time",
    labels={
        'sorting_time': 'Sorting Time',
        'delivery_time': 'Delivery Time',
    },
)
plot.add_trace(
    go.Scatter(
        x=df['sorting_time'],
        y=model4.fittedvalues,
        name='Regression Fit',
        mode="markers",
    )
)
plot.show()

In [26]:
# Store model4's fitted values next to the observations. This adds a column to
# df in place, so any earlier cell that uses every column of df (the box/bar
# plots, positional iloc slices) will include 'predicted' if it is re-run.
df['predicted'] = model4.fittedvalues

In [27]:
# Confirm the new 'predicted' column is present.
df.head(1)

Unnamed: 0,delivery_time,sorting_time,predicted
0,21.0,10,22.390381


### Prediction Using New Data Point

In [28]:
# Sorting times to predict for.
newdata = pd.Series([5, 7, 9])
newdata

0    5
1    7
2    9
dtype: int64

In [29]:
# Wrap the values in a frame whose column is named 'sorting_time', the name
# used in the model formulas. newdata is an unnamed Series, so columns= here
# labels its values.
newdf = pd.DataFrame(newdata, columns = ['sorting_time'])
newdf

Unnamed: 0,sorting_time
0,5
1,7
2,9


In [30]:
# Predict with model4. Its formula applies np.cbrt to sorting_time itself, so
# newdf only needs the raw sorting_time column.
newdf['Predicted'] = model4.predict(newdf)
newdf

Unnamed: 0,sorting_time,Predicted
0,5,15.382474
1,7,18.582527
2,9,21.21807
