In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import plotly.express as px
import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.5f' % x)

from scipy.stats import shapiro, kstest, normaltest
from statsmodels.stats.diagnostic import lilliefors

from tqdm import tqdm
import glob
import sys  
sys.path.insert(0, '../src')

# importing user-defined functions from udf_eda.py
import udf_eda as udf
import udf_timeseries as udf_ts

In [2]:
file_names = glob.glob('../../../data/input/11_Dataset/**/ODP *.xlsx')

def Filter(string, substr):
    return [str for str in string if
             any(sub in str for sub in substr)]
      
# Driver code
substr = ['1695', '1700', '1776', '1779']
file_names = Filter(file_names, substr)
len(file_names)

4

In [3]:
df = udf.read_bind(file_names)

# if any blank columns are created by accident in a spreadsheet software, which wouldn't have any column name, we remove such columns
df = df[df.columns.drop(list(df.filter(regex='Unnamed:')))]
print("How many NaN values exist in the data: ", df.isna().sum().sum())
print("Shape of the data: ",df.shape)

100%|██████████| 4/4 [00:12<00:00,  3.19s/it]

The following batches have incompatible data:  []
# of batches read:  4
Missing batches, if any: set()
How many NaN values exist in the data:  0
Shape of the data:  (5149, 265)





In [4]:
# Extracting the Resa variable from produzione_CStOA_2021_ed12.xlsx
tdf = pd.read_excel('../../../data/input/11_Dataset/produzione_CStOA_2021_ed12.xlsx', sheet_name="dati-produzione", header=1)
tdf = tdf[['O.D.P.','Resa']]
tdf.dropna(axis=0, how='any', inplace=True)
tdf['O.D.P.'] =tdf['O.D.P.'].astype(int).astype(str).str[-4:]
tdf.columns = ['id', 'result']
tdf.result = round(tdf.result,3)
df = tdf.merge(df, how='inner')

tdf = df.groupby(['id'])['timeseries'].agg(['min', 'max']).reset_index()
tdf.columns = ['id', 'start_date', 'end_date']
tdf['processing_time_mins'] = ((tdf['end_date'] - tdf['start_date'])/pd.Timedelta(minutes = 1))+1
df = tdf.merge(df, how = 'right')
df.insert(5, 'timestamp_index', df.groupby('id').cumcount())

In [5]:
df[['id', 'start_date', 'end_date', 'processing_time_mins', 'result']].drop_duplicates().reset_index(drop =True)
df.insert(6, 'progress_perc', round(((df.timestamp_index / df.processing_time_mins)*100),1).astype(float))

In [6]:
# df = df.groupby(['id', 'start_date', 'end_date', 'processing_time_mins', 'result', 'progress_perc']
#             )[df.columns[8:].tolist()].mean().reset_index()

In [7]:
df[['id', 'processing_time_mins', 'result']].drop_duplicates()

Unnamed: 0,id,processing_time_mins,result
0,1695,1135.0,0.786
1135,1700,1251.0,0.787
2386,1776,1228.0,0.689
3614,1779,1535.0,0.69


In [8]:
df.head()

Unnamed: 0,id,start_date,end_date,processing_time_mins,result,timestamp_index,progress_perc,timeseries,101LI636,101WI610,...,108PI659,108PI662,108PI663,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1695,2021-09-01 17:25:00,2021-09-02 12:19:00,1135.0,0.786,0,0.0,2021-09-01 17:25:00,85586.5,63971.12109,...,0.14457,-0.01054,0.1108,0.15828,0.09526,0.08392,1.00374,1.76228,1.89321,0.10681
1,1695,2021-09-01 17:25:00,2021-09-02 12:19:00,1135.0,0.786,1,0.1,2021-09-01 17:26:00,85271.29688,63946.65625,...,3.95245,6.01123,3.91628,2.18765,2.54391,1.6652,2.0043,1.62186,2.1035,0.10298
2,1695,2021-09-01 17:25:00,2021-09-02 12:19:00,1135.0,0.786,2,0.2,2021-09-01 17:27:00,84996.61719,63928.17969,...,4.88801,6.95339,4.86707,2.64019,3.24634,1.87052,2.40283,2.20951,2.42574,0.10224
3,1695,2021-09-01 17:25:00,2021-09-02 12:19:00,1135.0,0.786,3,0.3,2021-09-01 17:28:00,84872.57812,63919.46875,...,5.41275,7.44928,5.39329,2.65175,3.08321,2.02452,2.47971,2.32798,2.99701,0.09551
4,1695,2021-09-01 17:25:00,2021-09-02 12:19:00,1135.0,0.786,4,0.4,2021-09-01 17:29:00,84929.36719,63936.00781,...,5.62181,7.66678,5.60771,2.68541,3.0764,2.01615,2.56464,2.32909,2.55896,0.104


In [9]:
df.shape

(5149, 271)

In [10]:
df = df.astype({'id': 'int32'})
udf.describe(df).T


Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std,dtype,size,perc_null
id,5149.00000,,,,1740.57448,1695.00000,1700.00000,1776.00000,1779.00000,1779.00000,39.97012,int32,5149,0.00000
start_date,5149,,,,2021-09-17 15:13:55.338900736,2021-09-01 17:25:00,2021-09-07 22:30:00,2021-09-26 02:32:00,2021-09-30 12:35:00,2021-09-30 12:35:00,,datetime64[ns],5149,0.00000
end_date,5149,,,,2021-09-18 12:57:31.916876800,2021-09-02 12:19:00,2021-09-08 19:20:00,2021-09-26 22:59:00,2021-10-01 14:09:00,2021-10-01 14:09:00,,datetime64[ns],5149,0.00000
processing_time_mins,5149.00000,,,,1304.60963,1135.00000,1228.00000,1251.00000,1535.00000,1535.00000,155.78145,float64,5149,0.00000
result,5149.00000,,,,0.73449,0.68900,0.69000,0.69000,0.78600,0.78700,0.04836,float64,5149,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108FI665,5149.00000,,,,1.45548,0.06364,0.07915,2.17780,2.83657,3.96615,1.35140,float64,5149,0.00000
108FI669,5149.00000,,,,1.28960,0.05680,0.07998,1.65241,2.52996,5.42102,1.19886,float64,5149,0.00000
108FI673,5149.00000,,,,0.87518,0.08774,0.10276,1.13768,1.59088,4.41289,0.77891,float64,5149,0.00000
108FI677,5149.00000,,,,0.57714,0.08607,0.10241,0.74957,0.93557,4.76523,0.52129,float64,5149,0.00000


Let's perform some statistical tests between variables of each batch, the following are a few points to make note of before selecting which tests are are applicable based on the data, keep in mind we are testing between batches:
1. The samples are independent, the reason being, the values in one sample reveal no information about those of the other sample, then the samples are independent.
2. An unpaired t-test is used to compare the mean between two independent groups. You use an unpaired t-test when you are comparing two separate groups with equal variance otherwise a Welch’s test should be used.
    - To test if the variance of two groups are equal we use an F-test. This test can be a two-tailed test or a one-tailed test. The two-tailed version tests against the alternative that the variances are not equal. 
3. 

In [11]:
fdf = df.select_dtypes(include=['int32', 'int64', 'float32', 'float64'])

In [12]:
fdf

Unnamed: 0,id,processing_time_mins,result,timestamp_index,progress_perc,101LI636,101WI610,306LI606,101AI635,101AI605,...,108PI659,108PI662,108PI663,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1695,1135.00000,0.78600,0,0.00000,85586.50000,63971.12109,0.00000,5.22612,5.47675,...,0.14457,-0.01054,0.11080,0.15828,0.09526,0.08392,1.00374,1.76228,1.89321,0.10681
1,1695,1135.00000,0.78600,1,0.10000,85271.29688,63946.65625,0.00000,5.22721,5.47792,...,3.95245,6.01123,3.91628,2.18765,2.54391,1.66520,2.00430,1.62186,2.10350,0.10298
2,1695,1135.00000,0.78600,2,0.20000,84996.61719,63928.17969,0.00000,5.22659,5.47856,...,4.88801,6.95339,4.86707,2.64019,3.24634,1.87052,2.40283,2.20951,2.42574,0.10224
3,1695,1135.00000,0.78600,3,0.30000,84872.57812,63919.46875,0.00000,5.22568,5.47795,...,5.41275,7.44928,5.39329,2.65175,3.08321,2.02452,2.47971,2.32798,2.99701,0.09551
4,1695,1135.00000,0.78600,4,0.40000,84929.36719,63936.00781,0.00000,5.22949,5.48159,...,5.62181,7.66678,5.60771,2.68541,3.07640,2.01615,2.56464,2.32909,2.55896,0.10400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5144,1779,1535.00000,0.69000,1530,99.70000,60763.29297,95326.14844,95639.71875,5.60712,5.55802,...,0.15067,-0.01466,0.12129,0.14143,0.10226,0.08314,0.07426,0.10782,0.10624,0.10533
5145,1779,1535.00000,0.69000,1531,99.70000,62156.03516,95327.51562,95663.68750,5.60755,5.55903,...,0.15719,-0.01017,0.12633,0.08470,0.09674,0.07769,0.08226,0.10269,0.10118,0.09733
5146,1779,1535.00000,0.69000,1532,99.80000,63562.53906,95324.79688,95694.00781,5.60899,5.55909,...,0.15133,-0.01360,0.12668,0.05853,0.09251,0.07910,0.08278,0.10303,0.10869,0.10320
5147,1779,1535.00000,0.69000,1533,99.90000,64914.76953,95319.53125,95803.98438,5.60769,5.55997,...,0.15442,-0.01549,0.12141,0.05318,0.09679,0.08100,0.07557,0.10582,0.10158,0.09076


In [13]:
fdf_columns = fdf.columns[5:].tolist()
fdf_ids = fdf.id.unique().tolist()
data = []
for i in fdf_ids:
    for j in fdf_columns:
        # print("id: %s , col: %s" % (i,j))
        x = fdf[fdf['id'] == i][j]
        # print(f'{"normal"}: {"Not Gaussian" if normaltest(x.values,)[1]<0.05 else "Gaussian"}  {normaltest(x.values)}')
        # print(f'{"KS test"}: {"Not Gaussian" if kstest(x.values,"norm")[1]<0.05 else "Gaussian"}  {kstest(x.values,"norm")}')
        # print(f'{"shapiro"}: {"Not Gaussian" if shapiro(x.values)[1]<0.05 else "Gaussian"}  {shapiro(x.values)}')
        normal      = 0 if normaltest(x.values,)[1]<0.05 else 1
        ks          = 0 if kstest(x.values,"norm")[1]<0.05 else 1
        lilli       = 0 if lilliefors(x.values)[1]<0.05 else 1
        shap        = 0 if shapiro(x.values)[1]<0.05 else 1
        cols        = ['id', 'variable', 'normaltest', 'kstest', 'lilliefors', 'shapiro']
        values      = [i, j, normal, ks, lilli, shap]
        zipped      = zip(cols, values)
        dictionary  = dict(zipped)
        data.append(dictionary)
norm_test_results = pd.DataFrame(data)

  z = (x - x.mean()) / x.std(ddof=1)


In [14]:
norm_test_results[(norm_test_results['shapiro'] == 1)]

Unnamed: 0,id,variable,normaltest,kstest,lilliefors,shapiro
2,1695,306LI606,0,0,1,1
8,1695,306LI606.1,0,0,1,1
12,1695,158PIC678_823,0,0,1,1
176,1695,118FI913,0,0,1,1
225,1695,107FI696A,1,0,1,1
227,1695,107FI689A,1,0,1,1
228,1695,107FI693A,1,0,1,1
255,1695,108FI681,1,0,1,1
268,1700,158PIC678_823,0,0,1,1
432,1700,118FI913,0,0,1,1


In [15]:
myseries = norm_test_results[(norm_test_results['shapiro'] == 1)]# | (norm_test_results['lilliefors'] == 1)]
myseries = myseries.groupby('variable').id.nunique()
myseries = myseries[myseries == 4].index.tolist()
print('The following are the variables which follow gaussian distribution in all four batches: \n', myseries)

# norm_test_results[norm_test_results['lilliefors'] == 1]

The following are the variables which follow gaussian distribution in all four batches: 
 ['108FI681', '118FI913', '158PIC678_823']


Now lets check if the variance is equal or not and use unpaired t-test or Welch’s test accordingly

In [19]:
import scipy
def f_test(x, y):
    x = np.array(x)
    y = np.array(y)
    f = np.var(x, ddof=1)/np.var(y, ddof=1) #calculate F test statistic 
    dfn = x.size-1 #define degrees of freedom numerator 
    dfd = y.size-1 #define degrees of freedom denominator 
    p = 1-scipy.stats.f.cdf(f, dfn, dfd) #find p-value of F test statistic 
    return f, p

for i in fdf_ids:
    for j in fdf_ids:
        for k in myseries:
            if i == j:
                continue
            else:
                x = fdf[fdf['id'] == i][k]
                y = fdf[fdf['id'] == j][k]
                F, p = f_test(x,y)
                if p:
                    print("F-test for variable %s between batches %s and %s is %s"% (k, i, j, p))
                    flag = False if p<0.05 else True
                    print("unpaired t-test for the same batches is: ", scipy.stats.ttest_ind(i,j,equal_var=flag))
                    print("Paired t-test for the same batches is: ", scipy.stats.ttest_rel(i,j))

F-test for variable 108FI681 between batches 1695 and 1700 is 0.4472693116689743
unpaired t-test for the same batches is:  Ttest_indResult(statistic=nan, pvalue=nan)
Paired t-test for the same batches is:  Ttest_relResult(statistic=nan, pvalue=nan)
F-test for variable 118FI913 between batches 1695 and 1700 is nan
unpaired t-test for the same batches is:  Ttest_indResult(statistic=nan, pvalue=nan)
Paired t-test for the same batches is:  Ttest_relResult(statistic=nan, pvalue=nan)
F-test for variable 158PIC678_823 between batches 1695 and 1700 is nan
unpaired t-test for the same batches is:  Ttest_indResult(statistic=nan, pvalue=nan)
Paired t-test for the same batches is:  Ttest_relResult(statistic=nan, pvalue=nan)
F-test for variable 108FI681 between batches 1695 and 1776 is 0.1689856429286044
unpaired t-test for the same batches is:  Ttest_indResult(statistic=nan, pvalue=nan)
Paired t-test for the same batches is:  Ttest_relResult(statistic=nan, pvalue=nan)
F-test for variable 118FI913 

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  f = np.var(x, ddof=1)/np.var(y, ddof=1) #calculate F test statistic


In the above output, two key highlights are:
- F-test for variable 108FI681 between batches 1695 and 1779 is (1.1241300058307309, 0.01697610895763868)
- F-test for variable 108FI681 between batches 1700 and 1779 is (1.11558008978606, 0.02095289512189924)

The above two cases reject Null hypothesis, which means that both of them do not have equal variance, therefore we should use Welch test, however when the batches are interchanged the results of the test differ

- F-test for variable 108FI681 between batches 1779 and 1695 is (0.8895768236886454, 0.9830238910423632)
- F-test for variable 108FI681 between batches 1779 and 1700 is (0.8963946283693309, 0.9790471048780995)

Let's use unpaired t-test now and check the results: