In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option("display.float_format", lambda x: "%.5f" % x)

from tqdm import tqdm
import glob
import sys

sys.path.insert(0, "../../nagesh_kommuri/src")

# importing user-defined functions from udf_eda.py
import udf_eda as udf
import udf_timeseries as udf_ts


In [2]:
file_names = glob.glob("../../../data/input/11_Dataset/**/ODP *.xlsx")


In [3]:
file_names


['../../../data/input/11_Dataset\\20211001\\ODP 100001769_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001770_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001771_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001772_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001773_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001774_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001775_dati_BHV_CFF_NF_EXT.xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001776_dati_BHV_CFF_NF_EXT.xlsx',
 '../../../data/input/11_Dataset\\20211001\\ODP 100001777_dati_BHV_CFF_NF_EXT.xlsx',
 '../../../data/input/11_Dataset\\20211004\\ODP 100001700_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211004\\ODP 100001701_dati_BHV_CFF_NF_EXT .xlsx',
 '../../../data/input/11_Dataset\\20211004\\ODP 100001702

In [4]:
features = glob.glob("../../../data/input/ODP *.xlsx")


In [5]:
len(file_names)


109

In [6]:
features


['../../../data/input\\ODP 100002203-Parametri ridotti.xlsx']

In [7]:
_, _, _, featues = udf.read_bind(features)


100%|██████████| 1/1 [00:03<00:00,  3.44s/it]

The following batches have incompatible data:  []
# of batches read:  1
Missing batches, if any: set()
How many NaN values exist in the merged data:  1187
Shape of the merged data:  (1190, 169)





In [8]:
featues.drop("TAG", axis=1, inplace=True)


In [9]:
featues.columns[2:].values


array(['101LI636', '101WI610', '306LI606', '101AI635', '101AI605',
       '101TI607', '101TI637', '306TI604', '158PI678', '158FI652',
       '158FI654', '158FI656', '158FI666', '158FI667', '158FI679P',
       '158TI650', '159FI652', '159FI654', '159FI656', '159FI666',
       '159FI667', '159TI650', '160FI652', '160FI654', '160FI656',
       '160FI666', '160FI667', '160TI650', '161FI652', '161FI654',
       '161FI656', '161FI666', '161FI667', '161TI650', '162FI652',
       '162FI654', '162FI656', '162FI666', '162FI667', '162TI650',
       '163FI652', '163FI654', '163FI656', '163FI666', '163FI667',
       '163TI650', '164FI652', '164FI654', '164FI656', '164FI666',
       '164FI667', '164TI650', '165FI652', '165FI654', '165FI656',
       '165FI666', '165FI667', '165TI650', '158CAL4551', '158CAL4552',
       '159CAL4551', '159CAL4552', '160CAL4551', '160CAL4552',
       '161CAL4551', '161CAL4552', '162CAL4551', '162CAL4552',
       '163CAL4551', '163CAL4552', '164CAL4551', '164CAL4552',
  

In [10]:
data_BHV_CFF, data_NF, data_EXT, appended_data = udf.read_bind(file_names)


100%|██████████| 109/109 [05:34<00:00,  3.07s/it]


The following batches have incompatible data:  ['1576', '1778']
# of batches read:  101
Missing batches, if any: {'1573', '1355', '1067', '1777', '1262', '0947'}
How many NaN values exist in the merged data:  8936962
Shape of the merged data:  (87436, 364)


In [11]:
df = appended_data.copy()


In [12]:
# if any blank columns are created by accident in a spreadsheet software, which wouldn't have any column name, we remove such columns
df = df[df.columns.drop(list(df.filter(regex="Unnamed:")))]
print("How many NaN values exist in the data: ", df.isna().sum().sum())
print("Shape of the data: ", df.shape)


How many NaN values exist in the data:  8936962
Shape of the data:  (87436, 364)


In [13]:
df_id = df[["id", "timeseries"]]


In [14]:
df = df[featues.columns[2:].values]


In [15]:
print("How many NaN values exist in the data: ", df.isna().sum().sum())


How many NaN values exist in the data:  186994


In [16]:
# df = df[df.columns.drop(list(df.filter(regex='.1')))]
# df = df[df.columns.drop(list(df.filter(regex='.2')))]
# df = df[df.columns.drop(list(df.filter(regex='.3')))]


In [17]:
df.shape


(87436, 166)

In [18]:
df = pd.concat([df_id, df], axis=1)


In [19]:
df


Unnamed: 0,id,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,306TI604,...,108TI660,108TI664,108PI650,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1769,2021-09-16 02:45:00,94743.25000,96197.11719,28983.25781,5.12101,5.20942,11.48276,13.52487,20.94076,...,20.63876,20.50171,0.04491,0.11542,0.09313,0.08456,0.08083,0.10902,0.10963,0.09213
1,1769,2021-09-16 02:46:00,94927.07031,96202.61719,28955.89844,5.11885,5.20700,11.48991,13.51579,20.94727,...,20.68064,20.49404,0.04877,0.15048,0.10186,0.08769,0.08899,0.11003,0.11538,0.10532
2,1769,2021-09-16 02:47:00,94470.82813,96197.37500,28923.41211,5.12032,5.21198,11.48278,13.51379,20.95803,...,20.67333,20.47291,0.04442,0.04930,0.09773,0.08979,0.09198,0.10886,0.11169,0.10504
3,1769,2021-09-16 02:48:00,94334.87500,96202.85156,28981.38477,5.12036,5.21064,11.48057,13.52005,20.96014,...,20.65566,20.50415,0.05493,0.04008,0.10604,0.08503,0.09302,0.10722,0.10888,0.10001
4,1769,2021-09-16 02:49:00,94146.34375,96209.53906,28896.72266,5.12075,5.21054,11.48902,13.52251,20.96883,...,20.67343,20.52964,0.05217,0.01833,0.09868,0.09563,0.09524,0.10781,0.11302,0.09227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1197,2021-06-23 17:05:00,2699.54102,88846.57031,0.00000,3.33094,5.67894,19.93162,16.77656,22.34291,...,24.97626,24.85718,0.05493,0.09906,0.10354,0.09034,0.08719,0.10752,0.11198,0.10525
245,1197,2021-06-23 17:10:00,2733.27173,90839.25781,0.00000,3.33267,5.68546,19.77357,16.83838,22.77593,...,24.99808,24.88873,0.05810,0.06977,0.10237,0.09058,0.10104,0.10692,0.11204,0.10247
246,1197,2021-06-23 17:15:00,2759.01514,92892.02344,20.05767,3.34969,5.69014,19.62809,16.90186,22.98542,...,25.02415,24.87848,0.05493,0.10907,0.09598,0.08125,0.08835,0.10855,0.11076,0.10329
247,1197,2021-06-23 17:20:00,2779.48389,94805.92188,53.38301,3.36102,5.69557,19.51937,16.95965,22.47120,...,25.02226,24.90889,0.05130,0.07769,0.10320,0.08823,0.08203,0.11662,0.11765,0.10805


In [20]:
df = df[df.id != "1510"]
print("How many NaN values exist in the data: ", df.isna().sum().sum())
print("Shape of the data: ", df.shape)


How many NaN values exist in the data:  102460
Shape of the data:  (86278, 168)


In [21]:
df.columns[df.isna().any()].tolist()


['118FIC606',
 '118ZLH303',
 '118LS960',
 '118LS690',
 '118ZLL417',
 '118ZLL427',
 '118ZLL437',
 '118ZLL447',
 '118FI912',
 '118FI913',
 '118PI629',
 '118PI628',
 '118PI924',
 '118PI964',
 '118PI925',
 '118PI639',
 '118PI638',
 '118PI934',
 '118PI952',
 '118PI935',
 '118PI649',
 '118PI648',
 '118PI944',
 '118PI953',
 '118PI945']

In [22]:
df[df.columns[df.isna().any()].tolist()]


Unnamed: 0,118FIC606,118ZLH303,118LS960,118LS690,118ZLL417,118ZLL427,118ZLL437,118ZLL447,118FI912,118FI913,...,118PI639,118PI638,118PI934,118PI952,118PI935,118PI649,118PI648,118PI944,118PI953,118PI945
0,0.00000,False,False,False,False,False,False,False,494.00000,32000.00000,...,1.04723,1.15774,4.86397,6.55000,4.48000,0.67657,1.12341,5.04301,6.54000,4.38000
1,0.00000,False,False,False,False,False,False,False,494.00000,32000.00000,...,1.02787,1.15601,4.84827,6.56000,4.47000,0.65889,1.12299,5.06534,6.55000,4.37000
2,0.00000,False,False,False,False,False,False,False,494.00000,32000.00000,...,1.02049,1.14240,4.88221,6.56000,4.54078,0.64723,1.11823,5.07257,6.55000,4.44000
3,0.00000,True,False,False,False,False,False,False,494.00000,32000.00000,...,1.02595,5.56641,4.71093,6.54000,4.57779,0.54228,4.10582,4.93449,6.53850,4.47000
4,0.00000,True,False,False,False,False,False,False,494.00000,32000.00000,...,0.98232,5.31956,4.81761,6.53000,4.64000,0.57781,3.69783,5.03394,6.52000,4.53000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,,,,,,,,,512.59686,32000.00000,...,1.32319,2.65875,4.69533,6.57568,4.35999,1.45069,2.76375,4.73811,6.56000,4.26000
245,,,,,,,,,512.21533,32000.00000,...,1.32163,2.66578,4.63053,6.59000,4.40000,1.44455,2.75677,4.75931,6.58000,4.30000
246,,,,,,,,,512.46265,32000.00000,...,1.33953,2.67130,4.67738,6.56000,4.42000,1.44473,2.75473,4.69683,6.55000,4.32919
247,,,,,,,,,512.34143,32000.00000,...,1.31678,2.66988,4.69824,6.55000,4.44000,1.46919,2.74868,4.72386,6.54000,4.34256


In [23]:
df.drop(df.columns[df.isna().any()].tolist(), axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
df.shape


(86278, 143)

In [25]:
df.drop(["160FI656", "160FI666"], axis=1, inplace=True)


In [26]:
df.select_dtypes(include=["O"])


Unnamed: 0,164FI656,164FI666,164FI667,164TI650,165FI652,165FI654,165FI656,165FI666,165FI667,158TI672,158TI673,118TI977,113LI682,113LI682.1
0,-0.00085,10.83768,-0.00154,36.60553,432.86050,0.02384,-0.00326,11.57410,-0.00647,19.54522,29.19950,-0.76741,0.15946,0.15946
1,-0.00128,8.13376,-0.00099,36.76508,681.21143,0.02189,-0.00275,8.09176,-0.00561,19.54185,29.19003,-3.32031,0.15938,0.15938
2,-0.00145,6.74273,-0.00074,36.91609,950.92322,0.02024,-0.00277,6.76616,-0.00541,19.54449,29.17944,-3.87965,0.15963,0.15963
3,-0.00122,5.64595,0.14649,37.20655,1032.67249,0.01984,-0.00311,5.81708,0.18956,20.11602,29.17311,-3.34251,0.15948,0.15948
4,0.96099,5.23721,0.27813,37.27618,1085.13831,0.02094,0.08358,5.10108,0.21616,21.31817,29.18134,2.70517,0.15960,0.15960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,-0.00117,5.12457,-0.00127,41.56391,1115.79040,-0.03858,-0.00321,5.11245,-0.00553,80.05803,18.65294,107.50488,22.55886,22.55886
245,-0.00098,5.10424,-0.00158,43.50397,1118.41101,-0.04052,-0.00276,5.09023,-0.00569,24.10015,18.68572,107.50488,22.59704,22.59704
246,-0.00159,5.08822,-0.00018,45.47405,1120.83325,-0.03824,-0.00273,5.06691,-0.00660,21.65107,18.71796,107.50488,22.63589,22.63589
247,-0.00107,5.08749,-0.00120,47.43223,1119.77210,-0.03719,-0.00282,5.07242,-0.00535,21.89815,18.72753,107.50488,22.67688,22.67688


In [27]:
C = df.select_dtypes(include=["O"]).columns.values


In [28]:
C


array(['164FI656', '164FI666', '164FI667', '164TI650', '165FI652',
       '165FI654', '165FI656', '165FI666', '165FI667', '158TI672',
       '158TI673', '118TI977', '113LI682', '113LI682.1'], dtype=object)

In [29]:
# try to thinking missing value instead of dropping the data


In [30]:
df.drop(C, axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [31]:
df.drop(list(df.filter(regex="^164")), axis=1, inplace=True)


In [32]:
df.shape


(86278, 123)

In [33]:
df


Unnamed: 0,id,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,306TI604,...,108TI660,108TI664,108PI650,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1769,2021-09-16 02:45:00,94743.25000,96197.11719,28983.25781,5.12101,5.20942,11.48276,13.52487,20.94076,...,20.63876,20.50171,0.04491,0.11542,0.09313,0.08456,0.08083,0.10902,0.10963,0.09213
1,1769,2021-09-16 02:46:00,94927.07031,96202.61719,28955.89844,5.11885,5.20700,11.48991,13.51579,20.94727,...,20.68064,20.49404,0.04877,0.15048,0.10186,0.08769,0.08899,0.11003,0.11538,0.10532
2,1769,2021-09-16 02:47:00,94470.82813,96197.37500,28923.41211,5.12032,5.21198,11.48278,13.51379,20.95803,...,20.67333,20.47291,0.04442,0.04930,0.09773,0.08979,0.09198,0.10886,0.11169,0.10504
3,1769,2021-09-16 02:48:00,94334.87500,96202.85156,28981.38477,5.12036,5.21064,11.48057,13.52005,20.96014,...,20.65566,20.50415,0.05493,0.04008,0.10604,0.08503,0.09302,0.10722,0.10888,0.10001
4,1769,2021-09-16 02:49:00,94146.34375,96209.53906,28896.72266,5.12075,5.21054,11.48902,13.52251,20.96883,...,20.67343,20.52964,0.05217,0.01833,0.09868,0.09563,0.09524,0.10781,0.11302,0.09227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1197,2021-06-23 17:05:00,2699.54102,88846.57031,0.00000,3.33094,5.67894,19.93162,16.77656,22.34291,...,24.97626,24.85718,0.05493,0.09906,0.10354,0.09034,0.08719,0.10752,0.11198,0.10525
245,1197,2021-06-23 17:10:00,2733.27173,90839.25781,0.00000,3.33267,5.68546,19.77357,16.83838,22.77593,...,24.99808,24.88873,0.05810,0.06977,0.10237,0.09058,0.10104,0.10692,0.11204,0.10247
246,1197,2021-06-23 17:15:00,2759.01514,92892.02344,20.05767,3.34969,5.69014,19.62809,16.90186,22.98542,...,25.02415,24.87848,0.05493,0.10907,0.09598,0.08125,0.08835,0.10855,0.11076,0.10329
247,1197,2021-06-23 17:20:00,2779.48389,94805.92188,53.38301,3.36102,5.69557,19.51937,16.95965,22.47120,...,25.02226,24.90889,0.05130,0.07769,0.10320,0.08823,0.08203,0.11662,0.11765,0.10805


In [34]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 86278 entries, 0 to 248
Columns: 123 entries, id to 108FI681
dtypes: datetime64[ns](1), float64(121), string(1)
memory usage: 81.6 MB


In [35]:
df.shape


(86278, 123)

In [36]:
df


Unnamed: 0,id,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,306TI604,...,108TI660,108TI664,108PI650,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1769,2021-09-16 02:45:00,94743.25000,96197.11719,28983.25781,5.12101,5.20942,11.48276,13.52487,20.94076,...,20.63876,20.50171,0.04491,0.11542,0.09313,0.08456,0.08083,0.10902,0.10963,0.09213
1,1769,2021-09-16 02:46:00,94927.07031,96202.61719,28955.89844,5.11885,5.20700,11.48991,13.51579,20.94727,...,20.68064,20.49404,0.04877,0.15048,0.10186,0.08769,0.08899,0.11003,0.11538,0.10532
2,1769,2021-09-16 02:47:00,94470.82813,96197.37500,28923.41211,5.12032,5.21198,11.48278,13.51379,20.95803,...,20.67333,20.47291,0.04442,0.04930,0.09773,0.08979,0.09198,0.10886,0.11169,0.10504
3,1769,2021-09-16 02:48:00,94334.87500,96202.85156,28981.38477,5.12036,5.21064,11.48057,13.52005,20.96014,...,20.65566,20.50415,0.05493,0.04008,0.10604,0.08503,0.09302,0.10722,0.10888,0.10001
4,1769,2021-09-16 02:49:00,94146.34375,96209.53906,28896.72266,5.12075,5.21054,11.48902,13.52251,20.96883,...,20.67343,20.52964,0.05217,0.01833,0.09868,0.09563,0.09524,0.10781,0.11302,0.09227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1197,2021-06-23 17:05:00,2699.54102,88846.57031,0.00000,3.33094,5.67894,19.93162,16.77656,22.34291,...,24.97626,24.85718,0.05493,0.09906,0.10354,0.09034,0.08719,0.10752,0.11198,0.10525
245,1197,2021-06-23 17:10:00,2733.27173,90839.25781,0.00000,3.33267,5.68546,19.77357,16.83838,22.77593,...,24.99808,24.88873,0.05810,0.06977,0.10237,0.09058,0.10104,0.10692,0.11204,0.10247
246,1197,2021-06-23 17:15:00,2759.01514,92892.02344,20.05767,3.34969,5.69014,19.62809,16.90186,22.98542,...,25.02415,24.87848,0.05493,0.10907,0.09598,0.08125,0.08835,0.10855,0.11076,0.10329
247,1197,2021-06-23 17:20:00,2779.48389,94805.92188,53.38301,3.36102,5.69557,19.51937,16.95965,22.47120,...,25.02226,24.90889,0.05130,0.07769,0.10320,0.08823,0.08203,0.11662,0.11765,0.10805


In [37]:
##Croff Flow Filtration (tab CFF) is a stage composed by 8 loops, from unit 158 to 165. Every loop can sometimes be disconnected for cleaning. You can ignore unit when recirculation temperature XXXTI650 goes over 30°C (setpoint is 25°C)


In [38]:
df[(list(df.filter(regex="TI650")))].isnull().sum()


158TI650    0
159TI650    0
160TI650    0
161TI650    0
162TI650    0
163TI650    0
165TI650    0
dtype: int64

In [39]:
df[(list(df.filter(regex="TI650")))]


Unnamed: 0,158TI650,159TI650,160TI650,161TI650,162TI650,163TI650,165TI650
0,21.92756,26.36086,32.15926,17.77330,33.66693,35.43939,36.29831
1,22.13530,26.32862,32.04381,17.77637,33.79589,35.50781,36.69251
2,22.78241,26.47935,31.87328,17.68566,33.95222,35.66818,36.88313
3,23.75874,26.77229,31.92066,17.67621,34.23772,35.83632,37.08784
4,24.59632,27.14235,31.91190,17.72497,34.39598,35.94086,37.08808
...,...,...,...,...,...,...,...
244,49.53495,48.73644,20.06047,45.74931,42.85270,32.25315,40.87234
245,49.60825,50.16758,19.65021,47.84028,45.03857,32.25212,42.63032
246,50.92138,51.43882,19.31797,49.79881,47.15192,30.82711,44.43759
247,52.38799,52.76213,19.50630,51.63932,49.18754,43.70636,46.23510


In [40]:
units_158_to_165_sensors = set()
for unit in range(158, 166):
    tmp = list(df.filter(regex=f"^{unit}"))
    [units_158_to_165_sensors.add(x.replace(f"{unit}", "")) for x in tmp]
print(units_158_to_165_sensors)


{'CAL4551', 'FI667', 'FI679P', 'FI652', 'TI650', 'TI684', 'PI678', 'FI666', 'FI656', 'FI654', 'TI670', 'CAL4552'}


In [41]:
units_per_sensor = dict()

for sensor in units_158_to_165_sensors:
    units_per_sensor[sensor] = [x.replace(sensor, "") for x in df.filter(regex=sensor)]
print(units_per_sensor)


{'CAL4551': ['158', '159', '160', '161', '162', '163', '165'], 'FI667': ['158', '159', '160', '161', '162', '163'], 'FI679P': ['158'], 'FI652': ['158', '159', '160', '161', '162', '163'], 'TI650': ['158', '159', '160', '161', '162', '163', '165'], 'TI684': ['158'], 'PI678': ['158'], 'FI666': ['158', '159', '161', '162', '163'], 'FI656': ['158', '159', '161', '162', '163'], 'FI654': ['158', '159', '160', '161', '162', '163'], 'TI670': ['158'], 'CAL4552': ['158', '159', '160', '161', '162', '163', '165']}


In [42]:
def filtering(row, sensor, units):
    i = 0
    row_sum = 0
    for unit in units:
        if row[f"{unit}TI650"] < 30:
            i += 1
            row_sum += row[f"{unit}{sensor}"]
    if i > 0:
        return row_sum / i
    else:
        return 0


for sensor, units in units_per_sensor.items():
    df[sensor] = df.agg(filtering, axis=1, sensor=sensor, units=units)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [43]:
# drop units measurements as we have the mean ones

for unit in range(158, 166):
    df.drop(list(df.filter(regex=f"^{unit}")), axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [44]:
df.shape


(86278, 82)

In [45]:
# permeate that flows out is cooled at 12°C (158TI672- 158TI673), here we have two heat exchanger that works alternately. The asset that works is usually around 12°C.

# df["158TI672"]=df["158TI672"].where((df['158TI672'] < 13),0)
# df["158TI672"]=df["158TI672"].where((df['158TI672'] < 11),1)

# df["158TI672"]=df["158TI672"].where((df['158TI672'] <2),0)

# df["158TI673"]=df["158TI673"].where((df['158TI673'] < 13),0)
# df["158TI673"]=df["158TI673"].where((df['158TI673'] < 11),1)
# df["158TI673"]=df["158TI673"].where((df['158TI673'] <2),0)


In [46]:
# In Nanofiltration we have two paths: 107, 108. Path 107 works when 107PI650 is greater than 10 bar (ignore all 108XXXX signals). Path 108 works when 108PI650 is greater than 10 bar (ignore all 107XXXXX signals).

df[(list(df.filter(regex="107")))]


Unnamed: 0,107LI606,107TI607,107AI610,107PI650,107FI685A,107FI689A,107FI693A,107AI672,107AI673A,107AI674A,107AI677A
0,0.73503,15.31738,5.00959,0.22018,0.05967,0.04751,0.01005,0.00636,0.01763,0.06946,0.01079
1,0.73541,15.31738,5.00886,0.22083,0.05751,0.04892,0.00959,0.00485,0.01767,0.07421,0.01026
2,0.73500,15.31738,5.00971,0.22296,0.05601,0.05042,0.00973,0.00483,0.01711,0.07435,0.00993
3,0.73824,15.31738,5.00991,0.22089,0.05857,0.04684,0.01117,0.00461,0.01714,0.07254,0.01001
4,0.79244,15.31738,5.00289,0.22198,0.05950,0.04747,0.00992,0.00572,0.01755,0.07408,0.00989
...,...,...,...,...,...,...,...,...,...,...,...
244,5.63845,13.50586,5.11984,19.68652,2.04900,1.46511,0.74406,5.14672,4.07947,13.64274,0.03305
245,3.45312,13.50586,5.11933,19.75192,2.04308,1.45558,0.72439,5.10991,4.08975,13.66189,0.03380
246,1.25427,13.50586,5.12120,19.65288,2.06323,1.47140,0.73174,5.10208,3.95935,13.64341,0.03427
247,0.87878,13.52873,5.14217,19.73477,2.07645,1.48439,0.75327,5.11567,3.95857,13.56315,0.03540


In [47]:
units_107_108_sensors = set()
for unit in range(107, 109):
    tmp = list(df.filter(regex=f"^{unit}"))
    print(tmp)
    [units_107_108_sensors.add(x.replace(f"{unit}", "")) for x in tmp]
print(units_107_108_sensors)


['107LI606', '107TI607', '107AI610', '107PI650', '107FI685A', '107FI689A', '107FI693A', '107AI672', '107AI673A', '107AI674A', '107AI677A']
['108AI610', '108AI611', '108AI612', '108AI613', '108TI652', '108TI656', '108TI660', '108TI664', '108PI650', '108FI653', '108FI657', '108FI665', '108FI669', '108FI673', '108FI677', '108FI681']
{'FI685A', 'AI677A', 'FI657', 'AI610', 'FI681', 'FI665', 'TI607', 'PI650', 'AI673A', 'TI656', 'FI689A', 'FI677', 'AI672', 'AI611', 'FI653', 'AI613', 'TI664', 'AI674A', 'AI612', 'TI652', 'FI673', 'FI669', 'FI693A', 'TI660', 'LI606'}


In [48]:
def filtering(row, sensor, units):
    i = 0
    row_sum = 0
    for unit in units:
        if f"{unit}{sensor}" in row and row[f"{unit}PI650"] > 10:
            i += 1
            row_sum += row[f"{unit}{sensor}"]
    if i > 0:
        return row_sum / i
    else:
        return 0


for sensor in units_107_108_sensors:
    df[sensor] = df.agg(filtering, axis=1, sensor=sensor, units=[107, 108])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [49]:
# drop units measurements as we have the mean ones

for unit in range(107, 109):
    df.drop(list(df.filter(regex=f"^{unit}")), axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [50]:
df.shape


(86278, 80)

In [51]:
list_ext = [
    "118PI946",
    "118PI628",
    "118PI649",
    "113LI682.1",
    "118PI952",
    "118PI953",
    "118PI964",
    "118PI925",
    "118PI945",
    "118PI648",
    "118FI912",
    "118PI934",
    "118PI638",
    "118PI935",
    "118PI629",
    "118PI924",
    "118PI639",
    "118FI913",
    "113LI682",
    "118PI944",
    "118TI977",
    "164FI656",
    "164FI666",
    "164FI667",
    "118PI936",
    "118ZLL417",
    "118ZLL427",
    "118AI641",
    "118PI974",
    "118SI643",
    "163TIC650",
    "118SI633",
    "118ZLL437",
    "118PI927",
    "118PI916",
    "118SI613",
    "118LS690",
    "118ZLH303",
    "164PI653",
    "163PI663",
    "118LS960",
    "118PI917",
    "890PI610",
    "118SI623",
    "118PI937",
    "118PI951",
    "118ZLL447",
    "163PI655",
    "118FIC606",
    "118PI947",
    "164PI655",
    "118AI631",
    "118PI954",
    "118PI984",
    "164PI651",
    "118AI621",
    "118PI926",
    "118AI611",
    "164PI663",
    "163TI650",
    "164FI652",
    "164FI654",
]


In [52]:
df_ext = data_EXT.drop(list_ext, axis=1)


In [53]:
df_ext.columns


Index(['id', 'timeseries', '118PI619', '118PI618', '118PI914', '118PI950',
       '118PI915', '118TI609', '118TI970', '118TI971', '118TI972', '118TI973',
       '118TI978', '118EI610', '118EI620', '118EI630', '118EI640', '118FI983',
       '118FI638', '118AI627', '118AI616', '118FI602', '118FI601',
       '118FI602.1', '109LI606', '109FI616', '118FI982', '118FI606',
       '118CV502', '118CV501', '118FQ602'],
      dtype='object')

In [54]:
df[df_ext.columns[2:].values]


Unnamed: 0,118PI619,118PI618,118PI914,118PI950,118PI915,118TI609,118TI970,118TI971,118TI972,118TI973,...,118FI602,118FI601,118FI602.1,109LI606,109FI616,118FI982,118FI606,118CV502,118CV501,118FQ602
0,0.59894,1.10740,5.01341,6.56000,4.47000,26.38305,38.13971,26.99951,35.18014,39.16344,...,-0.08860,-0.03274,-0.08860,0.07969,1.49410,-0.02440,4.03538,0.00000,0.00000,1897.44263
1,0.58451,1.10358,5.04961,6.56000,4.46000,26.37564,38.03940,26.99951,35.22678,39.02181,...,-0.08967,-0.03168,-0.08967,0.07967,1.36081,-0.02351,3.71492,0.00000,0.00000,1897.44263
2,0.58986,1.10118,5.04501,6.56000,4.53000,26.35686,37.93900,27.02450,35.31494,38.97416,...,-0.08560,-0.03122,-0.08560,0.07969,1.14422,-0.02344,3.79255,0.00000,0.00000,1897.44263
3,0.54386,2.08640,4.88863,6.55000,4.56319,26.37872,37.81296,27.02319,35.43432,38.84520,...,-0.09916,-0.03285,-0.09916,0.07958,1.75029,-0.03558,3.13228,0.00000,0.00000,1897.44263
4,0.57975,2.08140,4.98854,6.53150,4.62761,26.36582,37.74115,26.99951,35.82477,38.74739,...,-0.09613,-0.02816,-0.09613,0.07952,1.46876,-0.04036,2.93423,0.00000,0.00000,1897.44263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1.58975,1.65129,4.73206,6.58000,4.34000,28.43818,25.63965,24.24837,19.61312,13.18237,...,109.28312,70.12413,109.28312,0.25866,2102.52661,6.55403,6308.12939,34.63267,76.95258,2038.48353
245,1.64878,1.64787,4.73485,6.59789,4.38000,28.50724,25.65096,24.24767,19.49219,13.34313,...,109.32439,69.86214,109.32439,0.25593,2106.13184,6.80299,6300.91699,35.06778,77.10796,2047.59759
246,1.64036,1.64477,4.69527,6.56000,4.40000,28.53372,25.63662,24.21678,19.50024,13.34351,...,110.23349,69.79355,110.23349,0.25823,2107.05493,6.97655,6293.92432,35.56608,77.14694,2056.72167
247,1.58151,1.65030,4.69243,6.55861,4.42222,28.56410,25.65510,24.21631,19.44940,13.40053,...,110.58638,70.36667,110.58638,0.25849,2078.64575,6.50380,6298.96094,36.01296,77.06838,2065.91548


In [55]:
df.loc[df["118CV502"] <= 0, list(df[df_ext.columns[2:].values].columns)] = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [56]:
df.loc[df["118CV501"] <= 0, list(df[df_ext.columns[2:].values].columns)] = 0


In [57]:
df[df_ext.columns[2:].values]


Unnamed: 0,118PI619,118PI618,118PI914,118PI950,118PI915,118TI609,118TI970,118TI971,118TI972,118TI973,...,118FI602,118FI601,118FI602.1,109LI606,109FI616,118FI982,118FI606,118CV502,118CV501,118FQ602
0,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
3,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
4,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,1.58975,1.65129,4.73206,6.58000,4.34000,28.43818,25.63965,24.24837,19.61312,13.18237,...,109.28312,70.12413,109.28312,0.25866,2102.52661,6.55403,6308.12939,34.63267,76.95258,2038.48353
245,1.64878,1.64787,4.73485,6.59789,4.38000,28.50724,25.65096,24.24767,19.49219,13.34313,...,109.32439,69.86214,109.32439,0.25593,2106.13184,6.80299,6300.91699,35.06778,77.10796,2047.59759
246,1.64036,1.64477,4.69527,6.56000,4.40000,28.53372,25.63662,24.21678,19.50024,13.34351,...,110.23349,69.79355,110.23349,0.25823,2107.05493,6.97655,6293.92432,35.56608,77.14694,2056.72167
247,1.58151,1.65030,4.69243,6.55861,4.42222,28.56410,25.65510,24.21631,19.44940,13.40053,...,110.58638,70.36667,110.58638,0.25849,2078.64575,6.50380,6298.96094,36.01296,77.06838,2065.91548


In [90]:
# Extracting the Resa variable from produzione_CStOA_2021_ed12.xlsx
tdf = pd.read_excel(
    "../../../data/input/11_Dataset/produzione_CStOA_2021_ed12.xlsx",
    sheet_name="dati-produzione",
    header=1,
)
tdf = tdf[["O.D.P.", "Resa"]]
tdf.dropna(axis=0, how="any", inplace=True)
tdf["O.D.P."] = tdf["O.D.P."].astype(int).astype(str).str[-4:]
tdf.columns = ["id", "result"]
tdf.result = round(tdf.result, 6)


In [91]:
tdf


Unnamed: 0,id,result
0,0015,0.61663
1,0016,0.85617
2,0017,0.71878
3,0018,0.72152
4,0019,0.78281
...,...,...
221,1965,0.78070
222,2057,0.77097
223,2058,0.72896
224,2059,0.74750


In [92]:
df = tdf.merge(df, how="inner")


In [93]:
df.shape


(85088, 86)

In [94]:
# df.drop("CONCTOTALE",axis=1,inplace=True)


In [95]:
tdf = df.groupby(["id"])["timeseries"].agg(["min", "max"]).reset_index()
tdf.columns = ["id", "start_date", "end_date"]
tdf["processing_time_mins"] = (
    (tdf["end_date"] - tdf["start_date"]) / pd.Timedelta(minutes=5)
) + 1
df = tdf.merge(df, how="right")
df.insert(5, "timestamp_index", df.groupby("id").cumcount())
df[
    ["id", "start_date", "end_date", "processing_time_mins", "result"]
].drop_duplicates().reset_index(drop=True)


ValueError: cannot insert timestamp_index, already exists

In [96]:
# for i in range(0, 101, 1):
#     print(df['DIAFTOTALE'].quantile(i/1000))
df.loc[df["DIAFTOTALE"] > 3, "DIAFTOTALE"] = 2.59
df.loc[df["DIAFTOTALE"] < -1, "DIAFTOTALE"] = -0.4


In [97]:
df.id.nunique()


99

In [98]:
df[500:]


Unnamed: 0,id,start_date,end_date,processing_time_mins,result,timestamp_index,progress_perc,timeseries,101LI636,101WI610,...,AI613,TI664,AI674A,AI612,TI652,FI673,FI669,FI693A,TI660,LI606
500,0898,2021-05-12 17:30:00,2021-05-13 15:35:00,266.00000,0.79727,0,0,2021-05-12 17:30:00,95764.66406,96201.79688,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
501,0898,2021-05-12 17:30:00,2021-05-13 15:35:00,266.00000,0.79727,1,0,2021-05-12 17:35:00,96054.96094,96220.96094,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
502,0898,2021-05-12 17:30:00,2021-05-13 15:35:00,266.00000,0.79727,2,1,2021-05-12 17:40:00,95181.43750,96182.90625,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
503,0898,2021-05-12 17:30:00,2021-05-13 15:35:00,266.00000,0.79727,3,1,2021-05-12 17:45:00,94393.28906,96161.73438,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
504,0898,2021-05-12 17:30:00,2021-05-13 15:35:00,266.00000,0.79727,4,2,2021-05-12 17:50:00,92718.63281,96168.07813,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85083,2202,2021-11-05 19:50:00,2021-11-07 00:26:00,344.20000,0.76485,1712,497,2021-11-07 00:22:00,2537.71460,12344.57422,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
85084,2202,2021-11-05 19:50:00,2021-11-07 00:26:00,344.20000,0.76485,1713,498,2021-11-07 00:23:00,2546.81177,12351.03418,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
85085,2202,2021-11-05 19:50:00,2021-11-07 00:26:00,344.20000,0.76485,1714,498,2021-11-07 00:24:00,2542.15967,12350.69531,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
85086,2202,2021-11-05 19:50:00,2021-11-07 00:26:00,344.20000,0.76485,1715,498,2021-11-07 00:25:00,2538.84106,12338.34863,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [99]:
# df.drop(columns=['progress_perc'],inplace=True)
# Insert the progress column which measures progress in percentage terms
df.insert(
    6,
    "progress_perc",
    round(((df.timestamp_index / df.processing_time_mins) * 100), 0).astype(int),
)


ValueError: cannot insert progress_perc, already exists

In [None]:
# 0.9 - 82
# 0.8 - 118
# 0.7 - 132
# df_7 = udf.eliminate_corr(df,thresh = 0.7)


In [None]:
# df_7.insert(6, 'progress_perc', round(((df_7.timestamp_index / df_7.processing_time_mins)*100),0).astype(int))


In [None]:
# df_7.columns


In [100]:
df_std = (
    df.groupby(
        [
            "id",
            "start_date",
            "end_date",
            "processing_time_mins",
            "result",
            "progress_perc",
        ]
    )[df.columns[8:].tolist()]
    .mean()
    .reset_index()
)


In [101]:
df_std.head(50)


Unnamed: 0,id,start_date,end_date,processing_time_mins,result,progress_perc,101LI636,101WI610,306LI606,101AI635,...,AI613,TI664,AI674A,AI612,TI652,FI673,FI669,FI693A,TI660,LI606
0,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,0,96203.16407,29144.66016,0.0,5.50565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,1,92894.15886,40991.07943,0.0,5.52074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,2,88882.37109,47604.0586,0.0,5.53474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,3,86608.85417,49101.83854,0.0,5.54748,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,4,84974.03385,51658.46745,0.0,5.56266,...,0.15712,13.99767,0.0,32.25146,13.24648,1.63624,2.24774,0.0,13.51898,0.0
5,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,5,83392.72917,56673.7526,0.0,5.57289,...,0.15687,13.91652,0.0,32.25146,13.08991,1.29315,2.06343,0.0,13.40356,0.0
6,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,6,80978.68229,56968.14713,0.0,5.52547,...,0.15598,13.8605,0.0,32.25146,13.00465,1.16784,1.96828,0.0,13.33299,0.0
7,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,7,78090.0625,56953.33398,0.0,5.21485,...,0.15535,13.8449,0.0,32.25146,12.96555,1.2617,1.94816,0.0,13.29983,0.0
8,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,8,84139.16406,49120.98177,0.0,5.25729,...,0.15576,13.8543,0.0,32.25146,12.93455,1.14064,1.83848,0.0,13.28595,0.0
9,896,2021-05-10 16:20:00,2021-05-11 15:25:00,278.0,0.79592,9,95018.20312,36728.2487,0.0,5.22413,...,0.15578,13.84758,0.0,32.25146,12.93313,1.14742,1.91329,0.0,13.26669,0.0


In [102]:
x_df = df_std.groupby(["id"]).describe()
x_df.columns = ["_".join(col).strip() for col in x_df.columns.values]
x_df.reset_index(inplace=True)
x_df = x_df[x_df.columns.drop(list(x_df.filter(regex="_count")))]
x_df


Unnamed: 0,id,processing_time_mins_mean,processing_time_mins_std,processing_time_mins_min,processing_time_mins_25%,processing_time_mins_50%,processing_time_mins_75%,processing_time_mins_max,result_mean,result_std,...,TI660_50%,TI660_75%,TI660_max,LI606_mean,LI606_std,LI606_min,LI606_25%,LI606_50%,LI606_75%,LI606_max
0,0896,278.00000,0.00000,278.00000,278.00000,278.00000,278.00000,278.00000,0.79592,0.00000,...,0.00000,13.51898,24.74832,14.04976,18.55587,0.00000,0.00000,0.00000,27.31050,69.36427
1,0897,222.00000,0.00000,222.00000,222.00000,222.00000,222.00000,222.00000,0.73886,0.00000,...,0.00000,0.00000,0.00000,19.73819,11.11909,0.00000,11.28000,20.55303,29.75998,34.67152
2,0898,266.00000,0.00000,266.00000,266.00000,266.00000,266.00000,266.00000,0.79727,0.00000,...,0.00000,13.02307,14.74795,15.61302,17.02709,0.00000,0.00000,3.11362,31.78781,44.15539
3,0899,275.00000,0.00000,275.00000,275.00000,275.00000,275.00000,275.00000,0.72822,0.00000,...,0.00000,0.00000,30.91884,18.60329,13.85386,0.00000,0.00000,19.93956,30.04637,42.26604
4,0900,196.00000,0.00000,196.00000,196.00000,196.00000,196.00000,196.00000,0.74258,0.00000,...,0.00000,13.26305,18.03863,12.15883,14.27041,0.00000,0.00000,0.00000,27.76074,34.77811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1965,284.20000,0.00000,284.20000,284.20000,284.20000,284.20000,284.20000,0.78070,0.00000,...,0.00000,13.19638,13.98155,6.96887,6.71967,0.00000,0.00000,8.18982,13.02913,18.81689
95,2057,309.80000,0.00000,309.80000,309.80000,309.80000,309.80000,309.80000,0.77097,0.00000,...,0.00000,0.00000,0.00000,13.94575,5.85341,0.00000,10.13142,14.18115,18.71430,23.96156
96,2058,275.00000,0.00000,275.00000,275.00000,275.00000,275.00000,275.00000,0.72896,0.00000,...,13.04074,13.12247,13.30781,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
97,2059,297.40000,0.00000,297.40000,297.40000,297.40000,297.40000,297.40000,0.74750,0.00000,...,0.00000,13.08864,13.85659,8.88384,9.39331,0.00000,0.00000,7.04344,17.65503,25.66498


In [103]:
df_std.columns[5:]


Index(['progress_perc', '101LI636', '101WI610', '306LI606', '101AI635',
       '101AI605', '101TI607', '101TI637', '306TI604', 'DWTOTALE',
       'PERMTOTALE', 'DIAFTOTALE', 'CONCTOTALE', '118PI619', '118PI618',
       '118PI914', '118PI950', '118PI915', '118TI609', '118TI970', '118TI971',
       '118TI972', '118TI973', '118TI978', '118EI610', '118EI620', '118EI630',
       '118EI640', '118FI983', '118FI638', '118AI627', '118AI616', '118FI602',
       '118FI601', '118FI602.1', '109LI606', '109FI616', '118FI982',
       '118FI606', '118CV502', '118CV501', '118FQ602', 'CAL4551', 'FI667',
       'FI679P', 'FI652', 'TI650', 'TI684', 'PI678', 'FI666', 'FI656', 'FI654',
       'TI670', 'CAL4552', 'FI685A', 'AI677A', 'FI657', 'AI610', 'FI681',
       'FI665', 'TI607', 'PI650', 'AI673A', 'TI656', 'FI689A', 'FI677',
       'AI672', 'AI611', 'FI653', 'AI613', 'TI664', 'AI674A', 'AI612', 'TI652',
       'FI673', 'FI669', 'FI693A', 'TI660', 'LI606'],
      dtype='object')

In [104]:
# Suspected change points
change_point_vars = df_std.columns[5:].insert(0, "id")
df_change_points = df_std[
    df_std["progress_perc"].isin([0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
]
df_change_points = df_change_points[change_point_vars]
# df_change_points
melt_df = pd.melt(df_change_points, id_vars=["id", "progress_perc"])
melt_df["variable"] = (
    melt_df["variable"].astype(str) + "_" + melt_df["progress_perc"].astype(str)
)
melt_df.drop(columns=["progress_perc"], inplace=True)
df_change_points = melt_df.pivot(index="id", columns="variable", values="value")
df_change_points.reset_index(inplace=True)


In [105]:
df_change_points


variable,id,101AI605_0,101AI605_10,101AI605_100,101AI605_20,101AI605_30,101AI605_40,101AI605_5,101AI605_50,101AI605_60,...,TI684_100,TI684_20,TI684_30,TI684_40,TI684_5,TI684_50,TI684_60,TI684_70,TI684_80,TI684_90
0,0896,5.69792,5.88860,6.23392,6.17455,6.19555,6.11504,5.83420,6.15290,6.09485,...,24.33792,23.96087,24.10828,24.22435,24.11117,24.17892,24.04648,24.15430,12.06991,0.00000
1,0897,5.84284,5.87906,4.42709,5.18858,3.88270,3.60501,5.86637,3.80903,3.97749,...,0.00000,23.89859,23.88470,24.10971,24.29035,24.16150,24.15411,24.19563,24.23189,0.00000
2,0898,5.71475,5.78457,3.71936,5.81487,5.23565,5.29581,5.75736,5.32010,5.33128,...,0.00000,24.47857,24.52174,24.48482,24.51349,24.44332,24.49361,24.51774,24.39658,24.34800
3,0899,5.52975,5.68410,5.53693,5.67445,5.43944,5.44094,5.60349,5.46303,5.47783,...,0.00000,23.87869,23.91874,24.00337,23.89678,24.00657,24.03361,24.02628,23.98199,0.00000
4,0900,5.59293,5.59648,,5.59486,4.15009,4.26972,5.59253,4.34555,4.37478,...,,24.33392,24.34308,24.36632,24.30759,24.41959,24.44852,24.38352,24.38288,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1965,5.58889,5.59050,5.31355,5.59003,5.59251,5.59294,5.58917,5.59526,5.62908,...,22.51414,22.52139,22.52465,22.49421,22.65050,22.49482,22.49863,22.52187,22.50621,22.48415
95,2057,5.36308,5.36392,5.38986,5.36623,5.36828,5.37186,5.36425,5.37207,5.37742,...,22.49207,22.47288,22.49216,22.52601,22.53556,22.50231,22.49854,22.51409,22.49633,22.49386
96,2058,5.27307,5.12795,5.13900,5.13081,5.13225,5.13699,5.27519,5.13783,5.13873,...,22.97257,23.00586,23.01416,22.99579,23.47352,22.99023,23.00640,23.00344,23.00510,22.99511
97,2059,5.69382,5.84623,6.10114,5.91093,5.94284,5.96251,5.73022,6.04770,6.05929,...,22.97618,22.91093,23.00035,22.92567,20.77794,22.95008,22.96184,22.99939,22.98114,23.05203


In [106]:
fdf = x_df.merge(df_change_points, on="id")
fdf


Unnamed: 0,id,processing_time_mins_mean,processing_time_mins_std,processing_time_mins_min,processing_time_mins_25%,processing_time_mins_50%,processing_time_mins_75%,processing_time_mins_max,result_mean,result_std,...,TI684_100,TI684_20,TI684_30,TI684_40,TI684_5,TI684_50,TI684_60,TI684_70,TI684_80,TI684_90
0,0896,278.00000,0.00000,278.00000,278.00000,278.00000,278.00000,278.00000,0.79592,0.00000,...,24.33792,23.96087,24.10828,24.22435,24.11117,24.17892,24.04648,24.15430,12.06991,0.00000
1,0897,222.00000,0.00000,222.00000,222.00000,222.00000,222.00000,222.00000,0.73886,0.00000,...,0.00000,23.89859,23.88470,24.10971,24.29035,24.16150,24.15411,24.19563,24.23189,0.00000
2,0898,266.00000,0.00000,266.00000,266.00000,266.00000,266.00000,266.00000,0.79727,0.00000,...,0.00000,24.47857,24.52174,24.48482,24.51349,24.44332,24.49361,24.51774,24.39658,24.34800
3,0899,275.00000,0.00000,275.00000,275.00000,275.00000,275.00000,275.00000,0.72822,0.00000,...,0.00000,23.87869,23.91874,24.00337,23.89678,24.00657,24.03361,24.02628,23.98199,0.00000
4,0900,196.00000,0.00000,196.00000,196.00000,196.00000,196.00000,196.00000,0.74258,0.00000,...,,24.33392,24.34308,24.36632,24.30759,24.41959,24.44852,24.38352,24.38288,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1965,284.20000,0.00000,284.20000,284.20000,284.20000,284.20000,284.20000,0.78070,0.00000,...,22.51414,22.52139,22.52465,22.49421,22.65050,22.49482,22.49863,22.52187,22.50621,22.48415
95,2057,309.80000,0.00000,309.80000,309.80000,309.80000,309.80000,309.80000,0.77097,0.00000,...,22.49207,22.47288,22.49216,22.52601,22.53556,22.50231,22.49854,22.51409,22.49633,22.49386
96,2058,275.00000,0.00000,275.00000,275.00000,275.00000,275.00000,275.00000,0.72896,0.00000,...,22.97257,23.00586,23.01416,22.99579,23.47352,22.99023,23.00640,23.00344,23.00510,22.99511
97,2059,297.40000,0.00000,297.40000,297.40000,297.40000,297.40000,297.40000,0.74750,0.00000,...,22.97618,22.91093,23.00035,22.92567,20.77794,22.95008,22.96184,22.99939,22.98114,23.05203


In [107]:
fdf.columns[:30]


Index(['id', 'processing_time_mins_mean', 'processing_time_mins_std',
       'processing_time_mins_min', 'processing_time_mins_25%',
       'processing_time_mins_50%', 'processing_time_mins_75%',
       'processing_time_mins_max', 'result_mean', 'result_std', 'result_min',
       'result_25%', 'result_50%', 'result_75%', 'result_max',
       'progress_perc_mean', 'progress_perc_std', 'progress_perc_min',
       'progress_perc_25%', 'progress_perc_50%', 'progress_perc_75%',
       'progress_perc_max', '101LI636_mean', '101LI636_std', '101LI636_min',
       '101LI636_25%', '101LI636_50%', '101LI636_75%', '101LI636_max',
       '101WI610_mean'],
      dtype='object')

In [108]:
df_new = fdf.drop(
    [
        "processing_time_mins_mean",
        "processing_time_mins_std",
        "processing_time_mins_min",
        "processing_time_mins_25%",
        "processing_time_mins_50%",
        "processing_time_mins_75%",
        "processing_time_mins_max",
        "result_mean",
        "result_std",
        "result_min",
        "result_25%",
        "result_50%",
        "result_75%",
        "result_max",
        "progress_perc_mean",
        "progress_perc_std",
        "progress_perc_min",
        "progress_perc_25%",
        "progress_perc_50%",
        "progress_perc_75%",
        "progress_perc_max",
    ],
    axis=1,
)


In [109]:
df_new.shape


(99, 1483)

In [110]:
print("How many NaN values exist in the data: ", df_new.isna().sum().sum())


How many NaN values exist in the data:  2418


In [111]:
df_new1 = df_new.dropna(axis=0)


In [112]:
df_new1.shape


(92, 1483)

In [113]:
# Extracting the Resa variable from produzione_CStOA_2021_ed12.xlsx
tdf = pd.read_excel(
    "../../../data/input/11_Dataset/produzione_CStOA_2021_ed12.xlsx",
    sheet_name="dati-produzione",
    header=1,
)
tdf = tdf[["O.D.P.", "Resa"]]
tdf.dropna(axis=0, how="any", inplace=True)
tdf["O.D.P."] = tdf["O.D.P."].astype(int).astype(str).str[-4:]
tdf.columns = ["id", "result"]
tdf.result = round(tdf.result, 6)


In [114]:
df_new1 = tdf.merge(df_new1, how="inner")


In [115]:
df_new1.shape


(92, 1484)

In [116]:
df_new1


Unnamed: 0,id,result,101LI636_mean,101LI636_std,101LI636_min,101LI636_25%,101LI636_50%,101LI636_75%,101LI636_max,101WI610_mean,...,TI684_100,TI684_20,TI684_30,TI684_40,TI684_5,TI684_50,TI684_60,TI684_70,TI684_80,TI684_90
0,0896,0.79592,49416.32403,34084.84151,3091.41886,13795.41503,52445.04948,82861.18229,96203.16407,12706.84522,...,24.33792,23.96087,24.10828,24.22435,24.11117,24.17892,24.04648,24.15430,12.06991,0.00000
1,0897,0.73886,50439.99821,32163.84722,1106.49274,17792.05665,53218.55339,81633.57812,96043.86719,10378.21695,...,0.00000,23.89859,23.88470,24.10971,24.29035,24.16150,24.15411,24.19563,24.23189,0.00000
2,0898,0.79727,51110.58899,33566.75788,1137.18258,20607.01237,52214.87109,85245.08073,96475.15104,56276.22307,...,0.00000,24.47857,24.52174,24.48482,24.51349,24.44332,24.49361,24.51774,24.39658,24.34800
3,0899,0.72822,59074.96738,32299.84845,906.25909,25859.84440,75012.32292,87863.32813,96313.67188,36249.27922,...,0.00000,23.87869,23.91874,24.00337,23.89678,24.00657,24.03361,24.02628,23.98199,0.00000
4,0948,0.73557,62908.78294,31568.15157,1559.61023,33423.11914,77968.80469,91337.09636,97860.84375,57924.47829,...,0.00000,24.03285,24.08220,24.02252,23.92029,23.98308,24.04671,24.09822,23.93077,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,1965,0.78070,73849.45205,19548.64384,22482.74219,69090.38542,80222.09375,87756.83334,97717.54688,62490.24189,...,22.51414,22.52139,22.52465,22.49421,22.65050,22.49482,22.49863,22.52187,22.50621,22.48415
88,2057,0.77097,70822.42786,27474.21292,2369.76554,54518.43620,83784.69792,90583.50261,98411.22656,66855.28522,...,22.49207,22.47288,22.49216,22.52601,22.53556,22.50231,22.49854,22.51409,22.49633,22.49386
89,2058,0.72896,63721.43828,32187.71415,2440.35278,35897.97103,80437.22656,89879.06380,97456.53906,25848.38759,...,22.97257,23.00586,23.01416,22.99579,23.47352,22.99023,23.00640,23.00344,23.00510,22.99511
90,2059,0.74750,57731.74685,32791.68068,2416.59928,30763.88054,69510.09375,86391.06055,98481.57552,19189.73585,...,22.97618,22.91093,23.00035,22.92567,20.77794,22.95008,22.96184,22.99939,22.98114,23.05203


In [117]:
print("How many NaN values exist in the data: ", df_new1.isna().sum().sum())


How many NaN values exist in the data:  0


In [118]:
df_new1.to_csv("../../../data/input/suanfarma_train_data_rules_round_6.csv")
