In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import plotly.express as px
import matplotlib.pyplot as plt
# Render every float in DataFrame/Series output with exactly five decimals.
pd.set_option('display.float_format', '{:.5f}'.format)

from tqdm import tqdm
import glob
import sys  
sys.path.insert(0, '../../nagesh_kommuri/src')

# importing user-defined functions from udf_eda.py
import udf_eda as udf
import udf_timeseries as udf_ts

In [2]:
# Collect the per-batch "ODP <number>.xlsx" workbooks, one sub-folder below 11_Dataset.
# Without recursive=True, "**" matches a single directory level, just like "*".
odp_pattern = '../../../data/input/11_Dataset/**/ODP *.xlsx'
file_names = glob.glob(odp_pattern)

In [3]:
# Peek at the first two matched paths to sanity-check the glob pattern.
file_names[:2]

['../../../data/input/11_Dataset/20211012/ODP 100001512.xlsx',
 '../../../data/input/11_Dataset/20211012/ODP 100001952.xlsx']

In [7]:
# Load the matched workbooks into a single DataFrame.
# read_bind is defined in udf_eda.py (not visible here); the recorded output shows a
# per-file progress bar plus a report of incompatible / missing batches -- TODO confirm
# the exact binding rules in that module.
df = udf.read_bind(file_names)

100%|██████████| 45/45 [04:40<00:00,  6.23s/it]


The following batches have incompatible data:  ['1778', '1576']
# of batches read:  41
Missing batches, if any: {'1573', '1777'}


In [8]:
# Display the freshly loaded frame (pandas truncates the rendering of large frames).
df

Unnamed: 0,id,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,306TI604,...,Unnamed: 139,Unnamed: 122,Unnamed: 124,Unnamed: 126,Unnamed: 128,Unnamed: 130,Unnamed: 141,Unnamed: 143,Unnamed: 145,Unnamed: 147
0,1512,2021-08-14 23:30:00,93151.39844,98294.57813,0.00000,5.10371,5.91796,12.17896,13.36066,19.91798,...,,,,,,,,,,
1,1512,2021-08-14 23:31:00,92774.96875,98311.64063,0.00000,5.10458,5.91850,12.19247,13.36455,19.92311,...,,,,,,,,,,
2,1512,2021-08-14 23:32:00,92563.79688,98294.81250,0.00000,5.10496,5.91805,12.19195,13.33857,19.91615,...,,,,,,,,,,
3,1512,2021-08-14 23:33:00,92337.44531,98290.10156,0.00000,5.10927,5.91845,12.19305,13.33974,19.93248,...,,,,,,,,,,
4,1512,2021-08-14 23:34:00,91675.15625,98304.65625,0.00000,5.10713,5.91787,12.20043,13.33574,19.95224,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,1701,2021-09-09 22:37:00,82405.95313,33070.69922,85118.58594,5.30540,5.54348,11.04324,13.13375,21.60054,...,,,,,,,,,,
1383,1701,2021-09-09 22:38:00,82410.16406,33068.64453,85110.12500,5.30350,5.54147,11.01961,13.14960,21.61064,...,,,,,,,,,,
1384,1701,2021-09-09 22:39:00,82412.42188,33068.34375,85057.60156,5.30322,5.54361,11.00541,13.12194,21.61058,...,,,,,,,,,,
1385,1701,2021-09-09 22:40:00,82406.48438,33060.69922,85024.37500,5.30635,5.54131,10.98045,13.11768,21.58987,...,,,,,,,,,,


In [9]:
# Spreadsheet software sometimes leaves blank, header-less columns behind; they show up
# here with labels containing "Unnamed:". Drop all of them, then report what is left.
unnamed_cols = list(df.filter(regex='Unnamed:'))
kept_cols = df.columns.drop(unnamed_cols)
df = df[kept_cols]
nan_total = df.isna().sum().sum()
print("How many NaN values exist in the data: ", nan_total)
print("Shape of the data: ", df.shape)

How many NaN values exist in the data:  130854
Shape of the data:  (53834, 265)


In [10]:
# (rows, columns) after removing the "Unnamed:" columns.
df.shape

(53834, 265)

In [11]:
# Display the frame again now that the "Unnamed:" columns are gone.
df

Unnamed: 0,id,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,306TI604,...,108PI659,108PI662,108PI663,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1512,2021-08-14 23:30:00,93151.39844,98294.57813,0.00000,5.10371,5.91796,12.17896,13.36066,19.91798,...,0.17087,-0.00583,0.11040,0.11398,0.10268,0.08283,0.08319,0.11911,0.11308,0.10691
1,1512,2021-08-14 23:31:00,92774.96875,98311.64063,0.00000,5.10458,5.91850,12.19247,13.36455,19.92311,...,0.16638,-0.00399,0.10999,0.04455,0.09669,0.09162,0.08817,0.10447,0.11574,0.10316
2,1512,2021-08-14 23:32:00,92563.79688,98294.81250,0.00000,5.10496,5.91805,12.19195,13.33857,19.91615,...,0.16777,-0.00965,0.11963,0.14188,0.10196,0.08681,0.08490,0.11631,0.11960,0.11522
3,1512,2021-08-14 23:33:00,92337.44531,98290.10156,0.00000,5.10927,5.91845,12.19305,13.33974,19.93248,...,0.16682,-0.00685,0.10951,0.05276,0.10534,0.09464,0.08846,0.11252,0.11401,0.10517
4,1512,2021-08-14 23:34:00,91675.15625,98304.65625,0.00000,5.10713,5.91787,12.20043,13.33574,19.95224,...,0.17090,-0.00753,0.12073,0.04119,0.10110,0.09366,0.09375,0.10299,0.10787,0.08498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,1701,2021-09-09 22:37:00,82405.95313,33070.69922,85118.58594,5.30540,5.54348,11.04324,13.13375,21.60054,...,1.98089,-0.02625,0.09468,1.78485,1.55033,0.08275,0.07252,0.10943,0.10575,0.09579
1383,1701,2021-09-09 22:38:00,82410.16406,33068.64453,85110.12500,5.30350,5.54147,11.01961,13.14960,21.61064,...,6.19787,-0.02625,0.09701,1.82199,1.65050,0.08176,0.08477,0.10018,0.10283,0.09452
1384,1701,2021-09-09 22:39:00,82412.42188,33068.34375,85057.60156,5.30322,5.54361,11.00541,13.12194,21.61058,...,6.20957,-0.02559,0.10168,1.98877,1.74241,0.08115,0.07711,0.10856,0.10334,0.10498
1385,1701,2021-09-09 22:40:00,82406.48438,33060.69922,85024.37500,5.30635,5.54131,10.98045,13.11768,21.58987,...,6.23108,-0.03219,0.09791,1.98764,1.83296,0.08520,0.07892,0.10680,0.10287,0.09021


In [12]:
# Exclude batch '1510' (ids are strings). The recorded output shows the NaN count
# falling to 0 once it is removed -- presumably why it is excluded; TODO confirm.
keep_rows = df['id'] != '1510'
df = df.loc[keep_rows]
nan_total = df.isna().sum().sum()
print("How many NaN values exist in the data: ", nan_total)
print("Shape of the data: ", df.shape)

How many NaN values exist in the data:  0
Shape of the data:  (52676, 265)


In [37]:
# Extract the Resa variable from produzione_CStOA_2021_ed12.xlsx as a two-column
# table (id, result), one row per production order.
tdf = pd.read_excel('../../../data/input/11_Dataset/produzione_CStOA_2021_ed12.xlsx', sheet_name="dati-produzione", header=1)
# Work on an explicit copy of the two needed columns: dropping rows and assigning
# columns in place on a column slice triggers SettingWithCopyWarning and may not
# behave as intended. Rows missing either value are discarded; the index is kept as is.
tdf = tdf[['O.D.P.', 'Resa']].dropna(axis=0, how='any').copy()
# Keep only the last four digits of the order number as a string id
# (presumably to match the 4-character batch ids of the sensor data -- TODO confirm).
tdf['O.D.P.'] = tdf['O.D.P.'].astype(int).astype(str).str[-4:]
tdf.columns = ['id', 'result']
# Item assignment instead of attribute assignment (`tdf.result = ...`), which only
# works when the column already exists and silently creates an attribute otherwise.
tdf['result'] = tdf['result'].round(3)

In [38]:
# Display the (id, result) table extracted from the production workbook.
tdf

Unnamed: 0,id,result
0,0015,0.61700
1,0016,0.85600
2,0017,0.71900
3,0018,0.72200
4,0019,0.78300
...,...,...
201,1778,0.75000
202,1779,0.69000
203,1780,0.76400
204,1948,0.73300


In [15]:
# Attach the batch-level `result` to every sensor row. No key is given, so pandas joins
# on the columns both frames share; the inner join keeps only batches present in both.
df = pd.merge(tdf, df, how='inner')

In [16]:
# Display the sensor data with the `result` column attached.
df

Unnamed: 0,id,result,timeseries,101LI636,101WI610,306LI606,101AI635,101AI605,101TI607,101TI637,...,108PI659,108PI662,108PI663,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1511,0.77300,2021-08-12 15:45:00,81477.29688,21599.61719,0.00000,5.51934,5.47237,22.28649,21.63257,...,0.19053,-0.00131,0.11143,0.07196,0.09553,0.08337,0.09170,0.10912,0.11009,0.10120
1,1511,0.77300,2021-08-12 15:46:00,81779.82813,22878.83203,0.00000,5.52148,5.47525,22.25741,21.56869,...,0.19943,-0.00977,0.11113,0.13773,0.10828,0.08770,0.08412,0.11635,0.11269,0.11215
2,1511,0.77300,2021-08-12 15:47:00,82196.60938,24160.41211,0.00000,5.52057,5.47622,22.21622,21.47825,...,0.19490,-0.00120,0.10793,0.03334,0.10950,0.09661,0.09661,0.10159,0.11203,0.09815
3,1511,0.77300,2021-08-12 15:48:00,82719.15625,25430.91016,0.00000,5.52429,5.47810,22.21272,21.40980,...,0.19552,-0.00595,0.10315,0.03987,0.10044,0.09077,0.09199,0.10551,0.10990,0.09696
4,1511,0.77300,2021-08-12 15:49:00,83419.19531,26682.87500,0.00000,5.52440,5.48099,22.18258,21.32355,...,0.19390,-0.00408,0.11148,0.10552,0.09854,0.08863,0.08935,0.11219,0.10966,0.09689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49856,1949,0.75900,2021-10-05 00:55:00,2231.38086,1426.05713,0.00000,3.74282,4.87921,16.10916,14.14818,...,0.16151,-0.04031,0.09843,0.04526,0.09306,0.08371,0.08335,0.10280,0.10907,0.09397
49857,1949,0.75900,2021-10-05 00:56:00,2231.68579,1436.05176,0.00000,3.74502,4.88274,16.11393,14.16090,...,0.16558,-0.04496,0.09614,0.12657,0.09976,0.07932,0.07267,0.11036,0.09884,0.09824
49858,1949,0.75900,2021-10-05 00:57:00,2235.42993,1425.38989,0.00000,3.74571,4.88233,16.11052,14.16929,...,0.16743,-0.03034,0.09093,0.09325,0.09936,0.07983,0.08169,0.10508,0.10837,0.10906
49859,1949,0.75900,2021-10-05 00:58:00,2250.75171,1432.30359,0.00000,3.74886,4.88551,16.09573,14.16330,...,0.16743,-0.04386,0.09612,0.13265,0.09834,0.08014,0.08567,0.10680,0.10603,0.10089


In [18]:
# Per-batch time span: first and last timestamp recorded for every batch id.
# Stored under its own name rather than rebinding `tdf`, because a later cell still
# merges on `tdf` as the (id, result) table; overwriting it here would make the notebook
# depend on re-running the workbook-loading cell out of order.
batch_span = df.groupby(['id'])['timeseries'].agg(['min', 'max']).reset_index()
batch_span.columns = ['id', 'start_date', 'end_date']
# Duration in minutes counting both end points (hence the +1).
batch_span['processing_time_mins'] = ((batch_span['end_date'] - batch_span['start_date'])/pd.Timedelta(minutes = 1))+1
# Right join keeps every sensor row; the only column the two frames share is `id`.
df = batch_span.merge(df, how = 'right')
# 0-based row position inside each batch. This equals elapsed minutes only if the rows
# are already in time order with one row per minute -- TODO confirm.
df.insert(5, 'timestamp_index', df.groupby('id').cumcount())
# One summary row per batch.
df[['id', 'start_date', 'end_date', 'processing_time_mins', 'result']].drop_duplicates().reset_index(drop =True)

Unnamed: 0,id,start_date,end_date,processing_time_mins,result
0,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.0,0.773
1,1512,2021-08-14 23:30:00,2021-08-15 23:00:00,1411.0,0.802
2,1572,2021-08-16 14:16:00,2021-08-17 13:29:00,1394.0,0.764
3,1574,2021-08-18 19:30:00,2021-08-19 19:09:00,1420.0,0.74
4,1575,2021-08-20 00:00:00,2021-08-21 02:29:00,1590.0,0.758
5,1577,2021-08-22 20:40:00,2021-08-23 18:29:00,1310.0,0.733
6,1578,2021-08-23 22:50:00,2021-08-24 21:09:00,1340.0,0.725
7,1579,2021-08-25 00:22:00,2021-08-25 22:29:00,1328.0,0.703
8,1580,2021-08-26 15:14:00,2021-08-27 11:29:00,1216.0,0.754
9,1581,2021-08-27 17:02:00,2021-08-28 11:39:00,1118.0,0.741


In [19]:
# Replace extreme DIAFTOTALE readings with fixed values: anything above 3 becomes 2.59,
# anything below -1 becomes -0.4. The replacement values were presumably read off a
# quantile scan of the column -- TODO confirm and move them to named constants.
diaf = df['DIAFTOTALE']
above_cap = diaf > 3
below_floor = diaf < -1
df.loc[above_cap, 'DIAFTOTALE'] = 2.59
df.loc[below_floor, 'DIAFTOTALE'] = -0.4

In [20]:
# Display the frame after the DIAFTOTALE replacement.
df

Unnamed: 0,id,start_date,end_date,processing_time_mins,result,timestamp_index,timeseries,101LI636,101WI610,306LI606,...,108PI659,108PI662,108PI663,108FI653,108FI657,108FI665,108FI669,108FI673,108FI677,108FI681
0,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.00000,0.77300,0,2021-08-12 15:45:00,81477.29688,21599.61719,0.00000,...,0.19053,-0.00131,0.11143,0.07196,0.09553,0.08337,0.09170,0.10912,0.11009,0.10120
1,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.00000,0.77300,1,2021-08-12 15:46:00,81779.82813,22878.83203,0.00000,...,0.19943,-0.00977,0.11113,0.13773,0.10828,0.08770,0.08412,0.11635,0.11269,0.11215
2,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.00000,0.77300,2,2021-08-12 15:47:00,82196.60938,24160.41211,0.00000,...,0.19490,-0.00120,0.10793,0.03334,0.10950,0.09661,0.09661,0.10159,0.11203,0.09815
3,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.00000,0.77300,3,2021-08-12 15:48:00,82719.15625,25430.91016,0.00000,...,0.19552,-0.00595,0.10315,0.03987,0.10044,0.09077,0.09199,0.10551,0.10990,0.09696
4,1511,2021-08-12 15:45:00,2021-08-13 10:35:00,1131.00000,0.77300,4,2021-08-12 15:49:00,83419.19531,26682.87500,0.00000,...,0.19390,-0.00408,0.11148,0.10552,0.09854,0.08863,0.08935,0.11219,0.10966,0.09689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49856,1949,2021-10-04 03:20:00,2021-10-05 00:59:00,1300.00000,0.75900,1295,2021-10-05 00:55:00,2231.38086,1426.05713,0.00000,...,0.16151,-0.04031,0.09843,0.04526,0.09306,0.08371,0.08335,0.10280,0.10907,0.09397
49857,1949,2021-10-04 03:20:00,2021-10-05 00:59:00,1300.00000,0.75900,1296,2021-10-05 00:56:00,2231.68579,1436.05176,0.00000,...,0.16558,-0.04496,0.09614,0.12657,0.09976,0.07932,0.07267,0.11036,0.09884,0.09824
49858,1949,2021-10-04 03:20:00,2021-10-05 00:59:00,1300.00000,0.75900,1297,2021-10-05 00:57:00,2235.42993,1425.38989,0.00000,...,0.16743,-0.03034,0.09093,0.09325,0.09936,0.07983,0.08169,0.10508,0.10837,0.10906
49859,1949,2021-10-04 03:20:00,2021-10-05 00:59:00,1300.00000,0.75900,1298,2021-10-05 00:58:00,2250.75171,1432.30359,0.00000,...,0.16743,-0.04386,0.09612,0.13265,0.09834,0.08014,0.08567,0.10680,0.10603,0.10089


In [21]:
# Progress through the batch as a whole-number percentage: row position within the
# batch divided by the batch duration in minutes. Inserted as the 7th column.
# Re-running this cell fails because the column already exists; drop it first if needed.
progress_ratio = df['timestamp_index'] / df['processing_time_mins']
df.insert(6, 'progress_perc', (progress_ratio * 100).round(0).astype(int))

In [22]:
# Author's notes as "threshold - count", presumably the number of columns dropped at
# that threshold. The recorded output for 0.7 reports 127 columns to drop, so these
# figures may be stale -- TODO confirm.
# 0.9 - 82
# 0.8 - 118
# 0.7 - 132
# eliminate_corr is defined in udf_eda.py (not visible here); per the recorded output it
# drops correlated columns, including 'progress_perc'.
df_7 = udf.eliminate_corr(df,thresh = 0.7)

# of columns before dropping correlated variables:  271
# of columns to drop 127
Dropped columns after correlation analysis: ['progress_perc', '101LI636', '306LI606.1', '158PI651', '158PI653', '158PI655', '158PI663', '159PI651', '159PI653', '159PI655', '159PI663', '160PI651', '160PI653', '160PI655', '160PI663', '161PI651', '161PI653', '161PI655', '161PI663', '162FI652', '162FI667', '162PI651', '162PI653', '162PI655', '162PI663', '163FI652', '163FI667', '163PI651', '163PI653', '163PI655', '163PI663', '163TI650', '164FI652', '165PI651', '165PI653', '165PI655', '165PI663', '159CAL4552', '161CAL4552', '162CAL4552', '163CAL4551', '163CAL4552', '164CAL4552', '165CAL4552', 'DWTOTALE', 'PERMTOTALE', 'DIAFTOTALE', '158TI670', '118PI917', '118TI609', '118TI972', '118TI973', '118SI623', '118SI633', '118SI643', '118FI638', '118AI621', '118AI616', '118FI601', '118FI602.1', '109FI616', '118FI982', '118FI606', '118CV502', '118CV501', '890PI610', '118FQ602', '118FI912', '118PI629', '118PI628', '118PI9

In [23]:
# The correlation filter removed 'progress_perc' (see its recorded output), so rebuild
# it on the reduced frame with the same formula and put it back as the 7th column.
progress_ratio_7 = df_7['timestamp_index'] / df_7['processing_time_mins']
df_7.insert(6, 'progress_perc', (progress_ratio_7 * 100).round(0).astype(int))

In [24]:
# List the columns that remain after the correlation filter.
df_7.columns

Index(['id', 'start_date', 'end_date', 'processing_time_mins', 'result',
       'timestamp_index', 'progress_perc', 'timeseries', '101WI610',
       '306LI606',
       ...
       '107AI610', '107TI652', '107PI650', '107FI681A', '107AI672', '108AI610',
       '107AI677A', '108AI613', '108TI652', '108FI681'],
      dtype='object', length=145)

In [25]:
# Average every remaining measurement column per batch and per progress percentage.
# Columns from position 8 onward are the measurements; the first eight are bookkeeping
# (id, dates, duration, result, timestamp_index, progress_perc, timeseries).
group_keys = ['id', 'start_date', 'end_date', 'processing_time_mins', 'result', 'progress_perc']
measurement_cols = df_7.columns[8:].tolist()
df_std = df_7.groupby(group_keys)[measurement_cols].mean().reset_index()

In [26]:
# Per-batch summary statistics (mean, std, min, quartiles, max) of every column.
per_batch_stats = df_std.groupby(['id']).describe()
# Flatten the (column, statistic) MultiIndex into "column_statistic" labels.
per_batch_stats.columns = ['_'.join(pair).strip() for pair in per_batch_stats.columns.values]
per_batch_stats = per_batch_stats.reset_index()
# Drop every column whose label contains "_count".
count_cols = list(per_batch_stats.filter(regex='_count'))
x_df = per_batch_stats[per_batch_stats.columns.drop(count_cols)]
x_df

Unnamed: 0,id,processing_time_mins_mean,processing_time_mins_std,processing_time_mins_min,processing_time_mins_25%,processing_time_mins_50%,processing_time_mins_75%,processing_time_mins_max,result_mean,result_std,...,108TI652_50%,108TI652_75%,108TI652_max,108FI681_mean,108FI681_std,108FI681_min,108FI681_25%,108FI681_50%,108FI681_75%,108FI681_max
0,1511,1131.0,0.0,1131.0,1131.0,1131.0,1131.0,1131.0,0.773,0.0,...,24.06955,24.12285,24.2263,0.10191,0.00236,0.09668,0.10014,0.10187,0.10355,0.10707
1,1512,1411.0,0.0,1411.0,1411.0,1411.0,1411.0,1411.0,0.802,0.0,...,22.21137,23.1699,40.40574,0.10161,0.00273,0.09497,0.10024,0.10119,0.10325,0.10847
2,1572,1394.0,0.0,1394.0,1394.0,1394.0,1394.0,1394.0,0.764,0.0,...,23.63868,24.40141,40.47026,0.09633,0.00586,0.08479,0.09231,0.09624,0.10072,0.10859
3,1574,1420.0,0.0,1420.0,1420.0,1420.0,1420.0,1420.0,0.74,0.0,...,15.98552,19.09484,39.68801,0.09432,0.00491,0.08494,0.08996,0.09493,0.09794,0.10364
4,1575,1590.0,0.0,1590.0,1590.0,1590.0,1590.0,1590.0,0.758,0.0,...,16.07552,17.98729,40.73367,0.5888,0.74234,0.08716,0.09683,0.10211,0.85488,3.6219
5,1577,1310.0,0.0,1310.0,1310.0,1310.0,1310.0,1310.0,0.733,0.0,...,19.2448,21.90539,40.60771,0.09897,0.00281,0.09367,0.09691,0.09831,0.10134,0.10446
6,1578,1340.0,0.0,1340.0,1340.0,1340.0,1340.0,1340.0,0.725,0.0,...,21.39254,21.74301,40.10335,0.26001,0.65964,0.09037,0.09435,0.09614,0.09884,4.23019
7,1579,1328.0,0.0,1328.0,1328.0,1328.0,1328.0,1328.0,0.703,0.0,...,20.81791,21.50258,22.12421,0.09649,0.00235,0.09098,0.09509,0.09636,0.09797,0.10254
8,1580,1216.0,0.0,1216.0,1216.0,1216.0,1216.0,1216.0,0.754,0.0,...,12.93851,12.99706,41.22321,0.09741,0.00248,0.09181,0.09549,0.09737,0.09899,0.10264
9,1581,1118.0,0.0,1118.0,1118.0,1118.0,1118.0,1118.0,0.741,0.0,...,14.60525,14.63736,40.11726,0.09591,0.00314,0.08787,0.09345,0.09524,0.09832,0.10417


In [27]:
# Suspected change points: take the averaged measurements at selected progress
# percentages and spread them into one wide row per batch, with columns named
# "<measurement>_<progress_perc>".
checkpoints = [0,5,10,20,25,50,75,80,85,90,95,99,100]
# 'id' plus everything from 'progress_perc' onward in df_std.
change_point_vars = ['id'] + df_std.columns[5:].tolist()
at_checkpoints = df_std.loc[df_std['progress_perc'].isin(checkpoints), change_point_vars]
# Long format: one row per (id, progress_perc, measurement).
melt_df = at_checkpoints.melt(id_vars=['id', 'progress_perc'])
melt_df['variable'] = melt_df['variable'].astype(str) + '_' + melt_df['progress_perc'].astype(str)
melt_df = melt_df.drop(columns=['progress_perc'])
# Back to wide format, one row per batch id.
df_change_points = melt_df.pivot(index="id", columns="variable", values="value").reset_index()

In [28]:
# Combine the per-batch summary statistics with the change-point features, matched on id.
fdf = pd.merge(x_df, df_change_points, on='id')
fdf

Unnamed: 0,id,processing_time_mins_mean,processing_time_mins_std,processing_time_mins_min,processing_time_mins_25%,processing_time_mins_50%,processing_time_mins_75%,processing_time_mins_max,result_mean,result_std,...,CONCTOTALE_20,CONCTOTALE_25,CONCTOTALE_5,CONCTOTALE_50,CONCTOTALE_75,CONCTOTALE_80,CONCTOTALE_85,CONCTOTALE_90,CONCTOTALE_95,CONCTOTALE_99
0,1511,1131.0,0.0,1131.0,1131.0,1131.0,1131.0,1131.0,0.773,0.0,...,0.31645,0.3319,-2.71153,0.32228,0.32209,0.36068,0.37681,1.0087,1.00939,1.00879
1,1512,1411.0,0.0,1411.0,1411.0,1411.0,1411.0,1411.0,0.802,0.0,...,0.29023,0.28872,0.28995,0.2911,0.29068,0.28827,0.30788,0.31816,0.36027,0.37365
2,1572,1394.0,0.0,1394.0,1394.0,1394.0,1394.0,1394.0,0.764,0.0,...,0.3169,0.31848,0.40376,0.36853,0.36854,0.40288,0.44613,1.00877,1.00765,1.00532
3,1574,1420.0,0.0,1420.0,1420.0,1420.0,1420.0,1420.0,0.74,0.0,...,0.40047,0.39833,0.39835,0.41553,0.41134,0.41452,0.41804,0.64582,0.99777,0.99396
4,1575,1590.0,0.0,1590.0,1590.0,1590.0,1590.0,1590.0,0.758,0.0,...,0.32658,0.32573,0.32797,0.32695,0.44377,0.48391,1.00035,0.99839,0.99778,0.99794
5,1577,1310.0,0.0,1310.0,1310.0,1310.0,1310.0,1310.0,0.733,0.0,...,0.33234,0.35132,0.38664,0.35664,0.40239,0.40308,0.40479,1.00256,1.00358,1.00251
6,1578,1340.0,0.0,1340.0,1340.0,1340.0,1340.0,1340.0,0.725,0.0,...,0.3713,0.373,0.42475,0.47138,0.39906,0.39585,0.38959,0.99613,0.99592,0.99393
7,1579,1328.0,0.0,1328.0,1328.0,1328.0,1328.0,1328.0,0.703,0.0,...,0.38101,0.38323,0.3899,0.44003,0.47886,0.48787,0.4828,0.49111,1.01732,1.01357
8,1580,1216.0,0.0,1216.0,1216.0,1216.0,1216.0,1216.0,0.754,0.0,...,0.43561,0.42424,0.43185,0.46121,0.42567,0.44905,0.4478,0.46794,1.00933,1.00832
9,1581,1118.0,0.0,1118.0,1118.0,1118.0,1118.0,1118.0,0.741,0.0,...,0.43005,0.40285,0.4894,0.43134,0.43295,0.42867,0.43408,1.00838,1.007,1.00693


In [30]:
# Inspect the first 30 column labels to find the bookkeeping statistics to drop.
fdf.columns[:30]

Index(['id', 'processing_time_mins_mean', 'processing_time_mins_std',
       'processing_time_mins_min', 'processing_time_mins_25%',
       'processing_time_mins_50%', 'processing_time_mins_75%',
       'processing_time_mins_max', 'result_mean', 'result_std', 'result_min',
       'result_25%', 'result_50%', 'result_75%', 'result_max',
       'progress_perc_mean', 'progress_perc_std', 'progress_perc_min',
       'progress_perc_25%', 'progress_perc_50%', 'progress_perc_75%',
       'progress_perc_max', '101WI610_mean', '101WI610_std', '101WI610_min',
       '101WI610_25%', '101WI610_50%', '101WI610_75%', '101WI610_max',
       '306LI606_mean'],
      dtype='object')

In [40]:
# Remove the summary statistics of the three bookkeeping columns (duration, result,
# progress); only the statistics of the measurement columns are kept.
bookkeeping_cols = ['processing_time_mins', 'result', 'progress_perc']
stat_suffixes = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
bookkeeping_stats = [f'{col}_{stat}' for col in bookkeeping_cols for stat in stat_suffixes]
df_new = fdf.drop(columns=bookkeeping_stats)

In [41]:
# Display the feature table after dropping the bookkeeping statistics.
df_new

Unnamed: 0,id,101WI610_mean,101WI610_std,101WI610_min,101WI610_25%,101WI610_50%,101WI610_75%,101WI610_max,306LI606_mean,306LI606_std,...,CONCTOTALE_20,CONCTOTALE_25,CONCTOTALE_5,CONCTOTALE_50,CONCTOTALE_75,CONCTOTALE_80,CONCTOTALE_85,CONCTOTALE_90,CONCTOTALE_95,CONCTOTALE_99
0,1511,13460.77362,17275.83736,1312.10546,1330.81072,1347.70573,15831.36514,56869.1392,0.0,0.0,...,0.31645,0.3319,-2.71153,0.32228,0.32209,0.36068,0.37681,1.0087,1.00939,1.00879
1,1512,30658.90245,32629.25831,1407.97249,1577.22771,23398.32917,57413.01563,98316.34989,0.0,0.0,...,0.29023,0.28872,0.28995,0.2911,0.29068,0.28827,0.30788,0.31816,0.36027,0.37365
2,1572,58439.85289,17694.21717,18136.58398,50241.52288,50308.50502,67612.44866,98501.61551,1858.32416,5640.75012,...,0.3169,0.31848,0.40376,0.36853,0.36854,0.40288,0.44613,1.00877,1.00765,1.00532
3,1574,33688.22369,33605.15596,1270.1675,1429.06422,21077.82101,62922.0865,91801.70536,2161.58787,8273.83161,...,0.40047,0.39833,0.39835,0.41553,0.41134,0.41452,0.41804,0.64582,0.99777,0.99396
4,1575,55587.0352,20430.7007,39126.76228,39258.45104,39713.79517,72965.4624,93491.93164,2994.10024,8124.38343,...,0.32658,0.32573,0.32797,0.32695,0.44377,0.48391,1.00035,0.99839,0.99778,0.99794
5,1577,57058.40588,35989.48032,1399.24512,19240.8744,56005.1845,96329.7846,98501.25841,3022.11311,8717.07132,...,0.33234,0.35132,0.38664,0.35664,0.40239,0.40308,0.40479,1.00256,1.00358,1.00251
6,1578,51853.50636,35391.17899,1423.26499,24825.53861,55475.75511,79093.16406,96443.60435,3356.49012,9934.67192,...,0.3713,0.373,0.42475,0.47138,0.39906,0.39585,0.38959,0.99613,0.99592,0.99393
7,1579,17634.17522,23822.73067,1245.13251,1375.78975,1568.79735,28747.97656,69813.78683,0.0,0.0,...,0.38101,0.38323,0.3899,0.44003,0.47886,0.48787,0.4828,0.49111,1.01732,1.01357
8,1580,14383.19789,20784.58342,1272.5639,1298.59734,1331.08653,20827.87184,62111.78971,0.0,0.0,...,0.43561,0.42424,0.43185,0.46121,0.42567,0.44905,0.4478,0.46794,1.00933,1.00832
9,1581,12324.65744,17505.46202,1119.78308,1176.7793,1262.01829,18731.27557,54611.90234,0.0,0.0,...,0.43005,0.40285,0.4894,0.43134,0.43295,0.42867,0.43408,1.00838,1.007,1.00693


In [42]:
# Attach the target column `result` to the per-batch feature table.
# Fix: the original referenced `def_new`, which is not defined anywhere (NameError);
# the feature table is `df_new`.
# `tdf` must still be the (id, result) table from the production workbook at this point.
# Selecting the two columns explicitly makes the cell fail loudly (KeyError) if `tdf`
# has been rebound to something else, instead of silently merging the wrong columns.
df_new = tdf[['id', 'result']].merge(df_new, how='inner')

In [44]:
# (rows, columns) of the final training table.
df_new.shape

(38, 2082)

In [46]:
# Persist the final feature table as CSV.
# NOTE(review): to_csv writes the row index by default, so the file gains an unnamed
# leading column; pass index=False if downstream readers should not see it -- confirm
# with the consumers of this file before changing.
df_new.to_csv("../../../data/input/suanfarma_train_data.csv")