# Unique Plate Stability - 01 Machine Learning - Data Preparation

## Setup

### Confidentiality

**All information in this document is strictly confidiental**
**Copyright (C) 2019 HES-SO Valais-Wallis - All Rights Reserved**

### Import sub-modules

In [398]:
# Import required sub-modules

# python
import sys
import os
import enum
import datetime

# iPython
import IPython
from IPython.display import display
from IPython.display import Image

# pandas
import pandas as pd

# numpy
import numpy as np

# plotly
import plotly as ply
import plotly.figure_factory as ff
ply.offline.init_notebook_mode(connected=True)
import plotly.io as pio

# scikit
import sklearn as sklearn
#from sklearn import preprocessing
#from sklearn.tree import DecisionTreeRegressor

# watermark
import watermark
%load_ext watermark
%watermark -a "Silvan Zahno" -d -v -iv -m -h

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
numpy     1.16.2
plotly    3.8.1
sklearn   0.20.3
IPython   7.4.0
watermark 1.8.1
pandas    0.24.2
Silvan Zahno 2019-07-02 

CPython 3.7.3
IPython 7.4.0

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores  : 8
interpreter: 64bit
host name  : WE6996


### Configurations

In [399]:
# Setup local input directories
inputDir = "input/"

if (os.path.exists(inputDir)) is False:
    os.makedirs(inputDir)
if (os.path.isdir(inputDir)) is False:
    raise NotADirectoryError("{} is not a directory".format(inputDir))

In [400]:
# Setup local input directories
outputDir = "output/"

if (os.path.exists(outputDir)) is False:
    os.makedirs(outputDir)
if (os.path.isdir(outputDir)) is False:
    raise NotADirectoryError("{} is not a directory".format(outputDir))

In [401]:
# Graph output Options
class GraphOutputOption(enum.Enum):
    none = 'none'                     # Do not generate any plots
    inline = 'inline'                 # Generate inline plots only
    htmlFile = 'htmlFile'             # Generate plots in external HTML files
    both = 'both'                     # Generate plots both inline and in external html files
    
class GraphInteractionOption(enum.Enum):
    static = 'static'                 # Generate static inline plots (as images)
    interactive = 'interactive'       # Generate interactive inline plots

notebookGraphingInteraction = GraphInteractionOption('static')
notebookGraphingOutputs = GraphOutputOption('both')

staticImageSize = {'width':1000, 'height':500, 'scale':1}

GraphAutoOpenHTML = False              # Auto open external HTML files [True/False]

In [402]:
# Pandas output options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 70)
pd.set_option('display.width', 200)

### Common functions

## Data Import

In [403]:
dirlist = os.listdir(inputDir)
print("Files from directory ", inputDir)
for file in dirlist:
  print("  * ",file)


Files from directory  input/
  *  .ipynb_checkpoints
  *  Data process USP v1_1.xlsx
  *  Data process USP v1_2.xlsx
  *  ExportArseneFiltreFinal20190301 v1_2.csv
  *  ExportArseneFiltreFinal20190301 v1_2_1.csv
  *  ExportArseneFiltreFinal20190301.xlsx


### CSV to pandas DF

In [404]:
# Import CSV files into a pandas dataframe
uspExcelExportFileName = 'ExportArseneFiltreFinal20190301 v1_2_1.csv'
uspExcelExportFilePath = inputDir + uspExcelExportFileName
rawUspDf = pd.read_csv(uspExcelExportFilePath, sep=';')

rawUspDf.head()

Unnamed: 0,SK_LOT,SK_TRACE,OF,BARRE,PLATEAU,LOT,ALLIAGE_INTERNE,ETAT,EPAISSEUR_FINALE,SOUSFAMILLE,FOUR_COULEE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_DATE_POTENCE,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,TR_VITESSE_DERNIERE_PASSE,TR_T_C_DERNIERE_PASSE,TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE,TR_TPS_POTENCE_PREMIERE_PASSE,TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE,...,BARRE_LONGUEUR,BARRE_LARGEUR,BARRE_EPAISSEUR,TREMPE_MACHINE,TREMPE_SK_TEMPS,TREMPE_DATE_DEBUT,TREMPE_DATE_FIN,TREMPE_RECETTE,TRAC_MACHINE,TRAC_SK_TEMPS,TRAC_SENS_TRACTION,TRAC_TXPREML,TRAC_TXPREMT,TRAC_TXDERNL,TRAC_TXDERNT,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMINT,TRAC_EPMAXL,TRAC_EPMAXT,REVENU_MACHINE,REVENU_DATE_DEBUT,REVENU_NOCHARGE,LABO_RM,LABO_RP02,LABO_ALGMT,RASPTAB_SK_TEMPS,RASPTAB_ECHANTILLON,RASPTAB_TIL_VAL,RASPTAB_TIT_VAL,RASPTAB_VAL_MAX_TIL_TIT
0,1208407,141800,6278466,5313129152,2494272415,U4272401,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,395,355.0,1,20,21.42,-1136.7,422.58,-503.4,10.66,12.26,...,2500,1650,380,FTRE07,20160223,20160223010719,20160223013704,R7511,TRAC06,20160223,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,1.9,20.52,0,20.63,0,FAIR25,20160226020754,20165102.0,576.2,514.3,13.4,20160308,U4272401-J,508,78,1100
1,1208408,141800,6278466,5313129152,2494272415,U4272402,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,395,355.0,1,20,21.42,-1136.7,422.58,-503.4,10.66,12.26,...,2500,1650,380,FTRE07,20160223,20160223010719,20160223013704,R7511,TRAC06,20160223,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.53,0,20.6,0,FAIR25,20160226020754,20165102.0,577.6,515.7,12.9,20160308,U4272402-J,589,159,1100
2,1208409,141800,6278466,5313129152,2494272415,U4272403,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,395,355.0,1,20,21.42,-1136.7,422.58,-503.4,10.66,12.26,...,2500,1650,380,FTRE07,20160223,20160223013536,20160223020456,R7511,TRAC06,20160223,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.51,0,20.59,0,FAIR25,20160226020754,20165102.0,576.0,513.7,12.9,20160308,U4272403-J,597,104,1100
3,1208410,141800,6278466,5313129152,2494272415,U4272404,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,395,355.0,1,20,21.42,-1136.7,422.58,-503.4,10.66,12.26,...,2500,1650,380,FTRE07,20160223,20160223013536,20160223020456,R7511,TRAC06,20160223,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.5,0,20.61,0,FAIR25,20160226020754,20165102.0,579.1,516.6,13.9,20160308,U4272404-J,630,114,1100
4,1206475,141800,6278465,5313129122,2494272715,U4272701,7213,T651,22.23,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F47,6,307.1,14.02.2016 14:32,387,356.0,1,20,23.21,-825.2,397.75,-507.4,93.57,10.69,...,2500,1650,380,FTRE07,20160222,20160222143423,20160222150536,R7511,TRAC06,20160222,L,0.0,0,0.0,0,0.0,0.0,0.7,0.6,0.5,22.2,0,22.39,0,FAIR25,20160226020754,20165102.0,573.8,512.1,12.8,20160309,U4272701-J,954,79,850


### Count the null values in the Data Table

In [405]:
verboseCell = 1
# Check if we have any empty field in the data
if len(np.where(pd.isnull(rawUspDf))[0]) != 0:
    if verboseCell >= 0:
      print('WARNING!!: Dataframe has null (NaN or empty) cells')
    for column in rawUspDf:
        listofNullCells = np.where(pd.isnull(rawUspDf[column]))
        lenNullCells = len(listofNullCells[0])
        if lenNullCells != 0:
          if verboseCell >= 1:
            print("  * Column '{}' has {} null cells".format(column, lenNullCells))
          if verboseCell >= 2:  
            print("      {}".format(listofNullCells))
else:
    print("All good: no empty cells in Dataframe")

  * Column 'TR_TPS_POTENCE_PREMIERE_PASSE' has 368 null cells
  * Column 'TREMPE_RECETTE' has 37 null cells
  * Column 'REVENU_NOCHARGE' has 1918 null cells


In [406]:
# dropna drops missing values
if len(np.where(pd.isnull(rawUspDf))[0]) != 0:
  len_with_na = len(rawUspDf)
  cleanedUspDf = rawUspDf.dropna(axis=0)
  len_without_na = len(cleanedUspDf)
  print("Remove NA-Values from Table")
  print("  * {} Rows found".format(len_with_na))
  print("  * {} Rows with NA-Values".format(len_with_na-len_without_na))
  print("  * {} Rows without NA-Values available".format(len_without_na))
else:
  print("No duplicates found")
  
  

Remove NA-Values from Table
  * 7243 Rows found
  * 2237 Rows with NA-Values
  * 5006 Rows without NA-Values available


### Clean Duplicates

In [407]:
if (len(cleanedUspDf[cleanedUspDf.duplicated()]) > 0):
  print("Duplicates found within the Table")
  len_with_duplicates = len(cleanedUspDf)
  cleanedUspDf = cleanedUspDf.drop_duplicates()
  len_without_duplicates = len(cleanedUspDf)
  print("Remove Duplicates from Table".format(len_with_duplicates))
  print("  * {} Elements found".format(len_with_duplicates))
  print("  * {} Duplicates".format(len_with_duplicates - len_without_duplicates))
  print("  * {} Unique Elements available ".format(len_without_duplicates))
else:
  print("No Duplicates found")

No Duplicates found


### Drop Unused columns

In [408]:
# Drop all columns with only one distinct value
for col in cleanedUspDf.columns:
    if len(cleanedUspDf[col].unique()) == 1:
        print("Drop {} only value \"{}\" present".format(col, cleanedUspDf[col][0]))
        cleanedUspDf.drop(col,inplace=True,axis=1)

Drop ETAT only value "T651" present
Drop SOUSFAMILLE only value "31" present
Drop BARRE_EPAISSEUR only value "380" present
Drop TRAC_SENS_TRACTION only value "L" present
Drop TRAC_TXPREMT only value "0" present
Drop TRAC_TXDERNT only value "0" present
Drop TRAC_EPMINT only value "0" present
Drop TRAC_EPMAXT only value "0" present


In [409]:
# Drop specific columns which are not necessary
cleanedUspDf.drop("SK_LOT",inplace=True,axis=1)              # Not needed only traceability
cleanedUspDf.drop("SK_TRACE",inplace=True,axis=1)            # Not needed only traceability
cleanedUspDf.drop("PLATEAU",inplace=True,axis=1)             # Same as BARRE
cleanedUspDf.drop("LOT",inplace=True,axis=1)                 # Not needed only traceability
cleanedUspDf.drop("RASPTAB_ECHANTILLON",inplace=True,axis=1) # Not needed only traceability

### Convert datetime

In [410]:
cleanedUspDf['TR_DATE_POTENCE'] =  pd.to_datetime(cleanedUspDf['TR_DATE_POTENCE'], format='%d.%m.%Y %H:%M')
cleanedUspDf['TREMPE_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['TREMPE_SK_TEMPS'], format='%Y%m%d')
cleanedUspDf['TREMPE_DATE_DEBUT'] =  pd.to_datetime(cleanedUspDf['TREMPE_DATE_DEBUT'], format='%Y%m%d%H%M%S')
cleanedUspDf['TREMPE_DATE_FIN'] =  pd.to_datetime(cleanedUspDf['TREMPE_DATE_FIN'], format='%Y%m%d%H%M%S')
cleanedUspDf['TRAC_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['TRAC_SK_TEMPS'], format='%Y%m%d')
cleanedUspDf['REVENU_DATE_DEBUT'] =  pd.to_datetime(cleanedUspDf['REVENU_DATE_DEBUT'], format='%Y%m%d%H%M%S')
cleanedUspDf['RASPTAB_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['RASPTAB_SK_TEMPS'], format='%Y%m%d')

### Calc timestamps

In [411]:
# Perform duration calculation with the given timestamps
# TODO

## Statistical Analysis

In [412]:
desc = cleanedUspDf.describe(percentiles=[0.25, 0.5, 0.75])
desc

Unnamed: 0,OF,BARRE,ALLIAGE_INTERNE,EPAISSEUR_FINALE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,TR_VITESSE_DERNIERE_PASSE,TR_T_C_DERNIERE_PASSE,TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE,TR_TPS_POTENCE_PREMIERE_PASSE,TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_MOYENNE,BARRE_LONGUEUR,BARRE_LARGEUR,TRAC_TXPREML,TRAC_TXDERNL,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMAXL,REVENU_NOCHARGE,LABO_RM,LABO_RP02,LABO_ALGMT,RASPTAB_TIL_VAL,RASPTAB_TIT_VAL,RASPTAB_VAL_MAX_TIL_TIT
count,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0,5006.0
mean,6289439.0,6941100000.0,7214.322014,22.224071,0.087461,0.149632,1.473155,0.048992,2.406689,0.186141,0.005211,5.769209,0.026101,0.018094,4.432481,383.171554,390.036756,353.828006,1.494407,23.546544,23.227219,-1009.600499,370.312799,-309.644367,166.583034,7.855491,9.578546,12.509642,14.082305,2483.949261,1617.119457,1.403987,0.784179,0.526988,0.32151,0.48332,0.582541,0.482058,16.567669,16.623767,20173940.0,576.934547,511.611025,12.532881,543.722133,251.221334,930.803036
std,7566.079,798790800.0,0.901205,7.369149,0.014143,0.017026,0.054569,0.010341,0.125393,0.004226,0.001672,0.172383,0.003305,0.004564,2.46375,81.010954,6.727569,1.761896,0.500019,1.568207,8.117834,264.606216,17.549683,193.279381,93.224256,2.666988,3.37995,3.839478,0.680393,237.070374,80.150785,0.746769,0.895939,0.836391,0.460638,0.464994,0.600974,0.477904,13.794638,13.835471,8222.421,12.86932,14.769666,0.855534,259.291914,194.570382,347.964015
min,6277498.0,5296229000.0,7213.0,13.0,0.039,0.08,1.2891,0.004,2.15,0.18,0.0022,5.1331,0.014,0.0006,1.0,0.0,369.0,326.0,1.0,18.0,13.57,-2836.0,244.67,-2750.1,10.66,1.8,1.81,2.57,8.97,1800.0,1400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20160060.0,496.61,339.05,7.7,16.0,2.0,400.0
25%,6283235.0,6185129000.0,7213.0,15.0,0.0794,0.14,1.4441,0.043,2.27,0.183,0.0044,5.74,0.024,0.0151,2.0,314.2,386.0,353.0,1.0,23.0,15.69,-1149.2,365.63,-392.9,85.155,5.79,7.03,9.1,13.92,2350.0,1650.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20165440.0,568.0,502.5925,12.0,351.0,101.0,650.0
50%,6288836.0,7070129000.0,7215.0,20.0,0.0884,0.15,1.477,0.05,2.47,0.1864,0.005,5.8,0.0265,0.018,4.0,444.0,390.0,354.0,1.0,24.0,20.88,-1129.0,371.23,-381.85,172.6,7.275,8.27,13.24,14.1,2500.0,1650.0,1.7,0.0,0.0,0.0,0.5,0.5,0.5,20.29,20.36,20175230.0,577.8,512.465,12.5,510.5,198.0,1100.0
75%,6294644.0,7296229000.0,7215.0,25.5,0.0952,0.16,1.51,0.0559,2.51,0.19,0.0056,5.86,0.0281,0.0212,6.0,451.0,394.0,355.0,2.0,24.0,26.47,-902.5,380.1175,-131.7,234.52,9.25,11.7,15.34,14.37,2700.0,1650.0,1.9,1.7,1.0,0.6,0.8,0.9,0.8,25.67,25.77,20175680.0,586.0025,521.4775,13.1,707.0,365.0,1300.0
max,6305550.0,9018129000.0,7215.0,40.0,0.142,0.2,1.5934,0.08,2.6,0.21,0.0329,6.1,0.0352,0.0293,10.0,740.0,415.0,358.0,2.0,36.0,90.02,2536.5,431.35,2093.6,924.78,34.47,20.06,36.66,16.68,4600.0,1650.0,2.6,2.5,7.8,2.8,3.8,4.0,3.5,40.66,40.84,20195080.0,607.7,547.32,19.3,1757.0,1982.0,1500.0


## Prepare for Machine learning

### Filter by thickness

In [413]:
print(np.sort(cleanedUspDf["EPAISSEUR_FINALE"].unique()))
print("There are {} different thicknesses".format(len(cleanedUspDf["EPAISSEUR_FINALE"].unique())))

[13.    14.    15.    15.875 15.88  16.    17.    18.    18.5   19.05
 20.    20.5   21.    21.5   22.    22.2   22.225 22.23  23.    25.
 25.4   25.5   26.    27.    28.    28.57  28.575 29.    30.    31.
 31.75  32.    35.    35.5   36.    38.    38.1   39.    39.7   40.   ]
There are 40 different thicknesses


In [414]:
filteredDfs = []
for thickness in cleanedUspDf["EPAISSEUR_FINALE"].unique():
    filteredDfs.append(cleanedUspDf[cleanedUspDf["EPAISSEUR_FINALE"] == thickness])

for df in filteredDfs:
    print("Thickness {:6} has {:6} elements".format(df["EPAISSEUR_FINALE"].values[0] ,len(df)))

Thickness   20.5 has     34 elements
Thickness  22.23 has     28 elements
Thickness   20.0 has    712 elements
Thickness   16.0 has    328 elements
Thickness   32.0 has     63 elements
Thickness   40.0 has    232 elements
Thickness   15.0 has   1284 elements
Thickness   25.0 has    386 elements
Thickness   35.0 has    213 elements
Thickness  19.05 has     29 elements
Thickness   30.0 has    433 elements
Thickness   25.5 has    194 elements
Thickness   23.0 has    309 elements
Thickness   22.0 has    105 elements
Thickness   36.0 has     27 elements
Thickness   35.5 has     17 elements
Thickness   18.5 has      8 elements
Thickness   14.0 has     20 elements
Thickness   39.0 has      4 elements
Thickness   31.0 has     34 elements
Thickness  31.75 has     22 elements
Thickness  15.88 has     30 elements
Thickness 22.225 has     25 elements
Thickness   13.0 has    146 elements
Thickness   26.0 has     40 elements
Thickness   21.0 has     75 elements
Thickness   27.0 has     25 elements
T

### Select Prediction Model

In [415]:
filteredDf = filteredDfs[6]

In [416]:
# LABO_RM
# LABO_RP02
# LABO_ALGMT
# RASPTAB_TIL_VAL
# RASPTAB_TIT_VAL
usp_response = ['RASPTAB_TIT_VAL']
y = filteredDf['RASPTAB_TIT_VAL']

### Choosing Features

In [417]:
filteredDf["EPAISSEUR_FINALE"].values[0]

15.0

In [418]:
filteredDf.columns

Index(['OF', 'BARRE', 'ALLIAGE_INTERNE', 'EPAISSEUR_FINALE', 'FOUR_COULEE', 'COMPO_SI', 'COMPO_FE', 'COMPO_CU', 'COMPO_MN', 'COMPO_MG', 'COMPO_CR', 'COMPO_NI', 'COMPO_ZN', 'COMPO_TI', 'COMPO_ZR',
       'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'PRECHAUFFAGE_T_C_AIR_MAXI', 'TR_DATE_POTENCE', 'TR_T_C_POTENCE', 'TR_EPAISSEUR_POTENCE', 'TR_NOMBRE_EBAUCHE', 'TR_NOMBRE_PASSE',
       'TR_EPAISSEUR_DERNIERE_PASSE', 'TR_VITESSE_DERNIERE_PASSE', 'TR_T_C_DERNIERE_PASSE', 'TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE', 'TR_TPS_POTENCE_PREMIERE_PASSE',
       'TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_MOYENNE', 'BARRE_LONGUEUR',
       'BARRE_LARGEUR', 'TREMPE_MACHINE', 'TREMPE_SK_TEMPS', 'TREMPE_DATE_DEBUT', 'TREMPE_DATE_FIN', 'TREMPE_RECETTE', 'TRAC_MACHINE', 'TRAC_SK_TEMPS', 'TRAC_TXPREML', 'TRAC_TXDERNL',
       'TRAC_PLNTRANS', 'TRAC_PLNLONG', 'TRAC_PLNTRANSTETE',

In [419]:
filteredDf.head()

Unnamed: 0,OF,BARRE,ALLIAGE_INTERNE,EPAISSEUR_FINALE,FOUR_COULEE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_DATE_POTENCE,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,TR_VITESSE_DERNIERE_PASSE,TR_T_C_DERNIERE_PASSE,TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE,TR_TPS_POTENCE_PREMIERE_PASSE,TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_MOYENNE,BARRE_LONGUEUR,BARRE_LARGEUR,TREMPE_MACHINE,TREMPE_SK_TEMPS,TREMPE_DATE_DEBUT,TREMPE_DATE_FIN,TREMPE_RECETTE,TRAC_MACHINE,TRAC_SK_TEMPS,TRAC_TXPREML,TRAC_TXDERNL,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMAXL,REVENU_MACHINE,REVENU_DATE_DEBUT,REVENU_NOCHARGE,LABO_RM,LABO_RP02,LABO_ALGMT,RASPTAB_SK_TEMPS,RASPTAB_TIL_VAL,RASPTAB_TIT_VAL,RASPTAB_VAL_MAX_TIL_TIT
34,6277589,5335229152,7215,15.0,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,2015-12-12 19:04:00,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,2016-01-07,2016-01-07 02:47:02,2016-01-07 03:05:16,R4675,TRAC04,2016-01-07,1.7,1.7,2.7,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,2016-01-10 11:05:29,20165009.0,584.1,508.13,11.3,2016-01-21,191,259,1300
35,6277589,5335229152,7215,15.0,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,2015-12-12 19:04:00,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,2016-01-07,2016-01-07 03:05:51,2016-01-07 03:24:10,R4675,TRAC04,2016-01-07,1.7,1.7,1.3,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,2016-01-10 11:05:29,20165009.0,588.71,518.82,12.6,2016-01-21,332,207,1300
36,6277589,5335229152,7215,15.0,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,2015-12-12 19:04:00,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,2016-01-07,2016-01-07 02:51:11,2016-01-07 03:13:18,R4675,TRAC04,2016-01-07,1.7,1.7,1.3,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,2016-01-10 11:05:29,20165009.0,588.71,518.82,12.6,2016-01-21,332,207,1300
37,6277589,5335229152,7215,15.0,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,2015-12-12 19:04:00,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,2016-01-07,2016-01-07 03:51:26,2016-01-07 04:09:46,R4675,TRAC04,2016-01-07,1.7,1.7,1.2,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,2016-01-10 11:05:29,20165009.0,588.29,523.52,12.9,2016-01-21,1407,275,1300
38,6277589,5335229152,7215,15.0,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,2015-12-12 19:04:00,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,2016-01-07,2016-01-07 03:24:44,2016-01-07 03:43:03,R4675,TRAC04,2016-01-07,1.7,1.7,1.0,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,2016-01-10 11:05:29,20165009.0,581.69,509.07,12.3,2016-01-21,1026,39,1300


In [420]:
cleanedUspDf['TR_DATE_POTENCE'] =  pd.to_datetime(cleanedUspDf['TR_DATE_POTENCE'], format='%d.%m.%Y %H:%M')
cleanedUspDf['TREMPE_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['TREMPE_SK_TEMPS'], format='%Y%m%d')
cleanedUspDf['TREMPE_DATE_DEBUT'] =  pd.to_datetime(cleanedUspDf['TREMPE_DATE_DEBUT'], format='%Y%m%d%H%M%S')
cleanedUspDf['TREMPE_DATE_FIN'] =  pd.to_datetime(cleanedUspDf['TREMPE_DATE_FIN'], format='%Y%m%d%H%M%S')
cleanedUspDf['TRAC_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['TRAC_SK_TEMPS'], format='%Y%m%d')
cleanedUspDf['REVENU_DATE_DEBUT'] =  pd.to_datetime(cleanedUspDf['REVENU_DATE_DEBUT'], format='%Y%m%d%H%M%S')
cleanedUspDf['RASPTAB_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['RASPTAB_SK_TEMPS'], format='%Y%m%d')

In [421]:
# usp_features = ['ALLIAGE_INTERNE', 'FOUR_COULEE', 'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'TREMPE_MACHINE', 'TREMPE_RECETTE', 'REVENU_MACHINE']
#usp_features = ['ALLIAGE_INTERNE', 'FOUR_COULEE', 'COMPO_SI', 'COMPO_FE', 'COMPO_CU', 'COMPO_MN', 'COMPO_MG', 'COMPO_CR', 'COMPO_NI', 'COMPO_ZN', 'COMPO_TI', 'COMPO_ZR', 'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'PRECHAUFFAGE_T_C_AIR_MAXI', 'TR_DATE_POTENCE', 'TR_T_C_POTENCE', 'TR_EPAISSEUR_POTENCE', 'TR_NOMBRE_EBAUCHE', 'TR_NOMBRE_PASSE', 'TR_EPAISSEUR_DERNIERE_PASSE', 'TR_VITESSE_DERNIERE_PASSE', 'TR_T_C_DERNIERE_PASSE', 'TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE', 'TR_TPS_POTENCE_PREMIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_MOYENNE', 'BARRE_LONGUEUR', 'BARRE_LARGEUR', 'TREMPE_MACHINE', 'TREMPE_SK_TEMPS', 'TREMPE_DATE_DEBUT', 'TREMPE_DATE_FIN', 'TREMPE_RECETTE', 'TRAC_MACHINE', 'TRAC_SK_TEMPS', 'TRAC_TXPREML', 'TRAC_TXDERNL', 'TRAC_PLNTRANS', 'TRAC_PLNLONG', 'TRAC_PLNTRANSTETE', 'TRAC_PLNTRANSMILIEU', 'TRAC_PLNTRANSPIED', 'TRAC_EPMINL', 'TRAC_EPMAXL', 'REVENU_MACHINE', 'REVENU_DATE_DEBUT', 'REVENU_NOCHARGE']
usp_features = ['ALLIAGE_INTERNE', 'FOUR_COULEE', 'COMPO_SI', 'COMPO_FE', 'COMPO_CU', 'COMPO_MN', 'COMPO_MG', 'COMPO_CR', 'COMPO_NI', 'COMPO_ZN', 'COMPO_TI', 'COMPO_ZR', 'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'PRECHAUFFAGE_T_C_AIR_MAXI', 'TR_T_C_POTENCE', 'TR_EPAISSEUR_POTENCE', 'TR_NOMBRE_EBAUCHE', 'TR_NOMBRE_PASSE', 'TR_EPAISSEUR_DERNIERE_PASSE', 'TR_VITESSE_DERNIERE_PASSE', 'TR_T_C_DERNIERE_PASSE', 'TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE', 'TR_TPS_POTENCE_PREMIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_MOYENNE', 'BARRE_LONGUEUR', 'BARRE_LARGEUR', 'TREMPE_MACHINE', 'TREMPE_RECETTE', 'TRAC_MACHINE', 'TRAC_TXPREML', 'TRAC_TXDERNL', 'TRAC_PLNTRANS', 'TRAC_PLNLONG', 'TRAC_PLNTRANSTETE', 'TRAC_PLNTRANSMILIEU', 'TRAC_PLNTRANSPIED', 'TRAC_EPMINL', 'TRAC_EPMAXL', 'REVENU_MACHINE', 'REVENU_NOCHARGE']
        
# By convention this is called X
X = filteredDf[usp_features]
X.head(1)

Unnamed: 0,ALLIAGE_INTERNE,FOUR_COULEE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,TR_VITESSE_DERNIERE_PASSE,TR_T_C_DERNIERE_PASSE,TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE,TR_TPS_POTENCE_PREMIERE_PASSE,TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_MOYENNE,BARRE_LONGUEUR,BARRE_LARGEUR,TREMPE_MACHINE,TREMPE_RECETTE,TRAC_MACHINE,TRAC_TXPREML,TRAC_TXDERNL,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMAXL,REVENU_MACHINE,REVENU_NOCHARGE
34,7215,HG_TP09,0.1,0.18,1.49,0.053,2.47,0.19,0.0053,5.74,0.027,0.017,F49,10,322.9,395,354.0,2,25,15.66,-1143.1,373.71,-530.3,104.12,3.95,4.95,6.08,13.53,2500,1650,FTRE06,R4675,TRAC04,1.7,1.7,2.7,0.8,0.0,0.0,0.0,0.0,0.0,FAIR09,20165009.0


In [422]:
X.describe()

Unnamed: 0,ALLIAGE_INTERNE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,TR_VITESSE_DERNIERE_PASSE,TR_T_C_DERNIERE_PASSE,TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE,TR_TPS_POTENCE_PREMIERE_PASSE,TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE,TR_REDUCTION_EPAISSEUR_MOYENNE,BARRE_LONGUEUR,BARRE_LARGEUR,TRAC_TXPREML,TRAC_TXDERNL,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMAXL,REVENU_NOCHARGE
count,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0,1284.0
mean,7214.831776,0.086509,0.149373,1.480359,0.05059,2.478754,0.185894,0.005211,5.819483,0.026231,0.017619,4.528816,377.41285,389.521028,353.909657,1.988318,24.390966,15.70574,-1100.858723,362.381768,-300.957165,168.381301,5.543115,6.775576,11.64088,13.893707,2547.741433,1622.196262,1.772352,1.764019,1.288551,0.846636,0.0,0.0,0.0,0.0,0.0,20172700.0
std,0.549685,0.010928,0.017014,0.046141,0.009318,0.075039,0.003767,0.001088,0.092725,0.003728,0.003773,2.595027,80.056647,7.258313,1.020262,0.107493,1.209128,0.318647,358.345319,17.873143,264.171628,81.636252,0.854565,1.375628,3.465517,0.554199,204.705842,78.033141,0.21551,0.273566,0.619381,0.336945,0.0,0.0,0.0,0.0,0.0,7368.247
min,7213.0,0.06,0.11,1.3299,0.0278,2.1871,0.18,0.003,5.2848,0.014,0.01,1.0,203.5,373.0,349.0,1.0,23.0,15.48,-1199.9,267.52,-536.3,21.95,1.8,1.81,2.66,10.25,1800.0,1400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20160070.0
25%,7215.0,0.0797,0.1397,1.4468,0.044,2.47,0.1823,0.0045,5.7817,0.0238,0.0152,3.0,317.8,385.0,354.0,2.0,24.0,15.66,-1153.8,356.83,-397.5,86.57,5.3,5.7,8.0,13.57,2500.0,1650.0,1.6,1.6,0.8,0.6,0.0,0.0,0.0,0.0,0.0,20165420.0
50%,7215.0,0.0877,0.1495,1.48,0.0512,2.4928,0.1859,0.0051,5.82,0.0272,0.0174,4.0,396.8,390.0,354.0,2.0,24.0,15.67,-1147.4,365.44,-390.6,185.65,5.49,7.03,13.49,14.1,2500.0,1650.0,1.7,1.8,1.0,0.8,0.0,0.0,0.0,0.0,0.0,20175160.0
75%,7215.0,0.0912,0.16,1.5133,0.058,2.52,0.19,0.0056,5.8678,0.028425,0.0199,6.0,451.0,395.0,354.0,2.0,25.0,15.69,-1134.4,370.9,-140.4,239.96,5.79,7.3,13.79,14.1,2700.0,1650.0,1.9,1.9,1.7,1.0,0.0,0.0,0.0,0.0,0.0,20175630.0
max,7215.0,0.1145,0.18,1.5678,0.0742,2.6,0.191,0.0082,6.0,0.0347,0.0293,10.0,510.0,408.0,356.0,2.0,33.0,18.97,2536.5,417.14,2093.6,309.55,9.85,16.93,19.29,14.71,2900.0,1650.0,2.5,2.5,4.8,2.8,0.0,0.0,0.0,0.0,0.0,20195060.0


### Encoding Categorical Features

In [423]:
cleanedUspDf.dtypes

OF                                      int64
BARRE                                   int64
ALLIAGE_INTERNE                         int64
EPAISSEUR_FINALE                      float64
FOUR_COULEE                            object
COMPO_SI                              float64
COMPO_FE                              float64
COMPO_CU                              float64
COMPO_MN                              float64
COMPO_MG                              float64
COMPO_CR                              float64
COMPO_NI                              float64
COMPO_ZN                              float64
COMPO_TI                              float64
COMPO_ZR                              float64
PRECHAUFFAGE_NUMERO_FOUR               object
PRECHAUFFAGE_POSITION_FOUR              int64
PRECHAUFFAGE_T_C_AIR_MAXI             float64
TR_DATE_POTENCE                datetime64[ns]
TR_T_C_POTENCE                          int64
TR_EPAISSEUR_POTENCE                  float64
TR_NOMBRE_EBAUCHE                 

In [424]:
# Get all object coloumns which need to be changes or encoded somehow
ObjectsXDf = X.loc[:, X.dtypes == object]
NonObjectsXDf = X.loc[:, X.dtypes != object]

In [425]:
# Columns need to be transformed
cols_to_transform = ObjectsXDf.columns
cols_to_transform

Index(['FOUR_COULEE', 'PRECHAUFFAGE_NUMERO_FOUR', 'TREMPE_MACHINE', 'TREMPE_RECETTE', 'TRAC_MACHINE', 'REVENU_MACHINE'], dtype='object')

In [426]:
# Columns to keep
cols_to_retain = NonObjectsXDf.columns
cat_dict = X[ cols_to_retain ].to_dict( orient = 'records' )
#cat_dict

In [427]:
# Pandas get.dummies not so good for ML
X = pd.get_dummies(X, columns = cols_to_transform )

#from sklearn.preprocessing import OneHotEncoder
#ohe = OneHotEncoder(sparse=True, handle_unknown="ignore")
#X = X.apply(ohe.fit(X[ cols_to_retain ]))

### Split Test Train

In [428]:
from sklearn.model_selection import train_test_split
len(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

## Build the model

In [452]:
runs = 1000

### DecisionTree Regressor

In [455]:
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
model = DecisionTreeRegressor(random_state=1)

result = []
run = 0
for run in range(runs):
    # Fit model
    model.fit(X_train, y_train)
    
    # Validate model
    result.append(model.score(X_test, y_test))

print("After {} runs\n  * min accuracy: {}\n  * max accuracy: {}".format(run, min(result)*100.0, max(result)*100.0))

After 999 runs
  * min accuracy: 70.01859345079046
  * max accuracy: 70.01859345079046


### RandomForest

In [456]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier()
result = []
run = 0
for run in range(runs):
    # Fit model
    model.fit(X_train, y_train)
    
    # Validate model
    y_predict = model.predict(X_test)
    result.append(accuracy_score(y_test.values, y_predict))

print("After {} runs\n  * min accuracy: {}\n  * max accuracy: {}".format(run, min(result)*100.0, max(result)*100.0))


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



After 999 runs
  * min accuracy: 49.532710280373834
  * max accuracy: 57.943925233644855
