# Unique Plate Stability Machine Learning

## Setup

### Confidentiality

**All information in this document is strictly confidiental**
**Copyright (C) 2019 HES-SO Valais-Wallis - All Rights Reserved**

### Import sub-modules

In [2]:
# Import required sub-modules

# python
import sys
import os
import enum
import datetime

# iPython
import IPython
from IPython.display import display
from IPython.display import Image

# pandas
import pandas as pd

# numpy
import numpy as np

# plotly
import plotly as ply
import plotly.figure_factory as ff
ply.offline.init_notebook_mode(connected=True)
import plotly.io as pio

#scikit
import sklearn as sklearn
#from sklearn import preprocessing
#from sklearn.tree import DecisionTreeRegressor

# Verbose what we are working with
print("python: {}".format(sys.version))
print("    - os")
print("    - datetime")
print("    - enum")

print("ipython {}".format(IPython.__version__))
print("pandas: {}".format(pd.__version__))
print("numpy: {}".format(np.__version__))
print("plotly: {}".format(ply.__version__))
print("scikit: {}".format(sklearn.__version__))

python: 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
    - os
    - datetime
    - enum
ipython 7.4.0
pandas: 0.24.2
numpy: 1.16.2
plotly: 3.8.1
scikit: 0.20.3


### Configurations

In [3]:
# Setup local input directories
inputDir = "input/"

if (os.path.exists(inputDir)) is False:
    os.makedirs(inputDir)
if (os.path.isdir(inputDir)) is False:
    raise NotADirectoryError("{} is not a directory".format(inputDir))

In [4]:
# Setup local input directories
outputDir = "output/"

if (os.path.exists(outputDir)) is False:
    os.makedirs(outputDir)
if (os.path.isdir(outputDir)) is False:
    raise NotADirectoryError("{} is not a directory".format(outputDir))

In [5]:
# Graph output Options
class GraphOutputOption(enum.Enum):
    none = 'none'                     # Do not generate any plots
    inline = 'inline'                 # Generate inline plots only
    htmlFile = 'htmlFile'             # Generate plots in external HTML files
    both = 'both'                     # Generate plots both inline and in external html files
    
class GraphInteractionOption(enum.Enum):
    static = 'static'                 # Generate static inline plots (as images)
    interactive = 'interactive'       # Generate interactive inline plots

notebookGraphingInteraction = GraphInteractionOption('static')
notebookGraphingOutputs = GraphOutputOption('both')

staticImageSize = {'width':1000, 'height':500, 'scale':1}

GraphAutoOpenHTML = False              # Auto open external HTML files [True/False]

In [6]:
# Pandas output options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)

### Common functions

## Data Import

In [7]:
dirlist = os.listdir(inputDir)
print("Files from directory ", inputDir)
for file in dirlist:
  print("  * ",file)


Files from directory  input/
  *  .ipynb_checkpoints
  *  Data process USP v1_1.xlsx
  *  Data process USP v1_2.xlsx
  *  ExportArseneFiltreFinal20190301 v1_2.csv
  *  ExportArseneFiltreFinal20190301 v1_2_1.csv
  *  ExportArseneFiltreFinal20190301.xlsx


### CSV to pandas DF

In [8]:
# Import CSV files into a pandas dataframe
uspExcelExportFileName = 'ExportArseneFiltreFinal20190301 v1_2_1.csv'
uspExcelExportFilePath = inputDir + uspExcelExportFileName
rawUspDf = pd.read_csv(uspExcelExportFilePath, sep=';')

rawUspDf.head()

Unnamed: 0,SK_LOT,SK_TRACE,OF,BARRE,PLATEAU,LOT,ALLIAGE_INTERNE,ETAT,EPAISSEUR_FINALE,SOUSFAMILLE,FOUR_COULEE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_DATE_POTENCE,...,TRAC_SENS_TRACTION,TRAC_TXPREML,TRAC_TXPREMT,TRAC_TXDERNL,TRAC_TXDERNT,TRAC_PLNTRANS,TRAC_PLNLONG,TRAC_PLNTRANSTETE,TRAC_PLNTRANSMILIEU,TRAC_PLNTRANSPIED,TRAC_EPMINL,TRAC_EPMINT,TRAC_EPMAXL,TRAC_EPMAXT,REVENU_MACHINE,REVENU_DATE_DEBUT,REVENU_NOCHARGE,LABO_RM,LABO_RP02,LABO_ALGMT,RASPTAB_SK_TEMPS,RASPTAB_ECHANTILLON,RASPTAB_TIL_VAL,RASPTAB_TIT_VAL,RASPTAB_VAL_MAX_TIL_TIT
0,1208407,141800,6278466,5313129152,2494272415,U4272401,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,...,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,1.9,20.52,0,20.63,0,FAIR25,20160226020754,20165102.0,576.2,514.3,13.4,20160308,U4272401-J,508,78,1100
1,1208408,141800,6278466,5313129152,2494272415,U4272402,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,...,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.53,0,20.6,0,FAIR25,20160226020754,20165102.0,577.6,515.7,12.9,20160308,U4272402-J,589,159,1100
2,1208409,141800,6278466,5313129152,2494272415,U4272403,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,...,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.51,0,20.59,0,FAIR25,20160226020754,20165102.0,576.0,513.7,12.9,20160308,U4272403-J,597,104,1100
3,1208410,141800,6278466,5313129152,2494272415,U4272404,7213,T651,20.5,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F13,5,296.3,17.02.2016 04:03,...,L,0.0,0,0.0,0,0.0,0.0,0.7,0.7,0.7,20.5,0,20.61,0,FAIR25,20160226020754,20165102.0,579.1,516.6,13.9,20160308,U4272404-J,630,114,1100
4,1206475,141800,6278465,5313129122,2494272715,U4272701,7213,T651,22.23,31,HG_TP09,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,F47,6,307.1,14.02.2016 14:32,...,L,0.0,0,0.0,0,0.0,0.0,0.7,0.6,0.5,22.2,0,22.39,0,FAIR25,20160226020754,20165102.0,573.8,512.1,12.8,20160309,U4272701-J,954,79,850


### Count the null values in the Data Table

In [9]:
verboseCell = 1
# Check if we have any empty field in the data
if len(np.where(pd.isnull(rawUspDf))[0]) != 0:
    if verboseCell >= 0:
      print('WARNING!!: Dataframe has null (NaN or empty) cells')
    for column in rawUspDf:
        listofNullCells = np.where(pd.isnull(rawUspDf[column]))
        lenNullCells = len(listofNullCells[0])
        if lenNullCells != 0:
          if verboseCell >= 1:
            print("  * Column '{}' has {} null cells".format(column, lenNullCells))
          if verboseCell >= 2:  
            print("      {}".format(listofNullCells))
else:
    print("All good: no empty cells in Dataframe")

  * Column 'TR_TPS_POTENCE_PREMIERE_PASSE' has 368 null cells
  * Column 'TREMPE_RECETTE' has 37 null cells
  * Column 'REVENU_NOCHARGE' has 1918 null cells


In [10]:
# dropna drops missing values
if len(np.where(pd.isnull(rawUspDf))[0]) != 0:
  len_with_na = len(rawUspDf)
  cleanedUspDf = rawUspDf.dropna(axis=0)
  len_without_na = len(cleanedUspDf)
  print("Remove NA-Values from Table")
  print("  * {} Rows found".format(len_with_na))
  print("  * {} Rows with NA-Values".format(len_with_na-len_without_na))
  print("  * {} Rows without NA-Values available".format(len_without_na))
else:
  print("No duplicates found")
  
  

Remove NA-Values from Table
  * 7243 Rows found
  * 2237 Rows with NA-Values
  * 5006 Rows without NA-Values available


### Clean Duplicates

In [11]:
if (len(cleanedUspDf[cleanedUspDf.duplicated()]) > 0):
  print("Duplicates found within the Table")
  len_with_duplicates = len(cleanedUspDf)
  cleanedUspDf = cleanedUspDf.drop_duplicates()
  len_without_duplicates = len(cleanedUspDf)
  print("Remove Duplicates from Table".format(len_with_duplicates))
  print("  * {} Elements found".format(len_with_duplicates))
  print("  * {} Duplicates".format(len_with_duplicates - len_without_duplicates))
  print("  * {} Unique Elements available ".format(len_without_duplicates))
else:
  print("No Duplicates found")

No Duplicates found


### Convert datetime

In [12]:
cleanedUspDf['REVENU_DATE_DEBUT'] =  pd.to_datetime(cleanedUspDf['REVENU_DATE_DEBUT'], format='%Y%m%d%H%M%S')
cleanedUspDf['RASPTAB_SK_TEMPS'] =  pd.to_datetime(cleanedUspDf['RASPTAB_SK_TEMPS'], format='%Y%m%d')

In [22]:
cleanedUspDf.dtypes

SK_LOT                                 int64
SK_TRACE                               int64
OF                                     int64
BARRE                                  int64
PLATEAU                                int64
LOT                                   object
ALLIAGE_INTERNE                        int64
ETAT                                  object
EPAISSEUR_FINALE                     float64
SOUSFAMILLE                            int64
FOUR_COULEE                           object
COMPO_SI                             float64
COMPO_FE                             float64
COMPO_CU                             float64
COMPO_MN                             float64
COMPO_MG                             float64
COMPO_CR                             float64
COMPO_NI                             float64
COMPO_ZN                             float64
COMPO_TI                             float64
COMPO_ZR                             float64
PRECHAUFFAGE_NUMERO_FOUR              object
PRECHAUFFA

### Encoding Categorical Features

In [14]:
#from sklearn.preprocessing import OneHotEncoder
#cleanedUspDf.columns
ObjectsUspDf = cleanedUspDf.loc[:, cleanedUspDf.dtypes == object]
NonObjectsUspDf = cleanedUspDf.loc[:, cleanedUspDf.dtypes != object]
#print(ObjectsUspDf.columns)

cols_to_transform = ObjectsUspDf.columns
df_with_dummies = pd.get_dummies(cleanedUspDf, columns = cols_to_transform )

#cols_to_retain = NonObjectsUspDf.columns
#cat_dict = cleanedUspDf[ cols_to_retain ].to_dict( orient = 'records' )

#cols_to_drop = None
#cat_dict = cleanedUspDf.drop( cols_to_drop, axis = 1 ).to_dict( orient = 'records' )
df_with_dummies.head(10)

Unnamed: 0,SK_LOT,SK_TRACE,OF,BARRE,PLATEAU,ALLIAGE_INTERNE,EPAISSEUR_FINALE,SOUSFAMILLE,COMPO_SI,COMPO_FE,COMPO_CU,COMPO_MN,COMPO_MG,COMPO_CR,COMPO_NI,COMPO_ZN,COMPO_TI,COMPO_ZR,PRECHAUFFAGE_POSITION_FOUR,PRECHAUFFAGE_T_C_AIR_MAXI,TR_T_C_POTENCE,TR_EPAISSEUR_POTENCE,TR_NOMBRE_EBAUCHE,TR_NOMBRE_PASSE,TR_EPAISSEUR_DERNIERE_PASSE,...,RASPTAB_ECHANTILLON_Z4014903-J,RASPTAB_ECHANTILLON_Z4015101-J,RASPTAB_ECHANTILLON_Z4015108-J,RASPTAB_ECHANTILLON_Z4022401-J,RASPTAB_ECHANTILLON_Z4022402-J,RASPTAB_ECHANTILLON_Z4022501-J,RASPTAB_ECHANTILLON_Z4022502-J,RASPTAB_ECHANTILLON_Z4022701-J,RASPTAB_ECHANTILLON_Z4022702-J,RASPTAB_ECHANTILLON_Z4022703-J,RASPTAB_ECHANTILLON_Z4022704-J,RASPTAB_ECHANTILLON_Z4022801-J,RASPTAB_ECHANTILLON_Z4022802-J,RASPTAB_ECHANTILLON_Z4022803-J,RASPTAB_ECHANTILLON_Z4022804-J,RASPTAB_ECHANTILLON_Z4023001-J,RASPTAB_ECHANTILLON_Z4023002-J,RASPTAB_ECHANTILLON_Z4036601-J,RASPTAB_ECHANTILLON_Z4036604-J,RASPTAB_ECHANTILLON_Z4036801-J,RASPTAB_ECHANTILLON_Z4036802-J,RASPTAB_ECHANTILLON_Z4042901-J,RASPTAB_ECHANTILLON_Z4042902-J,RASPTAB_ECHANTILLON_Z4043101-J,RASPTAB_ECHANTILLON_Z4043102-J
0,1208407,141800,6278466,5313129152,2494272415,7213,20.5,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,5,296.3,395,355.0,1,20,21.42,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1208408,141800,6278466,5313129152,2494272415,7213,20.5,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,5,296.3,395,355.0,1,20,21.42,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1208409,141800,6278466,5313129152,2494272415,7213,20.5,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,5,296.3,395,355.0,1,20,21.42,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1208410,141800,6278466,5313129152,2494272415,7213,20.5,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,5,296.3,395,355.0,1,20,21.42,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1206475,141800,6278465,5313129122,2494272715,7213,22.23,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,6,307.1,387,356.0,1,20,23.21,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1206476,141800,6278465,5313129122,2494272715,7213,22.23,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,6,307.1,387,356.0,1,20,23.21,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1206477,141800,6278465,5313129122,2494272715,7213,22.23,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,6,307.1,387,356.0,1,20,23.21,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1206478,141800,6278465,5313129122,2494272715,7213,22.23,31,0.09,0.17,1.54,0.048,2.29,0.18,0.005,5.9,0.026,0.023,6,307.1,387,356.0,1,20,23.21,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,650781,139775,6277674,5296229142,2494274115,7215,20.0,31,0.1,0.19,1.51,0.06,2.52,0.19,0.006,5.79,0.028,0.018,1,355.3,396,355.0,2,24,20.82,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,650782,139775,6277674,5296229142,2494274115,7215,20.0,31,0.1,0.19,1.51,0.06,2.52,0.19,0.006,5.79,0.028,0.018,1,355.3,396,355.0,2,24,20.82,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Prepare for Machine learning

### Select Prediction Model

In [15]:
y = cleanedUspDf.RASPTAB_TIL_VAL

### Choosing Features

In [16]:
cleanedUspDf.columns

Index(['SK_LOT', 'SK_TRACE', 'OF', 'BARRE', 'PLATEAU', 'LOT', 'ALLIAGE_INTERNE', 'ETAT',
       'EPAISSEUR_FINALE', 'SOUSFAMILLE', 'FOUR_COULEE', 'COMPO_SI', 'COMPO_FE', 'COMPO_CU',
       'COMPO_MN', 'COMPO_MG', 'COMPO_CR', 'COMPO_NI', 'COMPO_ZN', 'COMPO_TI', 'COMPO_ZR',
       'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'PRECHAUFFAGE_T_C_AIR_MAXI',
       'TR_DATE_POTENCE', 'TR_T_C_POTENCE', 'TR_EPAISSEUR_POTENCE', 'TR_NOMBRE_EBAUCHE',
       'TR_NOMBRE_PASSE', 'TR_EPAISSEUR_DERNIERE_PASSE', 'TR_VITESSE_DERNIERE_PASSE',
       'TR_T_C_DERNIERE_PASSE', 'TR_VITESSE_ENGAGEMENT_DERNIERE_PASSE',
       'TR_TPS_POTENCE_PREMIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_DERNIERE_PASSE',
       'TR_REDUCTION_EPAISSEUR_AVANT_DERNIERE_PASSE',
       'TR_REDUCTION_EPAISSEUR_AVANT_AVANT_DERNIERE_PASSE', 'TR_REDUCTION_EPAISSEUR_MOYENNE',
       'BARRE_LONGUEUR', 'BARRE_LARGEUR', 'BARRE_EPAISSEUR', 'TREMPE_MACHINE', 'TREMPE_SK_TEMPS',
       'TREMPE_DATE_DEBUT', 'TREMPE_DATE_FIN', 'TREMPE_RECE

In [17]:
usp_features = ['ALLIAGE_INTERNE', 'FOUR_COULEE', 'PRECHAUFFAGE_NUMERO_FOUR', 'PRECHAUFFAGE_POSITION_FOUR', 'TREMPE_MACHINE', 'TREMPE_RECETTE', 'REVENU_MACHINE']

# By convention this is called X
X = cleanedUspDf[usp_features]
X.head(1)

Unnamed: 0,ALLIAGE_INTERNE,FOUR_COULEE,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,TREMPE_MACHINE,TREMPE_RECETTE,REVENU_MACHINE
0,7213,HG_TP09,F13,5,FTRE07,R7511,FAIR25


In [18]:
X.describe()

Unnamed: 0,ALLIAGE_INTERNE,PRECHAUFFAGE_POSITION_FOUR
count,5006.0,5006.0
mean,7214.322014,4.432481
std,0.901205,2.46375
min,7213.0,1.0
25%,7213.0,2.0
50%,7215.0,4.0
75%,7215.0,6.0
max,7215.0,10.0


In [19]:
X.head(1)

Unnamed: 0,ALLIAGE_INTERNE,FOUR_COULEE,PRECHAUFFAGE_NUMERO_FOUR,PRECHAUFFAGE_POSITION_FOUR,TREMPE_MACHINE,TREMPE_RECETTE,REVENU_MACHINE
0,7213,HG_TP09,F13,5,FTRE07,R7511,FAIR25


In [20]:
X.describe()

Unnamed: 0,ALLIAGE_INTERNE,PRECHAUFFAGE_POSITION_FOUR
count,5006.0,5006.0
mean,7214.322014,4.432481
std,0.901205,2.46375
min,7213.0,1.0
25%,7213.0,2.0
50%,7215.0,4.0
75%,7215.0,6.0
max,7215.0,10.0


## Build the model

In [21]:
# Define model. Specify a number for random_state to ensure same results each run
usp_model = DecisionTreeRegressor(random_state=1)

# Fit model
usp_model.fit(X, y)

NameError: name 'DecisionTreeRegressor' is not defined

## Model Validation