# Introduction

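This notebook covers the data import and preparation stage. It loads the raw dataset, checks column types and missing values, summarizes each column, derives the `ttf` column (the number of cycles remaining until each engine's last recorded cycle), drops the engine `id` column, shuffles the rows, and exports the cleaned dataset to CSV.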

# 1. Importing data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the semicolon-separated raw dataset; cells holding the literal
# '#EMPTY?' marker are parsed as missing values.
raw_csv_path = "../Data/dataset.csv"
df = pd.read_csv(raw_csv_path, sep=";", na_values=["#EMPTY?"])

# Preview the first five rows.
print(df.head())

   id  cycle  setting1  setting2  setting3      s1      s2       s3       s4  \
0   1      1   -0.0007   -0.0004     100.0  518.67  641.82  1589.70  1400.60   
1   1      2    0.0019   -0.0003     100.0  518.67  642.15  1591.82  1403.14   
2   1      3   -0.0043    0.0003     100.0  518.67  642.35  1587.99  1404.20   
3   1      4    0.0007    0.0000     100.0  518.67  642.35  1582.79  1401.87   
4   1      5   -0.0019   -0.0002     100.0  518.67  642.37  1582.85  1406.22   

      s5  ...     s12      s13      s14     s15   s16  s17   s18    s19  \
0  14.62  ...  521.66  2388.02  8138.62  8.4195  0.03  392  2388  100.0   
1  14.62  ...  522.28  2388.07  8131.49  8.4318  0.03  392  2388  100.0   
2  14.62  ...  522.42  2388.03  8133.23  8.4178  0.03  390  2388  100.0   
3  14.62  ...  522.86  2388.08  8133.83  8.3682  0.03  392  2388  100.0   
4  14.62  ...  522.19  2388.04  8133.80  8.4294  0.03  393  2388  100.0   

     s20      s21  
0  39.06  23.4190  
1  39.00  23.4236  
2  38.95

In [3]:
# Count the distinct values in the `id` column and report them under a label.
engine_id_count = len(df["id"].unique())
print("Number of unique engine IDs:", engine_id_count, sep="\n")

Number of unique engine IDs:
100


# 2. Data Cleaning

## 2.A. Type Checking

In [4]:
# Show the dtype pandas inferred for every column.
column_dtypes = df.dtypes
column_dtypes

id            int64
cycle         int64
setting1    float64
setting2    float64
setting3    float64
s1          float64
s2          float64
s3          float64
s4          float64
s5          float64
s6          float64
s7          float64
s8          float64
s9          float64
s10         float64
s11         float64
s12         float64
s13         float64
s14         float64
s15         float64
s16         float64
s17           int64
s18           int64
s19         float64
s20         float64
s21         float64
dtype: object

## 2.B. Null Check

In [5]:
# Per-column count of missing values (`isna` is the same check as `isnull`).
df.isna().sum()

id          0
cycle       0
setting1    0
setting2    0
setting3    0
s1          0
s2          0
s3          0
s4          0
s5          0
s6          0
s7          0
s8          0
s9          0
s10         0
s11         0
s12         0
s13         0
s14         0
s15         0
s16         0
s17         0
s18         0
s19         0
s20         0
s21         0
dtype: int64

## 2.C. Data Description

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
summary_stats = df.describe()
summary_stats

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,6.537152e-11,0.500053,6.13115,9.000605,3.3947e-12,...,0.737553,0.071919,19.076176,0.037505,1.556432e-14,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


# 3. Data Preparation

## 3.A. TTF Calculation
TO DO: EXPLAIN IMPORTANCE OF TTF

In [7]:
# For every row, look up the highest cycle number recorded for that engine id,
# then subtract the row's own cycle: the result is the number of cycles left
# before that engine's last recorded cycle.
last_cycle_per_engine = df.groupby("id")["cycle"].transform("max")
df["ttf"] = last_cycle_per_engine - df["cycle"]

# Preview the first five rows, now including the new `ttf` column.
print(df.head())

   id  cycle  setting1  setting2  setting3      s1      s2       s3       s4  \
0   1      1   -0.0007   -0.0004     100.0  518.67  641.82  1589.70  1400.60   
1   1      2    0.0019   -0.0003     100.0  518.67  642.15  1591.82  1403.14   
2   1      3   -0.0043    0.0003     100.0  518.67  642.35  1587.99  1404.20   
3   1      4    0.0007    0.0000     100.0  518.67  642.35  1582.79  1401.87   
4   1      5   -0.0019   -0.0002     100.0  518.67  642.37  1582.85  1406.22   

      s5  ...      s13      s14     s15   s16  s17   s18    s19    s20  \
0  14.62  ...  2388.02  8138.62  8.4195  0.03  392  2388  100.0  39.06   
1  14.62  ...  2388.07  8131.49  8.4318  0.03  392  2388  100.0  39.00   
2  14.62  ...  2388.03  8133.23  8.4178  0.03  390  2388  100.0  38.95   
3  14.62  ...  2388.08  8133.83  8.3682  0.03  392  2388  100.0  38.88   
4  14.62  ...  2388.04  8133.80  8.4294  0.03  393  2388  100.0  38.90   

       s21  ttf  
0  23.4190  191  
1  23.4236  190  
2  23.3442  189  
3 

## 3.B. ID Column Deletion
TO DO: EXPLAIN WHY

In [8]:
# Remove the engine `id` column from the frame.
# The column is named by keyword: passing `axis` positionally, as in
# `df.drop("id", 1)`, is deprecated and is rejected by pandas 2.x.
df = df.drop(columns="id")
df.head(5)

  df = df.drop("id", 1)


Unnamed: 0,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,s6,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,ttf
0,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


## 3.C. Data Shuffling
TO DO: EXPLAIN WHY

In [9]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# A fixed seed makes the row order, and therefore the exported file,
# identical on every Restart & Run All.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
print(df.head(5))

       cycle  setting1  setting2  setting3      s1      s2       s3       s4  \
15313    155   -0.0013    0.0005     100.0  518.67  643.05  1588.40  1418.81   
2513     138    0.0032    0.0003     100.0  518.67  643.39  1604.19  1417.20   
1655      93    0.0012   -0.0002     100.0  518.67  642.64  1600.04  1412.99   
14497    160    0.0007    0.0004     100.0  518.67  643.04  1599.27  1412.75   
730       73    0.0001    0.0001     100.0  518.67  642.35  1588.43  1409.32   

          s5     s6  ...      s13      s14     s15   s16  s17   s18    s19  \
15313  14.62  21.61  ...  2388.20  8130.89  8.4631  0.03  394  2388  100.0   
2513   14.62  21.61  ...  2388.15  8129.88  8.4864  0.03  393  2388  100.0   
1655   14.62  21.61  ...  2388.16  8122.66  8.4484  0.03  394  2388  100.0   
14497  14.62  21.61  ...  2388.18  8120.11  8.4430  0.03  394  2388  100.0   
730    14.62  21.61  ...  2388.08  8137.42  8.4666  0.03  394  2388  100.0   

         s20      s21  ttf  
15313  38.74  23.1329

# 4. Exporting Data

In [10]:
# Write the prepared frame to disk; the shuffled row index is not saved.
clean_csv_path = "../Data/dataset_clean.csv"
df.to_csv(clean_csv_path, index=False)