In [1]:
'''
This project uses data provided by ExxonMobil to create a machine learning model to determine if an oil well
is working based on sensor data.
'''

# Import all libraries and configure pandas display options
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn as sk

# Render floats with two decimals and cap DataFrame previews at ten rows
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", 10)

In [2]:
# Load the test set from disk and preview its first three rows
fail_test = pd.read_csv("equip_failures_test_set.csv")
fail_test.head(n=3)

Unnamed: 0,id,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,sensor7_histogram_bin2,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1,66888,na,2130706438,332,0,0,0,0,0,...,544762,504820,1597028,631494,5644,5448,11096,1982,0,0
1,2,91122,na,na,na,0,0,0,0,0,...,696774,345742,939332,943744,504048,203698,287374,36566,0,0
2,3,218924,na,na,na,na,na,0,280,119070,...,1032974,866000,1645644,1154924,3549128,1550716,15900,0,na,na


In [3]:
# Load the training set from disk and preview its first three rows
fail_train = pd.read_csv("equip_failures_training_set.csv")
fail_train.head(n=3)

Unnamed: 0,id,target,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1,0,76698,na,2130706438,280,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,2,0,33058,na,0,na,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,3,0,41040,na,228,100,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0


In [4]:
# Training data: swap every literal 'na' entry for 0 (fail_train itself is left untouched)
clean_train = fail_train.replace(to_replace='na', value=0)
clean_train.head(n=3)

Unnamed: 0,id,target,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1,0,76698,0,2130706438,280,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,2,0,33058,0,0,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,3,0,41040,0,228,100,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0


In [5]:
# Test data: swap every literal 'na' entry for 0 (fail_test itself is left untouched)
clean_test = fail_test.replace(to_replace='na', value=0)
clean_test.head(n=3)

Unnamed: 0,id,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor6_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,sensor7_histogram_bin2,...,sensor105_histogram_bin2,sensor105_histogram_bin3,sensor105_histogram_bin4,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1,66888,0,2130706438,332,0,0,0,0,0,...,544762,504820,1597028,631494,5644,5448,11096,1982,0,0
1,2,91122,0,0,0,0,0,0,0,0,...,696774,345742,939332,943744,504048,203698,287374,36566,0,0
2,3,218924,0,0,0,0,0,0,280,119070,...,1032974,866000,1645644,1154924,3549128,1550716,15900,0,0,0


In [6]:
# Correlation filtering: removes redundant, highly correlated features

# Hyperparameter: pairwise correlation at or above this value triggers removal.
# TODO: loop through multiple thresholds later to determine the best value.
threshold = 0.78

def correlation(dataset, threshold, exclude=()):
    """Drop columns that are highly correlated with an earlier, retained column.

    Columns are scanned left to right. A column is removed when its pairwise
    correlation (as computed by ``dataset.corr()``) with any earlier column
    that has not itself been removed is ``>= threshold``. Only positive
    correlation is considered; strong negative correlation does not trigger
    removal.

    The removal happens IN PLACE: ``dataset`` itself loses the columns, and
    the same object is returned. Pass a copy if the original must survive.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter. Mutated in place.
    threshold : float
        Correlation value at or above which the later column is dropped.
    exclude : iterable of column names, optional
        Columns that take no part in the filtering: they are never dropped
        and never cause another column to be dropped. Use this to protect
        identifier and label columns. Defaults to no exclusions, which
        reproduces the previous behaviour exactly.

    Returns
    -------
    (pandas.DataFrame, list)
        The (mutated) ``dataset`` and the names of the removed columns, in
        column order.
    """
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    # One NumPy matrix is far cheaper to index than per-cell .iloc lookups.
    corr_values = corr_matrix.to_numpy()

    protected = set(exclude)
    col_corr = set()  # names already marked for removal
    name_del = []     # same names, in the order they were found

    for i in range(len(columns)):
        colname = columns[i]
        if colname in protected:
            continue
        for j in range(i):
            earlier = columns[j]
            # A removed (or protected) column must not knock out later ones.
            if earlier in col_corr or earlier in protected:
                continue
            # NaN correlations (e.g. constant columns) compare False here.
            if corr_values[i, j] >= threshold:
                col_corr.add(colname)
                name_del.append(colname)
                break  # one match is enough; further matches change nothing

    if name_del:
        # In-place on purpose: callers may rely on `dataset` being mutated.
        dataset.drop(columns=name_del, inplace=True)

    print('deleted ', len(name_del), 'columns')
    return dataset, name_del

# Work on a float64 copy so clean_train itself is not altered by the filtering
new = clean_train.astype('float64')
new_data, name_del = correlation(new, threshold)

print(new_data.shape)
print('Deleted columns\n', name_del)

deleted  66 columns
(60000, 106)
Deleted columns
 ['sensor6_measure', 'sensor7_histogram_bin2', 'sensor7_histogram_bin4', 'sensor7_histogram_bin7', 'sensor8_measure', 'sensor13_measure', 'sensor14_measure', 'sensor15_measure', 'sensor16_measure', 'sensor24_histogram_bin3', 'sensor24_histogram_bin6', 'sensor25_histogram_bin2', 'sensor25_histogram_bin5', 'sensor26_histogram_bin0', 'sensor26_histogram_bin1', 'sensor26_histogram_bin2', 'sensor26_histogram_bin3', 'sensor26_histogram_bin4', 'sensor26_histogram_bin5', 'sensor26_histogram_bin6', 'sensor27_measure', 'sensor32_measure', 'sensor33_measure', 'sensor34_measure', 'sensor35_measure', 'sensor37_measure', 'sensor39_measure', 'sensor40_measure', 'sensor41_measure', 'sensor42_measure', 'sensor43_measure', 'sensor45_measure', 'sensor46_measure', 'sensor47_measure', 'sensor48_measure', 'sensor49_measure', 'sensor52_measure', 'sensor53_measure', 'sensor56_measure', 'sensor59_measure', 'sensor61_measure', 'sensor64_histogram_bin1', 'sensor64

In [7]:
# Preview the first five rows of the filtered training frame
new_data.head(n=5)

Unnamed: 0,id,target,sensor1_measure,sensor2_measure,sensor3_measure,sensor4_measure,sensor5_measure,sensor7_histogram_bin0,sensor7_histogram_bin1,sensor7_histogram_bin3,...,sensor101_measure,sensor102_measure,sensor103_measure,sensor105_histogram_bin5,sensor105_histogram_bin6,sensor105_histogram_bin7,sensor105_histogram_bin8,sensor105_histogram_bin9,sensor106_measure,sensor107_measure
0,1.0,0.0,76698.0,0.0,2130706438.0,280.0,0.0,0.0,0.0,0.0,...,0.0,2801180.0,2445.8,469792.0,339156.0,157956.0,73224.0,0.0,0.0,0.0
1,2.0,0.0,33058.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3477820.0,2211.76,245416.0,133654.0,81140.0,97576.0,1500.0,0.0,0.0
2,3.0,0.0,41040.0,0.0,228.0,100.0,0.0,0.0,0.0,0.0,...,0.0,1040120.0,1018.64,409564.0,320746.0,158022.0,95128.0,514.0,0.0,0.0
3,4.0,0.0,12.0,0.0,70.0,66.0,0.0,0.0,0.0,318.0,...,0.0,0.0,1.08,44.0,10.0,0.0,0.0,0.0,4.0,32.0
4,5.0,0.0,60874.0,0.0,1368.0,458.0,0.0,0.0,0.0,0.0,...,0.0,21173050.0,1116.06,347188.0,286954.0,311560.0,433954.0,1218.0,0.0,0.0


In [8]:
# Split the filtered frame into the label vector (Y) and the predictor matrix (X)

Y = new_data["target"].to_numpy()

# Frame dimensions, kept both as a tuple and unpacked
s = new_data.shape
rowTot, colTot = s

# Full frame as a float64 array; the first two columns are skipped so that
# X keeps only the remaining (predictor) columns
X_uncut = new_data.to_numpy()
X_uncut2 = X_uncut.astype('float64')

X = X_uncut2[:, 2:]
print(X.shape)



(60000, 104)


In [9]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# One shape per line: train/test predictors, then train/test labels
print(xTrain.shape, xTest.shape, yTrain.shape, yTest.shape, sep="\n")

(48000, 104)
(12000, 104)
(48000,)
(12000,)


In [10]:
# Random forest model
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fit, and therefore the score below, is reproducible
# on Restart & Run All (same fixed seed as the train/test split).
model = RandomForestClassifier(n_estimators=70, random_state=0)
model.fit(xTrain, yTrain)

# Mean accuracy on the held-out split.
# NOTE(review): accuracy alone can look good on skewed labels — consider
# checking the class balance and adding precision/recall or F1.
score = model.score(xTest, yTest)
print(score)

0.9939166666666667


In [11]:
# Give the test set the same column layout the model was trained on.
# The columns to remove come from `name_del`, computed on the training data,
# so no correlation threshold is (re)defined here: the stale `threshold=0.80`
# reassignment was never used and disagreed with the value used for training.

final_test = clean_test.astype('float64')

# Drop every column that was filtered out of the training data, in one call
final_test = final_test.drop(columns=name_del)
print('Initial', final_test.shape)

# Skip the first column (the `id` column in the previews above) so that only
# predictor columns remain
final_test = final_test.iloc[:, 1:]
print(final_test.shape)

Initial (16001, 105)
(16001, 104)


In [12]:
# The model was fitted on a plain NumPy array, so predict on the underlying
# values too; passing the DataFrame would make scikit-learn warn about
# feature names it never saw during fit.
y_predicted = model.predict(final_test.to_numpy())
print(y_predicted)

# Predictions come back as floats; convert them to plain Python ints
y_new = y_predicted.astype(int).tolist()

[0. 0. 0. ... 0. 0. 0.]


In [13]:
# Write the predictions to results.csv, indexed by a 1-based `id` column

import csv

finLen = len(y_predicted)
arr = np.arange(1, finLen + 1)

pdData = pd.DataFrame(
    y_new,
    columns=['target'],
    index=pd.Index(arr, name='id'),
)

pdData.to_csv('results.csv')