In [1]:
import sys
# Record the interpreter version used to execute this notebook.
print("Python Version:", sys.version)

Python Version: 3.6.9 (default, Nov  7 2019, 10:44:02) 
[GCC 8.3.0]


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
import re
import os

## Import Data

In [6]:
import csv

# get path to processed data (resolved relative to the current working directory)
data_dir = os.path.abspath('../data/processed')

# Read the sensor list. Start from an empty list so `sensors` is always defined,
# even when the file has no rows, and skip blank lines (csv.reader yields [] for
# them) so a trailing newline cannot wipe out the real row. As before, when the
# file holds several non-empty rows the last one wins.
sensors = []
with open(os.path.join(data_dir, 'sensors.csv'), 'r', newline='') as csvfile:
    csv_sens = csv.reader(csvfile, delimiter=',')
    for row in csv_sens:
        if row:
            sensors = row

# Feature matrix and target table; the first CSV column is used as the index.
features_df = pd.read_csv(os.path.join(data_dir, 'features.csv'), index_col=0)
targets_df = pd.read_csv(os.path.join(data_dir, 'targets.csv'), index_col=0)

## Basic Models

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC

In [8]:
# number of important features kept per target
n_feats = 150

# results, keyed by target column name
results = {}

# Features (the same feature matrix is used for every target)
X = features_df

# Fit one random forest per target column, score it on a held-out split and
# summarise which sensors / time offsets carry the most feature importance.
for col in targets_df.columns.values:

    # single test result
    result = {}

    # set y's
    y = targets_df.loc[:, col]

    # stratified train/test split so class proportions match in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)

    # initialize and fit model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # predictions, aligned to the test index so they line up with the actuals
    y_test_pred = pd.Series(model.predict(X_test),
                            index=y_test.index,
                            name='pred_' + col)

    # store the predicted/actuals (copied once, below, with the whole result)
    result['actual'] = y_test
    result['predicted'] = y_test_pred

    # score
    result['recall'] = recall_score(y_test, y_test_pred, average='weighted')
    result['precision'] = precision_score(y_test, y_test_pred, average='weighted')

    # NOTE: the except clause covers the whole block, so an AttributeError raised
    # anywhere in it (not only a model lacking `feature_importances_`) is printed
    # and leaves this target without 'features' / 'sensors' entries.
    try:
        # feature importances, most important first, truncated to n_feats
        feat_imp = model.feature_importances_
        feats = X_train.columns
        indices = np.argsort(feat_imp)[::-1]
        indices = indices[0:n_feats]

        # get the features
        result['features'] = feats[indices]

        # get the features summary (in order of importance)
        # split sensor name from time; splitting once from the right keeps any
        # spaces inside the sensor name from producing extra columns
        feats_df = pd.Series(feats[indices]).str.rsplit(' ', n=1, expand=True)
        feats_df.columns = ['sensor', 'time']
        feats_df['time'] = feats_df['time'].astype(float)

        # summarize the times for each sensor (sort=False keeps importance order)
        feats_df = feats_df.groupby('sensor', sort=False).agg(
            ['mean', 'min', 'max', 'std', 'count'])

        # use a multi-index so all DFs can be joined into master sensor DF
        m_ind = pd.MultiIndex.from_product([[col], feats_df.index.values])
        feats_df.index = m_ind

        # put feature summary in results
        result['sensors'] = feats_df
    except AttributeError as attrib:
        print(attrib)

    # put result into results; a single deep copy keeps the stored entry
    # independent of the loop variables that stay bound after the loop
    results[col] = deepcopy(result)

    # progress tracking
    print(col)

cooler_eff
valve_perc
pump_leak
accu_prs
fault_id


### Most Important Sensors and Times

In [9]:
# reclaim some ram; popping from globals() instead of `del` keeps this cell
# re-runnable after the names are already gone (`model` is deliberately kept)
for _name in ('X_train', 'y_train', 'X_test', 'y_test'):
    globals().pop(_name, None)

# put results into dataframe: stack the per-target sensor summaries in one
# concat (DataFrame.append was removed in pandas 2.0). Targets whose result has
# no 'sensors' entry are skipped instead of raising KeyError.
sensor_frames = [res['sensors'] for res in results.values() if 'sensors' in res]

# pd.concat raises on an empty list, so fall back to an empty frame
master_sensor_df = pd.concat(sensor_frames) if sensor_frames else pd.DataFrame()

In [10]:
# Show the combined per-target sensor timing summary.
master_sensor_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,time,time,time,time
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,std,count
cooler_eff,PS6,31.557843,2.36,59.22,18.307851,51
cooler_eff,PS2,22.365714,11.65,54.89,13.890664,14
cooler_eff,PS5,31.166458,0.35,59.22,17.759973,48
cooler_eff,EPS1,16.053333,1.55,59.16,19.835953,18
cooler_eff,PS4,38.856667,35.66,42.59,3.496031,3
cooler_eff,PS1,21.3275,12.47,54.47,12.538076,12
cooler_eff,TS4,33.5,24.0,43.0,13.435029,2
cooler_eff,FS2,29.2,29.2,29.2,,1
cooler_eff,CP,8.0,8.0,8.0,,1
valve_perc,PS1,9.723261,9.45,10.19,0.176409,46


In [11]:
# One row per target with its weighted precision and recall.
# Collect the rows first and build the frame once: DataFrame.append was removed
# in pandas 2.0 and re-allocates the whole frame on every call.
score_rows = [
    {'fault': k, 'precision': res['precision'], 'recall': res['recall']}
    for k, res in results.items()
]

# Passing `columns` fixes the column order and still yields the three (empty)
# columns when there are no results.
master_score_df = pd.DataFrame(score_rows, columns=['fault', 'precision', 'recall'])

In [12]:
# Show the precision/recall table, one row per target.
master_score_df

Unnamed: 0,fault,precision,recall
0,cooler_eff,1.0,1.0
1,valve_perc,1.0,1.0
2,pump_leak,0.994581,0.99449
3,accu_prs,0.99461,0.99449
4,fault_id,0.99449,0.991736
