# Base methods to deal with the test cases

In [65]:
import pandas as pd
from pandas import DataFrame
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Base methods to deal with preprocessing data
- **fill_gaps** : *Method that deals with missing data from a dataframe*
- **normalize** : *Method that normalize data rescaling numerical values to a predefined range*
- **pca** : *Method that applies pca in a dataframe*

In [69]:
def fill_gaps(df, empty_token = '?'):
    imputer = SimpleImputer(missing_values = empty_token, strategy = 'most_frequent')
    result = imputer.fit_transform(df)
    
    result_df = DataFrame(result)
    result_df.columns = df.columns
    result_df.index = df.index
    return result_df

def normalize(df, columns, range=(0,1)):
    scaler = MinMaxScaler(feature_range=range)
    for col in columns:
        scaled_values = scaler.fit_transform(df[[col]].values.astype(float))
        df[col] = scaled_values

def pca(df, components = 70):
    pca = PCA(n_components = components)
    pca_values = pca.fit_transform(df)
    component_cols = []
    for i in range(0, components):
        component_cols.append('component_%s'%i)
    return DataFrame(data = pca_values, columns = component_cols)

def cost(fp, fn):
    return (fp * 2) + fn

# Load the train and test datasets

In [56]:
train_df = pd.read_csv("aps_training_set_sample3.csv")
test_df = pd.read_csv("aps_failure_test_set.csv")

# Balancing classes in train dataset by resampling

In [63]:
# we need to balance the classes 
positive_class = train_df[train_df['class'] == 'pos']
qtd_pos = positive_class.shape[0]
negative_class = train_df[train_df['class'] == 'neg']
qtd_neg = negative_class.shape[0]

display("Positive samples: %s - Negative samples: %s" %(qtd_pos, qtd_neg))
new_samples = min(qtd_neg, qtd_pos)
display("Using %s samples per class" % (new_samples))


train_df = pd.concat([positive_class[:new_samples], negative_class[:new_samples]])
display(train_df['class'].value_counts())
train_df.shape

'Positive samples: 1000 - Negative samples: 1000'

'Using 1000 samples per class'

pos    1000
neg    1000
Name: class, dtype: int64

(2000, 171)

# Test Case 1
- fill_gaps
- Using DecisionTreeClassifier from sklearn

In [64]:
pre_train_df = fill_gaps(train_df)
pre_test_df = fill_gaps(test_df)

x_train = pre_train_df.iloc[:, 1:].values
y_train = pre_train_df['class'].values

x_test = pre_test_df.iloc[:, 1:].values
y_test = pre_test_df['class'].values

t1_classifier = DecisionTreeClassifier()
t1_classifier.fit(x_train, y_train)

y_pred = t1_classifier.predict(x_test)
t1_accuracy = accuracy_score(y_test, y_pred)
t1_accuracy

0.9255

In [70]:
tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
display("Cost: %s" % (cost(fp, fn)))

'Cost: 1212'