# Import Libraries

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import Imputer,LabelEncoder,StandardScaler
from sklearn.svm import NuSVC


%matplotlib inline 

# Load the training data into a pandas DataFrame

In [2]:
# Read the cleaned training set; the next cell treats the last column as the
# target and every other column as a feature.
training_dataframe = pd.read_csv("./data/cleaned_training_data.csv")

In [3]:
# `.values` returns the frame as a NumPy array and is available on every
# pandas version; `as_matrix()` was deprecated in pandas 0.23 and removed in 1.0.
training_data_matrix = training_dataframe.values
# Every column except the last one is a feature.
feature_names = training_dataframe.columns[:-1]


# Split the matrix column-wise: features on the left, target in the last column.
X_train = training_data_matrix[:,:-1]
y_train = training_data_matrix[:,-1]

In [4]:
# Number of feature columns; reading it from the shape also works when the
# matrix has zero rows, where indexing the first row would raise IndexError.
X_train.shape[1]

219

# Define the transformer functions

In [5]:
def impute_missing_data(X):
    """Replace NaN entries of ``X`` with the mean of their column.

    Parameters
    ----------
    X : array-like
        Feature matrix that may contain ``np.nan`` values.

    Returns
    -------
    tuple
        ``(X_imputed, imputer)``: the filled matrix and the fitted ``Imputer``.
    """
    # NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed
    # in 0.22 in favour of `sklearn.impute.SimpleImputer`.
    # axis=0 computes the replacement value per column.
    mean_imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=1)
    filled = mean_imputer.fit(X).transform(X)
    return filled, mean_imputer

def encode_decision(y):
    """Convert the target labels in ``y`` to integer class codes.

    Parameters
    ----------
    y : array-like
        Target labels.

    Returns
    -------
    tuple
        ``(y_encoded, encoder)``: the encoded labels and the fitted
        ``LabelEncoder``.
    """
    label_encoder = LabelEncoder()
    # fit() returns the encoder itself, so fitting and transforming can be chained.
    codes = label_encoder.fit(y).transform(y)
    return codes, label_encoder

def standardize_data(X):
    """Standardize the columns of ``X`` with a ``StandardScaler``.

    Parameters
    ----------
    X : array-like
        Feature matrix to scale.

    Returns
    -------
    tuple
        ``(X_standardized, scaler)``: the scaled matrix and the fitted
        ``StandardScaler``.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler

def remove_outliers(X,y):
    """Drop the rows that ``LocalOutlierFactor`` flags as outliers.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix.
    y : numpy.ndarray
        Targets, indexed in step with the rows of ``X``.

    Returns
    -------
    tuple
        ``(X_inlier, y_inlier)``: only the rows labelled as inliers.
    """
    print("Number of records before removing outliers ",X.shape[0])
    detector = LocalOutlierFactor(n_neighbors=20, n_jobs=-1)
    # fit_predict labels inliers with 1 and outliers with -1.
    keep_mask = detector.fit_predict(X) == 1
    X_kept, y_kept = X[keep_mask], y[keep_mask]
    print("Number of records after removing outliers ",X_kept.shape[0])
    return X_kept, y_kept

def create_orthogonal_matrix(X, n_components=60):
    """Project ``X`` onto its leading principal components.

    Parameters
    ----------
    X : array-like
        Feature matrix to decompose.
    n_components : int, optional
        Number of principal components to keep. Defaults to 60, the value
        that was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(X_orthogonal, pca)``: the projected matrix and the fitted ``PCA``.
    """
    # random_state is fixed so that a randomized SVD solver stays reproducible.
    pca = PCA(n_components=n_components,random_state=0)
    X_orthogonal = pca.fit_transform(X)
    print("Number of attributes before PCA ",X.shape[1])
    print("Number of attributes after PCA ",X_orthogonal.shape[1])
    return X_orthogonal,pca

def select_features(X,y):
    """Keep the columns of ``X`` that a random forest ranks as important.

    A 1000-tree ``RandomForestClassifier`` is wrapped in ``SelectFromModel``
    with ``threshold='mean'``, so a column is kept when its importance is at
    least the mean importance.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Targets used to fit the forest.

    Returns
    -------
    tuple
        ``(X_important, sfm)``: the reduced matrix and the fitted selector.
    """
    forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
    selector = SelectFromModel(forest, threshold='mean').fit(X, y)
    X_selected = selector.transform(X)
    print(X_selected.shape[1]," attributes are selected out of ",X.shape[1]," attributes by RF")
    return X_selected, selector

def over_sample_smote(X,y):
    """Oversample ``(X, y)`` with SMOTE.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``: the original rows plus the synthetic
        samples generated by SMOTE.
    """
    smote = SMOTE(random_state=0,n_jobs=-1)
    # The separate fit()/sample() pair was removed in imbalanced-learn 0.4.
    # Prefer the single-call `fit_resample` and fall back to the older
    # `fit_sample` alias on releases that predate it.
    resample = getattr(smote, "fit_resample", None)
    if resample is None:
        resample = smote.fit_sample
    X_resampled, y_resampled = resample(X, y)
    print("Number of records before SMOTE sampling ",X.shape[0])
    print("Number of records after SMOTE sampling ",X_resampled.shape[0])
    return X_resampled,y_resampled

# Define the Random Forest Classifier

In [6]:
# Final classifier: 1000 trees, fixed seed for reproducibility, all CPU cores.
clf_RF = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
)

# Apply all the transformations one by one

In [7]:
# Start from fresh copies so re-running this cell always begins at X_train / y_train.
X = np.array(X_train)
y = np.array(y_train)
# 1. Fill missing values with column means.
X,imputer = impute_missing_data(X)
# 2. Oversample with SMOTE. This runs before label encoding and before scaling,
#    so SMOTE sees the raw labels and the unscaled features.
#    NOTE(review): the scaler, PCA and selector below are therefore fitted on
#    data that includes synthetic rows - confirm this ordering is intended.
X,y = over_sample_smote(X,y)
# 3. Encode the target labels as integers.
y,encoder = encode_decision(y)
# 4. Standardize the features.
X,scaler = standardize_data(X)
# 5. Drop the rows flagged by LocalOutlierFactor (X and y are filtered together).
X,y = remove_outliers(X,y)
# 6. Project onto principal components.
X,pca = create_orthogonal_matrix(X)
# 7. Keep only the components the random forest ranks as important.
X,sfm = select_features(X,y)

Number of records before SMOTE sampling  3703
Number of records after SMOTE sampling  6480
Number of records before removing outliers  6480
Number of records after removing outliers  5832
Number of attributes before PCA  219
Number of attributes after PCA  60
20  attributes are selected out of  60  attributes by RF


# Fit the Model

In [8]:
# Train the forest on the fully transformed matrix; the fitted estimator is the
# cell's last expression, so its repr is displayed as the output.
clf_RF.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Save the Imputer, LabelEncoder, StandardScaler, PCA, SFM and the RandomForest model

In [9]:
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (a no-op when it is already there).
os.makedirs('./data/model', exist_ok=True)

# Persist every fitted transformer so the same preprocessing can be replayed later.
joblib.dump(imputer,'./data/model/imputer.pkl')
joblib.dump(encoder,'./data/model/encoder.pkl')
joblib.dump(scaler,'./data/model/scaler.pkl')
joblib.dump(pca,'./data/model/pca.pkl')
joblib.dump(sfm,'./data/model/sfm.pkl')
# The classifier is dumped last so the list of written files is shown as the cell output.
joblib.dump(clf_RF, './data/model/rf.pkl')

['./data/model/rf.pkl']