This kernel uses the [TPOT AutoML](https://epistasislab.github.io/tpot/) tool (open source and built on scikit-learn), which uses genetic programming to stochastically search for the pipeline with the best accuracy on the training data.
TPOT generates a Python file containing the recommended model; I then run that model code on the training data, predict the target for the test data, and export the submission file.


In [None]:
#TPOT AutoML tool
from tpot import TPOTClassifier

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np # linear algebra
np.set_printoptions(precision=2)

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame
#Parallelism
import joblib
from joblib import Parallel, delayed,parallel_backend


# Preprocessing, modelling and evaluating
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import MaxAbsScaler
from sklearn.impute import SimpleImputer
# `Imputer` is only available from sklearn.preprocessing on older scikit-learn
# releases. Guard the legacy import (same pattern as the try/except used in the
# modelling cell further down) so this cell does not abort on newer releases,
# and keep the `Imputer` name bound either way.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error, mean_absolute_error
from sklearn import svm

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

Credits:
*  [Titanic_Kaggle](https://github.com/EpistasisLab/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb)

*  [My paper using DNA Sequence Alignment](http://www.wseas.us/e-library/transactions/systems/2008/27-535.pdf)


In [None]:
%%time
# --- Training data -----------------------------------------------------------
# Both tables are indexed by TransactionID. The left join keeps every
# transaction row; rows with no matching identity record get NaN in the
# identity columns.
df_trans = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
df_id = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
df_train = df_trans.merge(
    df_id,
    how='left',
    left_index=True,
    right_index=True,
)

# --- Test data (same layout, same join) ---------------------------------------
df_test_trans = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')
df_test_id = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')
df_test = df_test_trans.merge(
    df_test_id,
    how='left',
    left_index=True,
    right_index=True,
)

# --- Submission template ------------------------------------------------------
sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')


Categorical columns

In [None]:
# Names of the columns that get label-encoded below. The list is assembled in
# three pieces but its content and order are fixed:
#   1. the identity columns id_12 .. id_38 (inclusive),
#   2. device / product / card-type / e-mail domain columns,
#   3. card, address and the remaining M-flag columns.
cat_cols = (
    ['id_{}'.format(n) for n in range(12, 39)]
    + [
        'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4',
        'P_emaildomain', 'R_emaildomain',
    ]
    + [
        'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2',
        'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
    ]
)

Encoding categorical features

In [None]:
%%time
# Replace every categorical column with integer codes. Each column gets its
# own LabelEncoder, fitted on the train and test values together so that both
# frames share one value -> code mapping.
for col in cat_cols:
    if col not in df_train.columns:
        # Column listed in cat_cols but not present in the training frame.
        continue

    # Cast to str first so missing values become an ordinary label too.
    train_labels = list(df_train[col].astype(str).values)
    test_labels = list(df_test[col].astype(str).values)

    le = preprocessing.LabelEncoder()
    le.fit(train_labels + test_labels)
    df_train[col] = le.transform(train_labels)
    df_test[col] = le.transform(test_labels)


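Splitting features and target (rows ordered by `TransactionDT`; note that no removal of highly correlated features is actually performed in this cell)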

In [None]:
%%time
# Order the training frame by TransactionDT once and derive both the feature
# matrix and the target from that single ordered copy. The original sorted the
# whole frame twice with the same key, which yields the same row order but
# repeats an expensive full-frame sort.
train_sorted = df_train.sort_values('TransactionDT')
X_train = train_sorted.drop(['isFraud', 'TransactionDT'], axis=1)
y_train = train_sorted['isFraud']
# Drop the temporary name so the sorted copy is not pinned by this cell.
del train_sorted

# The test frame has no target column; order it the same way and drop the
# sort key so its columns line up with X_train.
X_test = df_test.sort_values('TransactionDT').drop(['TransactionDT'], axis=1)

In [None]:
import gc

# X_train / y_train have been derived from df_train above, so the merged
# training frame is no longer needed: unbind it and trigger a collection pass
# to give the memory back before modelling.
del df_train
gc.collect()

In [None]:
%%time
#This call generates the Model code
#def callTPOTTrainer():
#    pipeline_optimizer = TPOTClassifier(generations=8, population_size=20, cv=3,crossover_rate=0.3, memory='auto',n_jobs=-1, early_stop=5,mutation_rate=0.7, 
#                                    random_state=42, verbosity=2, template='Selector-Transformer-Classifier')
#    pipeline_optimizer.fit(X_train.values, y_train)
#    print(pipeline_optimizer.score(X_val, y_val))
#    pipeline_optimizer.export('IEEE_Frauds_tpot_exported_pipeline_MDR_2ndround.py')
#Parallel(n_jobs=7)(delayed(callTPOTTrainer()))
#Parallel(n_jobs=7,prefer="threads")

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Fit the TPOT-exported pipeline on a training split, report its accuracy on a
# held-out split, then predict the competition test set and write the
# submission CSV.

# Hold out part of the labelled rows (train_test_split defaults) for scoring.
features = X_train
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, y_train, random_state=42)

# Learn the per-column medians from the training split ONLY and reuse them for
# every other frame. Previously the fitted imputer was never used and the
# held-out split and the test set were each filled with their *own* medians,
# so the pipeline saw differently pre-processed data at fit and predict time.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
imputer = Imputer(strategy="median", missing_values=np.nan)
imputer.fit(training_features)
train_medians = pd.Series(imputer.statistics_, index=training_features.columns)

# fillna keeps the frames as DataFrames with their original index/columns.
training_features = training_features.fillna(train_medians)
testing_features = testing_features.fillna(train_medians)

# Average CV score on the training set was:0.9721091430442194
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.1),
    Normalizer(norm="l1"),
    DecisionTreeClassifier(criterion="entropy", max_depth=10, min_samples_leaf=5, min_samples_split=3)
)

exported_pipeline.fit(training_features, training_target)

# Pipeline.score() delegates to the classifier's default score on the
# held-out split.
score_from_training = exported_pipeline.score(testing_features, testing_target)
print("Score= " + str(score_from_training))

# From here on `testing_features` is rebound to the competition test set,
# imputed with the same training medians as above.
testing_feature = X_test
testing_features = testing_feature.fillna(train_medians)
#------------------ Predict for Submission ---------------------------
# NOTE(review): predict() writes hard class labels. If the submission is
# scored with a ranking metric such as ROC AUC (roc_auc_score is imported at
# the top of this notebook), exported_pipeline.predict_proba(...)[:, 1] is
# probably what should be written instead - confirm before changing.
results = exported_pipeline.predict(testing_features)
dfIsFraud = pd.DataFrame(data={'TransactionID': X_test.index.values, 'isFraud': results})
dfIsFraud.to_csv('submission_TPOT_DecisionTreeClassifier_2ndround.csv', index=False)
