In [324]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(17,8)})
import sklearn
%matplotlib inline

#ignore warning
import warnings
warnings.filterwarnings('ignore')

#Adjust the display
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [88]:
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold , cross_val_score , cross_validate , GridSearchCV ,TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics        import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix , make_scorer , precision_recall_curve , roc_curve,roc_auc_score 
from sklearn.preprocessing   import LabelEncoder ,StandardScaler, MinMaxScaler , OrdinalEncoder , OneHotEncoder
import statistics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.pipeline import Pipeline as imb_Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [4]:
df_transaction = pd.read_csv('../ieee-fraud-detection/train_transaction.csv')

In [None]:
df_transaction = pd.read_csv('../ieee-fraud-detection/train_transaction.csv')

In [None]:
df_transaction_test = pd.read_csv('../ieee-fraud-detection/test_transaction.csv')

In [421]:
origin_x , test_x , origin_y , test_y = train_test_split(df_transaction.drop(columns=['isFraud']),df_transaction['isFraud'],train_size=0.8)

In [422]:
many_null_cols = [col for col in origin_x.columns if origin_x[col].isnull().sum() / origin_x.shape[0] > 0.3]
origin_x = origin_x.drop(columns=['TransactionID','TransactionDT']+many_null_cols)


In [423]:
# 이건 해줄수 밖에 없다
card1_top_10 = origin_x['card1'].value_counts().iloc[0:9].index
card2_top_10 = origin_x['card2'].value_counts().iloc[0:9].index
card3_top_5 = origin_x['card3'].value_counts().iloc[0:5].index
card5_top_5 = origin_x['card5'].value_counts().iloc[0:5].index

origin_x['card1'] = origin_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x).astype(str)
origin_x['card2'] = origin_x['card2'].apply(lambda x: 'rest' if x not in card2_top_10 else x).astype(str)
origin_x['card3'] = origin_x['card3'].apply(lambda x: 'rest' if x not in card3_top_5 else x).astype(str)
origin_x['card5'] = origin_x['card5'].apply(lambda x: 'rest' if x not in card5_top_5 else x).astype(str)

origin_x['card4'] = origin_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
origin_x['card6'] = origin_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)

domain_top_5 = origin_x['P_emaildomain'].value_counts()[0:5]
origin_x['P_emaildomain'] = origin_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)

address1_top10 = origin_x['addr1'].value_counts().iloc[0:9]
origin_x['addr1'] = origin_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x).astype(str)
address2_top10 = origin_x['addr2'].value_counts().iloc[0:9]
origin_x['addr2'] = origin_x['addr2'].apply(lambda x: 'rest' if x not in address2_top10 else x).astype(str)


m6_imputer = SimpleImputer(strategy='most_frequent').fit(origin_x[['M6']])
origin_x['M6'] = m6_imputer.fit_transform(origin_x[['M6']])

In [424]:
categorical_features = ['ProductCD','card1','card2','card3','card4','card5','card6','P_emaildomain','addr1','addr2','M6']

categorical_transformer = OneHotEncoder()


numeric_features = origin_x.columns.drop(categorical_features)

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
clf = imb_Pipeline(
    steps=[("preprocessor", preprocessor),("classifier", RandomForestClassifier(n_jobs=-1))]
)

In [417]:
from sklearn.model_selection import TimeSeriesSplit
#kfold = StratifiedKFold(n_splits=5,shuffle=True)
#kfold = TimeSeriesSplit(n_splits=5)
kfold = KFold(n_splits=5,shuffle=False)
cross_val_score(clf,origin_x,origin_y,cv=kfold,scoring='roc_auc',n_jobs=-1)

array([0.91209967, 0.91329865, 0.90792042, 0.9091546 , 0.91624206])

In [269]:
# 그렇게 차이가 나지도 않는다
from sklearn.model_selection import TimeSeriesSplit
kfold = StratifiedKFold(n_splits=5,shuffle=True)
cross_val_score(clf,origin_x,origin_y,cv=kfold,scoring='roc_auc',n_jobs=-1)

array([0.91277531, 0.90965579, 0.91002018, 0.91173567, 0.91322931])

In [425]:
test_x = test_x[origin_x.columns]
test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x).astype(str)
test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card2_top_10 else x).astype(str)
test_x['card3'] = test_x['card3'].apply(lambda x: 'rest' if x not in card3_top_5 else x).astype(str)
test_x['card5'] = test_x['card5'].apply(lambda x: 'rest' if x not in card5_top_5 else x).astype(str)

test_x['card4'] = test_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
test_x['card6'] = test_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)

test_x['P_emaildomain'] = test_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)

test_x['addr1'] = test_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x).astype(str)
test_x['addr2'] = test_x['addr2'].apply(lambda x: 'rest' if x not in address2_top10 else x).astype(str)

test_x['M6'] = m6_imputer.transform(test_x[['M6']])

In [426]:
clf.fit(origin_x,origin_y)
roc_auc_score(test_y,clf.predict_proba(test_x)[:, 1])

0.9139948908804654

In [336]:
origin_x['card1'].append(test_x['card1'],ignore_index=True)

0          rest
1          rest
2          rest
3          rest
4          rest
           ... 
1097226    rest
1097227    rest
1097228    rest
1097229    rest
1097230    rest
Name: card1, Length: 1097231, dtype: object

In [420]:
many_null_cols = [col for col in df_transaction.columns if df_transaction[col].isnull().sum() / df_transaction.shape[0] > 0.3]
origin_x = df_transaction.drop(columns=['isFraud','TransactionID'])
origin_x = origin_x.drop(columns=many_null_cols)
origin_y = df_transaction['isFraud']

test_x = df_transaction_test[origin_x.columns]

# categorical features engineering
card1_top_10 = origin_x['card1'].value_counts().iloc[0:20].index
card2_top_10 = origin_x['card2'].value_counts().iloc[0:20].index
card3_top_5 = origin_x['card3'].value_counts().iloc[0:10].index
card5_top_5 = origin_x['card5'].value_counts().iloc[0:10].index

origin_x['card1'] = origin_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x).astype(str)
origin_x['card2'] = origin_x['card2'].apply(lambda x: 'rest' if x not in card2_top_10 else x).astype(str)
origin_x['card3'] = origin_x['card3'].apply(lambda x: 'rest' if x not in card3_top_5 else x).astype(str)
origin_x['card5'] = origin_x['card5'].apply(lambda x: 'rest' if x not in card5_top_5 else x).astype(str)

origin_x['card4'] = origin_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
origin_x['card6'] = origin_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)

domain_top_5 = origin_x['P_emaildomain'].value_counts()[0:10]
origin_x['P_emaildomain'] = origin_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)

address1_top10 = origin_x['addr1'].value_counts().iloc[0:20]
origin_x['addr1'] = origin_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x).astype(str)
address2_top10 = origin_x['addr2'].value_counts().iloc[0:20]
origin_x['addr2'] = origin_x['addr2'].apply(lambda x: 'rest' if x not in address2_top10 else x).astype(str)


m6_imputer = SimpleImputer(strategy='most_frequent').fit(origin_x[['M6']])
origin_x['M6'] = m6_imputer.fit_transform(origin_x[['M6']])



# set classifier using pipeline
categorical_features = ['ProductCD','card1','card2','card3','card4','card5','card6','P_emaildomain','addr1','addr2','M6']

categorical_transformer = OneHotEncoder(sparse=False,handle_unknown='ignore')


numeric_features = origin_x.columns.drop(categorical_features)

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
clf = imb_Pipeline(
    steps=[("preprocessor", preprocessor),("undersample", RandomUnderSampler()) 
           ,("classifier", RandomForestClassifier())]
)


# test data

test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x).astype(str)
test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card2_top_10 else x).astype(str)
test_x['card3'] = test_x['card3'].apply(lambda x: 'rest' if x not in card3_top_5 else x).astype(str)
test_x['card5'] = test_x['card5'].apply(lambda x: 'rest' if x not in card5_top_5 else x).astype(str)

test_x['card4'] = test_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
test_x['card6'] = test_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)

test_x['P_emaildomain'] = test_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)

test_x['addr1'] = test_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x).astype(str)
test_x['addr2'] = test_x['addr2'].apply(lambda x: 'rest' if x not in address2_top10 else x).astype(str)

test_x['M6'] = m6_imputer.transform(test_x[['M6']])



clf.fit(origin_x,origin_y)
predict = clf.predict(test_x)
csv_df = pd.DataFrame({'TransactionID':df_transaction_test['TransactionID'],'isFraud':predict})
csv_df = csv_df.set_index('TransactionID')
csv_df.to_csv('predict.csv')