In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(17,8)})
import sklearn
%matplotlib inline
#Adjust the display
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [96]:
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold , cross_val_score , cross_validate , GridSearchCV ,TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics        import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix , make_scorer , precision_recall_curve , roc_curve,roc_auc_score 
from sklearn.preprocessing   import LabelEncoder ,StandardScaler, MinMaxScaler
import statistics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

In [3]:
df_transaction = pd.read_csv('../ieee-fraud-detection/train_transaction.csv')

In [163]:
df_id = pd.read_csv('../ieee-fraud-detection/train_identity.csv')

In [43]:
df_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [203]:
X_train_val,X_test,y_train_val,y_test = train_test_split(df_transaction.drop(columns='isFraud'),df_transaction['isFraud'],train_size=0.8,shuffle=True)

In [143]:
# many_null_cols = [col for col in df_transaction.columns if df_transaction[col].isnull().sum() / df_transaction.shape[0] > 0.3]
# origin_x = df_transaction.drop(columns=['isFraud'])
# origin_x = origin_x.drop(columns=many_null_cols)
# origin_y = df_transaction['isFraud']

In [220]:
many_null_cols = [col for col in X_train_val.columns if X_train_val[col].isnull().sum() / X_train_val.shape[0] > 0.5]
origin_x = X_train_val.drop(columns=many_null_cols)
origin_y = y_train_val

In [234]:
# categorical feature들은 미리 작업해줘도 무방하다
# 새로운 테스트 데이터가 들어와도 이대로 mapping한다

card1_top_10 = origin_x['card1'].value_counts().iloc[0:9].index
card2_top_10 = origin_x['card2'].value_counts().iloc[0:9].index
origin_x['card1'] = origin_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
origin_x['card2'] = origin_x['card2'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
origin_x['card4'] = origin_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
origin_x['card6'] = origin_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)

domain_top_5 = origin_x['P_emaildomain'].value_counts()[0:5]
origin_x['P_emaildomain'] = origin_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)

address1_top10 = origin_x['addr1'].value_counts().iloc[0:9]
origin_x['addr1'] = origin_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x)


origin_x['M6'] = SimpleImputer(strategy='most_frequent').fit_transform(origin_x['M6'])

In [237]:
# columns to use
v_columns = [i for i in origin_x.columns if i.startswith('V') ]
c_columns = [i for i in origin_x.columns if i.startswith('C') ]
rest_columns = ['M1','M2','M3','M6','TransactionAmt','ProductCD','card1','card2','card4','card6','addr1','P_emaildomain','D1','D4','D10','D15']

In [238]:
cat_var = ['ProductCD','card1','card2','card4','card6','P_emaildomain','addr1','M1','M2','M3','M6']
features = pd.get_dummies(origin_x[rest_columns+c_columns +v_columns],columns=cat_var)
label = origin_y

In [213]:
#kfold = StratifiedKFold(n_splits=5,shuffle=True)
kfold = TimeSeriesSplit(n_splits=2)
pipeline = imb_make_pipeline(SimpleImputer(strategy='median'),RandomUnderSampler(),RandomForestClassifier())
cross_val_score(pipeline,features,label,cv=kfold,scoring='roc_auc',n_jobs=-1)

# 값이 왜이렇게 높게 나오는걸까
# 실제로는 82프로 정도 나오는데
# 여기는 아무리 봐도 잘못된거 같다

array([0.88670153, 0.90065684])

In [208]:
test_x = X_test[origin_x.columns]
test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
test_x['card4'] = test_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
test_x['card6'] = test_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)
test_x['P_emaildomain'] = test_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)
test_x['addr1'] = test_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card4'] = test_x[

In [215]:
test_x[rest_columns+c_columns +v_columns]

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card4,card6,addr1,P_emaildomain,D1,D4,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
110447,30.000,H,rest,rest,visa,debit,rest,yahoo.com,0.0,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
509748,209.950,W,rest,rest,visa,debit,299.0,rest,34.0,,...,0.000000,0.000000,400.850006,0.000000,419.899994,731.849976,419.899994,0.000000,481.820007,0.000000
395942,213.000,W,rest,rest,visa,debit,299.0,yahoo.com,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,226.000000,226.000000,226.000000,0.000000,0.000000,0.000000
557635,44.500,W,rest,rest,mastercard,debit,315.0,yahoo.com,0.0,418.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
239625,43.970,W,rest,rest,mastercard,debit,rest,yahoo.com,420.0,420.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67429,64.013,C,rest,rest,mastercard,credit,rest,anonymous.com,0.0,0.0,...,211.576004,211.576004,211.576004,211.576004,64.013397,64.013397,64.013397,0.000000,0.000000,0.000000
30760,1337.450,W,rest,rest,visa,credit,299.0,yahoo.com,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1337.449951,1337.449951,1337.449951
472158,57.950,W,rest,rest,mastercard,debit,264.0,gmail.com,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
578662,59.000,W,rest,rest,visa,debit,rest,gmail.com,141.0,110.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [216]:
pipeline = imb_make_pipeline(SimpleImputer(strategy='median'),StandardScaler(),PCA(n_components=30),SMOTE(),RandomForestClassifier(max_depth=100,max_features=30))
pipeline.fit(features,label)
test_features = pd.get_dummies(test_x[rest_columns+c_columns +v_columns],columns=cat_var)
roc_auc_score(y_test,pipeline.predict(test_features))

0.7589712817659429

In [None]:
# RandomUnderSampler() 사용하지 않았을때 cross val score은 존나 높게 나온다, 실제 테스트는 구리다
# RandomUnderSampler() 사용했을때는 cross val score 높게 나오고 , 실제 테스트 82%정도 까지 나온다

In [183]:
features

Unnamed: 0,TransactionAmt,D1,D4,D10,D15,C1,C2,C3,C4,C5,...,addr1_123.0,addr1_204.0,addr1_264.0,addr1_272.0,addr1_299.0,addr1_315.0,addr1_325.0,addr1_330.0,addr1_441.0,addr1_rest
0,68.50,14.0,,13.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,29.00,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,59.00,0.0,0.0,0.0,315.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,50.00,112.0,94.0,84.0,111.0,2.0,5.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,50.00,0.0,,,,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,49.00,29.0,,56.0,56.0,2.0,1.0,0.0,0.0,1.0,...,0,0,0,1,0,0,0,0,0,0
590536,39.50,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
590537,30.95,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
590538,117.00,22.0,22.0,22.0,22.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [172]:
df_id_test = pd.read_csv('../ieee-fraud-detection/test_identity.csv')

In [55]:
df_transaction_test = pd.read_csv('../ieee-fraud-detection/test_transaction.csv')

In [161]:
test_x = df_transaction_test[origin_x.columns]
test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
test_x['card4'] = test_x['card4'].apply(lambda x: 'rest' if x not in ['mastercard','discover','visa','american express'] else x )
test_x['card6'] = test_x['card6'].apply(lambda x:'rest' if x not in ['debit','credit'] else x)
test_x['P_emaildomain'] = test_x['P_emaildomain'].apply(lambda x: 'rest' if x not in domain_top_5 else x)
test_x['addr1'] = test_x['addr1'].apply(lambda x: 'rest' if x not in address1_top10 else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card1'] = test_x['card1'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card2'] = test_x['card2'].apply(lambda x: 'rest' if x not in card1_top_10 else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['card4'] = test_x[

In [239]:
pipeline = imb_make_pipeline(SimpleImputer(strategy='median'),StandardScaler(),RandomUnderSampler(),RandomForestClassifier(n_jobs=-1))
pipeline.fit(features,label)
test_features = pd.get_dummies(test_x[rest_columns+c_columns +v_columns],columns=cat_var)
predict = pipeline.predict(test_features)
csv_df = pd.DataFrame({'TransactionID':df_transaction_test['TransactionID'],'isFraud':predict})
csv_df = csv_df.set_index('TransactionID')
csv_df.to_csv('predict.csv')

KeyError: "['M1', 'M2', 'M3', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11'] not in index"

In [154]:
df_transaction['addr1'].value_counts().iloc[0:9]

299.0    46335
325.0    42751
204.0    42020
264.0    39870
330.0    26287
315.0    23078
441.0    20827
272.0    20141
123.0    16105
Name: addr1, dtype: int64

In [109]:

df_transaction.drop(columns=many_null_cols).columns.to_list()

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'P_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D4',
 'D10',
 'D15',
 'M6',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 