In [None]:
import os
import gc
import numpy as np 
import pandas as pd 
# Show which files are present in the input directory before loading anything.
print(os.listdir("../input"))

In [None]:
# Load the four raw tables; each one is indexed by its TransactionID column
# so that the identity and transaction tables can be joined on the index.
train_identity    = pd.read_csv("../input/train_identity.csv",    index_col='TransactionID')
train_transaction = pd.read_csv("../input/train_transaction.csv", index_col='TransactionID')
test_identity     = pd.read_csv("../input/test_identity.csv",     index_col='TransactionID')
test_transaction  = pd.read_csv('../input/test_transaction.csv',  index_col='TransactionID')

In [None]:
# Left-join the identity columns onto the transactions by index: every
# transaction row is kept, and rows without a matching identity record get
# missing values in the identity columns.
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test  = test_transaction.merge( test_identity,  how='left', left_index=True, right_index=True)

In [None]:
# The raw tables are no longer needed once merged; drop the names and
# trigger a garbage-collection pass.
del train_identity,train_transaction,test_identity, test_transaction
gc.collect()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are narrowed purely on their min/max range. float16 keeps
    only about three significant decimal digits, so values may be rounded even
    though they are "in range".
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Reached when the range check fails for both narrower types,
                # including when min/max are NaN (every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast numeric columns of both frames; reduce_mem_usage modifies the frame
# it receives and returns that same object.
train = reduce_mem_usage(train)
test  = reduce_mem_usage(test)

In [None]:
# In this part, the data manipulation and feature engineering were carried out in R before switching to Python; the R side is roughly 2,700 lines of code.

In [None]:
def id_split(dataframe):
    """Derive device, OS, browser and screen features from the identity columns.

    Adds the columns ``device_name``/``device_version`` (from ``DeviceInfo``,
    split on '/'), ``OS_id_30``/``version_id_30`` (from ``id_30``, split on ' '),
    ``browser_id_31``/``version_id_31`` (from ``id_31``, split on ' '),
    ``screen_width``/``screen_height`` (from ``id_33``, split on 'x') and a
    constant ``had_id`` column equal to 1. ``device_name`` is then collapsed
    into brand labels, and any label occurring fewer than 200 times becomes
    "Others".

    The second piece of a split is missing (NaN) for rows that do not contain
    the delimiter, including when no row in the column contains it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``DeviceInfo``, ``id_30``, ``id_31`` and
        ``id_33``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the new columns added.
    """
    # (source column, delimiter, name of first piece, name of second piece)
    split_specs = [
        ('DeviceInfo', '/', 'device_name', 'device_version'),
        ('id_30', ' ', 'OS_id_30', 'version_id_30'),
        ('id_31', ' ', 'browser_id_31', 'version_id_31'),
        ('id_33', 'x', 'screen_width', 'screen_height'),
    ]
    for source, delimiter, first_name, second_name in split_specs:
        # Split once and index into the resulting lists; `.str[1]` yields NaN
        # when a piece is absent instead of failing on a missing column.
        pieces = dataframe[source].str.split(delimiter)
        dataframe[first_name] = pieces.str[0]
        dataframe[second_name] = pieces.str[1]

    # Substring -> brand label. Rules are applied in this order, each one
    # looking at the result of the previous ones.
    # NOTE(review): 'XT' is mapped to 'Sony' here; confirm that is the intended brand.
    brand_rules = [
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    ]
    for pattern, brand in brand_rules:
        dataframe.loc[dataframe['device_name'].str.contains(pattern, na=False), 'device_name'] = brand

    # Fold every label seen fewer than 200 times into a single bucket.
    name_counts = dataframe['device_name'].value_counts()
    rare_names = name_counts[name_counts < 200].index
    dataframe.loc[dataframe['device_name'].isin(rare_names), 'device_name'] = "Others"

    dataframe['had_id'] = 1
    gc.collect()

    return dataframe

In [None]:
# id_split adds its columns to the frame it is given and returns that same
# frame, so train_1 is the same object as train; `del train` only removes
# the old name.
train_1 = id_split(train)
del train

In [None]:
# Ratio of a value to a statistic of that value within a group.
# Each entry is (value column, statistic, grouping column) and produces the
# column '<value>_to_<statistic>_<group>'. Order here is the column creation order.
ratio_specs = [
    ('TransactionAmt', 'mean', 'card1'),
    ('TransactionAmt', 'mean', 'card4'),
    ('TransactionAmt', 'std', 'card1'),
    ('TransactionAmt', 'std', 'card4'),
    ('TransactionAmt', 'min', 'card4'),
    ('TransactionAmt', 'max', 'card1'),
    ('id_02', 'mean', 'card1'),
    ('id_02', 'mean', 'card4'),
    ('id_02', 'std', 'card1'),
    ('id_02', 'std', 'card4'),
    ('D15', 'mean', 'card1'),
    ('D15', 'mean', 'card4'),
    ('D15', 'std', 'card1'),
    ('D15', 'std', 'card4'),
    ('D15', 'mean', 'addr1'),
    ('D15', 'std', 'addr1'),
]
for value_col, stat, group_col in ratio_specs:
    train_1[f'{value_col}_to_{stat}_{group_col}'] = (
        train_1[value_col] / train_1.groupby([group_col])[value_col].transform(stat)
    )

# Natural log of the amount.
train_1['TransactionAmt_Log'] = np.log(train_1['TransactionAmt'])

# Fractional part of the amount, scaled by 1000 and truncated to an integer.
train_1['TransactionAmt_decimal'] = ((train_1['TransactionAmt'] - train_1['TransactionAmt'].astype(int)) * 1000).astype(int)

# Split the purchaser e-mail domain on '.'; the assignment expects the split
# to yield exactly three columns.
train_1[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train_1['P_emaildomain'].str.split('.', expand=True)

train_1['P_isproton'] = (train_1['P_emaildomain'] == 'protonmail.com')

# Start the flag at 0.0 for every row; setbrowser() switches matching rows to 1.
train_1["lastest_browser"] = np.zeros(train_1.shape[0])

def setbrowser(train_1):
    """Set ``lastest_browser`` to 1 for rows whose ``id_31`` is one of a fixed set of browser strings.

    The frame is modified in place and the same object is returned. Rows whose
    ``id_31`` is not in the set keep whatever ``lastest_browser`` value they had.
    """
    flagged_browsers = [
        "samsung browser 7.0",
        "opera 53.0",
        "mobile safari 10.0",
        "google search application 49.0",
        "firefox 60.0",
        "edge 17.0",
        "chrome 69.0",
        "chrome 67.0 for android",
        "chrome 63.0 for android",
        "chrome 63.0 for ios",
        "chrome 64.0",
        "chrome 64.0 for android",
        "chrome 64.0 for ios",
        "chrome 65.0",
        "chrome 65.0 for android",
        "chrome 65.0 for ios",
        "chrome 66.0",
        "chrome 66.0 for android",
        "chrome 66.0 for ios",
    ]
    is_flagged = train_1["id_31"].isin(flagged_browsers)
    train_1.loc[is_flagged, 'lastest_browser'] = 1
    return train_1

train_1 = setbrowser(train_1)

# Rolling statistics of TransactionAmt within each 'cardnumber' group, in the
# frame's current row order (window of 10 rows, or 30 for the count; at least 1 row).
# NOTE(review): 'cardnumber' is not created in the visible part of this notebook;
# presumably it comes from the R preprocessing — confirm.
train_1['cardnumber_mean_last'] = train_1['TransactionAmt'] - train_1.groupby('cardnumber')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).mean())
train_1['cardnumber_min_last'] = train_1.groupby('cardnumber')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).min())
train_1['cardnumber_max_last'] = train_1.groupby('cardnumber')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).max())
# NOTE(review): despite its name this column is rolling-min divided by rolling-std.
train_1['cardnumber_std_last'] = train_1['cardnumber_min_last'] / train_1.groupby('cardnumber')['TransactionAmt'].transform(lambda x: x.rolling(10, 1).std())
train_1['cardnumber_count_last'] = train_1.groupby('cardnumber')['TransactionAmt'].transform(lambda x: x.rolling(30, 1).count())

# Replace missing rolling values with 0. The result is assigned back because an
# in-place fillna on a selected column is chained assignment and does not
# reliably update the frame.
for rolling_col in ('cardnumber_mean_last', 'cardnumber_min_last', 'cardnumber_max_last',
                    'cardnumber_std_last', 'cardnumber_count_last'):
    train_1[rolling_col] = train_1[rolling_col].fillna(0)


# Address features, with missing addresses treated as 0.
train_1['addr1_1'] = train_1['addr1'].fillna(0)
train_1['addr2_2'] = train_1['addr2'].fillna(0)

train_1['diff_adrr'] = train_1.addr1_1 - train_1.addr2_2
train_1['diff_adrr_plus'] = train_1.addr1_1 + train_1.addr2_2

# First one / two characters of addr1's string form, converted back to a number.
train_1['first_value_addr1'] = train_1['addr1_1'].astype(str).str[0:1].astype(float)
train_1['two_value_addr1'] = train_1['addr1_1'].astype(str).str[0:2].astype(float)

# Amount relative to the global mean / standard deviation of TransactionAmt.
train_1['Trans_min_mean'] = train_1['TransactionAmt'] - train_1['TransactionAmt'].mean()
# NOTE(review): 'Trans_min_max' ends up identical to 'Trans_min_std'. An earlier
# "amount minus max" value was always overwritten by this formula, so this is the
# value that has been stored; confirm which definition is intended.
train_1['Trans_min_max'] = train_1['Trans_min_mean'] / train_1['TransactionAmt'].std()

train_1['Trans_std_min'] = train_1['TransactionAmt'] - train_1['TransactionAmt'].std()
train_1['Trans_min_max_2'] = train_1['Trans_min_mean'] / train_1['TransactionAmt']

train_1['Trans_min_std'] = train_1['Trans_min_mean'] / train_1['TransactionAmt'].std()

# Amount relative to per-'cardnumber' statistics.
train_1['TransactionAmt_to_mean_card_id'] = train_1['TransactionAmt'] - train_1.groupby(['cardnumber'])['TransactionAmt'].transform('mean')
train_1['TransactionAmt_to_std_card_id'] = train_1['TransactionAmt_to_mean_card_id'] / train_1.groupby(['cardnumber'])['TransactionAmt'].transform('std')

train_1['TransactionAmt_to_max_card_id'] = train_1['TransactionAmt'] - train_1.groupby(['cardnumber'])['TransactionAmt'].transform('max')
train_1['TransactionAmt_to_min_card_id'] = train_1['TransactionAmt'] - train_1.groupby(['cardnumber'])['TransactionAmt'].transform('min')

In [None]:
# Value columns and grouping columns for the group-statistic ratio features.
columns = ['TransactionAmt', 'TransactionFreqDaily', 'TransactionFreqHour', 'ProductAmtRatio', 'Limit']
obj_cols = ['P_emaildomain', 'ProductCD', 'cardnumber', 'month', 'day', 'week', 'minute', 'hour', 'ZipCountry', 'UniqCountry']

# For every (value, group) pair, divide the value by the group's mean, max,
# min and count of that value.
for col in columns:
    for feat in obj_cols:
        grouped = train_1.groupby(feat)[col]
        for stat in ('mean', 'max', 'min'):
            train_1[f'{col}_{stat}_group_{feat}'] = train_1[col] / grouped.transform(stat)
        # NOTE(review): the column named '..._skew_group_...' holds value / group COUNT.
        # A skew-based value used to be computed first and was immediately overwritten
        # by this one; that wasted computation has been removed. The stored values are
        # unchanged so they stay consistent with data produced elsewhere; rename to
        # '_count_group_' only together with every consumer of these columns.
        train_1[f'{col}_skew_group_{feat}'] = train_1[col] / grouped.transform('count')

In [None]:
# many_null_cols = [col for col in train_1.columns if train_1[col].isnull().sum() / train_1.shape[0] > 0.30]
# many_null_cols

In [None]:
test_columns_names = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'P_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C9', 'C13', 'C14', 'D1', 'D2', 'D4', 'D10', 'D15', 'M2', 'M3', 'M4', 'M5', 'M6', 'V12', 'V13', 'V14', 'V15', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V30',
 'V32', 'V33', 'V35', 'V37', 'V38', 'V39', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V49', 'V50', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V59', 'V61', 'V62', 'V64', 'V65', 'V66', 'V67', 'V68', 'V70', 'V71', 'V74',
 'V75', 'V76', 'V77', 'V78', 'V79', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V91', 'V93', 'V94', 'V96', 'V98', 'V99', 'V100', 'V101', 'V103', 'V104', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113',
 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V127', 'V129', 'V130', 'V131', 'V132', 'V135', 'V136', 'V137', 'V279', 'V280', 'V281', 'V282', 'V283', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291',
 'V294', 'V296', 'V297', 'V299', 'V300', 'V301', 'V303', 'V304', 'V305', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313', 'V314', 'V315', 'V319', 'V320', 'V321', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11',
 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'date', 'time', 'quarter', 'year', 'month', 'day',
 'week', 'weekday', 'weekend', 'time2', 'hour', 'minute', 'second', 'am_pm', 'cardnumber', 'P_company', 'Country', 'Dot1', 'Dot', 'UniqEmail', 'UniqEmailProb', 'UniqEmailCat', 'Csum', 'Csd', 'Cmean', 'Ckurtosis', 'Cskewness', 'Vsum_g1', 'Vsd_g1', 'Vmean_g1', 'Vkurtosis_g1', 'Vskewnes_g1',
 'Vsum_g2', 'Vsd_g2', 'Vmean_g2', 'Vkurtosis_g2', 'Vskewnes_g2', 'TrVersion', 'OpSystem', 'TransactionFreqDaily', 'TransactionFreqHour', 'TransactionFreqAMPM', 'TransactionFreqMonthly', 'ZipCountry',
 'UniqCountry', 'CountryProb', 'UniqCountryCat', 'Limit', 'ProductAmtRatio', 'device_name', 'device_version', 'screen_width', 'screen_height', 'had_id', 'TransactionAmt_to_mean_card1', 'TransactionAmt_to_mean_card4', 'TransactionAmt_to_std_card1',
 'TransactionAmt_to_std_card4', 'TransactionAmt_to_min_card4', 'TransactionAmt_to_max_card1', 'id_02_to_mean_card1', 'id_02_to_mean_card4', 'id_02_to_std_card1', 'id_02_to_std_card4', 'D15_to_mean_card1', 'D15_to_mean_card4', 'D15_to_std_card1', 'D15_to_std_card4', 'D15_to_mean_addr1',
 'D15_to_std_addr1', 'TransactionAmt_Log', 'TransactionAmt_decimal', 'P_emaildomain_1', 'P_emaildomain_2', 'P_isproton', 'lastest_browser', 'cardnumber_mean_last', 'cardnumber_min_last', 'cardnumber_max_last', 'cardnumber_std_last', 'cardnumber_count_last', 'addr1_1', 'addr2_2', 'diff_adrr', 'diff_adrr_plus', 'first_value_addr1', 'two_value_addr1',
 'Trans_min_mean', 'Trans_min_max', 'Trans_std_min', 'Trans_min_max_2', 'Trans_min_std', 'TransactionAmt_to_mean_card_id', 'TransactionAmt_to_std_card_id', 'TransactionAmt_to_max_card_id', 'TransactionAmt_to_min_card_id', 'TransactionAmt_mean_group_P_emaildomain', 'TransactionAmt_max_group_P_emaildomain', 'TransactionAmt_min_group_P_emaildomain',
 'TransactionAmt_skew_group_P_emaildomain', 'TransactionAmt_mean_group_ProductCD', 'TransactionAmt_max_group_ProductCD', 'TransactionAmt_min_group_ProductCD', 'TransactionAmt_skew_group_ProductCD', 'TransactionAmt_mean_group_cardnumber', 'TransactionAmt_max_group_cardnumber', 'TransactionAmt_min_group_cardnumber',
 'TransactionAmt_skew_group_cardnumber', 'TransactionAmt_mean_group_month', 'TransactionAmt_max_group_month', 'TransactionAmt_min_group_month', 'TransactionAmt_skew_group_month', 'TransactionAmt_mean_group_day', 'TransactionAmt_max_group_day', 'TransactionAmt_min_group_day',
 'TransactionAmt_skew_group_day', 'TransactionAmt_mean_group_week', 'TransactionAmt_max_group_week', 'TransactionAmt_min_group_week', 'TransactionAmt_skew_group_week', 'TransactionAmt_mean_group_minute', 'TransactionAmt_max_group_minute', 'TransactionAmt_min_group_minute',
 'TransactionAmt_skew_group_minute', 'TransactionAmt_mean_group_hour', 'TransactionAmt_max_group_hour', 'TransactionAmt_min_group_hour', 'TransactionAmt_skew_group_hour', 'TransactionAmt_mean_group_ZipCountry', 'TransactionAmt_max_group_ZipCountry', 'TransactionAmt_min_group_ZipCountry',
 'TransactionAmt_skew_group_ZipCountry', 'TransactionAmt_mean_group_UniqCountry', 'TransactionAmt_max_group_UniqCountry', 'TransactionAmt_min_group_UniqCountry', 'TransactionAmt_skew_group_UniqCountry', 'TransactionFreqDaily_mean_group_P_emaildomain', 'TransactionFreqDaily_max_group_P_emaildomain', 'TransactionFreqDaily_min_group_P_emaildomain',
 'TransactionFreqDaily_skew_group_P_emaildomain', 'TransactionFreqDaily_mean_group_ProductCD', 'TransactionFreqDaily_max_group_ProductCD', 'TransactionFreqDaily_min_group_ProductCD', 'TransactionFreqDaily_skew_group_ProductCD', 'TransactionFreqDaily_mean_group_cardnumber', 'TransactionFreqDaily_max_group_cardnumber', 'TransactionFreqDaily_min_group_cardnumber',
 'TransactionFreqDaily_skew_group_cardnumber', 'TransactionFreqDaily_mean_group_month', 'TransactionFreqDaily_max_group_month', 'TransactionFreqDaily_min_group_month', 'TransactionFreqDaily_skew_group_month', 'TransactionFreqDaily_mean_group_day', 'TransactionFreqDaily_max_group_day', 'TransactionFreqDaily_min_group_day',
 'TransactionFreqDaily_skew_group_day', 'TransactionFreqDaily_mean_group_week', 'TransactionFreqDaily_max_group_week', 'TransactionFreqDaily_min_group_week', 'TransactionFreqDaily_skew_group_week', 'TransactionFreqDaily_mean_group_minute', 'TransactionFreqDaily_max_group_minute', 'TransactionFreqDaily_min_group_minute',
 'TransactionFreqDaily_skew_group_minute', 'TransactionFreqDaily_mean_group_hour', 'TransactionFreqDaily_max_group_hour', 'TransactionFreqDaily_min_group_hour', 'TransactionFreqDaily_skew_group_hour', 'TransactionFreqDaily_mean_group_ZipCountry', 'TransactionFreqDaily_max_group_ZipCountry', 'TransactionFreqDaily_min_group_ZipCountry', 'TransactionFreqDaily_skew_group_ZipCountry',
 'TransactionFreqDaily_mean_group_UniqCountry', 'TransactionFreqDaily_max_group_UniqCountry', 'TransactionFreqDaily_min_group_UniqCountry', 'TransactionFreqDaily_skew_group_UniqCountry',
 'TransactionFreqHour_mean_group_P_emaildomain', 'TransactionFreqHour_max_group_P_emaildomain', 'TransactionFreqHour_min_group_P_emaildomain', 'TransactionFreqHour_skew_group_P_emaildomain',
 'TransactionFreqHour_mean_group_ProductCD', 'TransactionFreqHour_max_group_ProductCD', 'TransactionFreqHour_min_group_ProductCD', 'TransactionFreqHour_skew_group_ProductCD', 'TransactionFreqHour_mean_group_cardnumber', 'TransactionFreqHour_max_group_cardnumber', 'TransactionFreqHour_min_group_cardnumber', 'TransactionFreqHour_skew_group_cardnumber',
 'TransactionFreqHour_mean_group_month', 'TransactionFreqHour_max_group_month', 'TransactionFreqHour_min_group_month', 'TransactionFreqHour_skew_group_month', 'TransactionFreqHour_mean_group_day', 'TransactionFreqHour_max_group_day', 'TransactionFreqHour_min_group_day', 'TransactionFreqHour_skew_group_day',
 'TransactionFreqHour_mean_group_week', 'TransactionFreqHour_max_group_week', 'TransactionFreqHour_min_group_week', 'TransactionFreqHour_skew_group_week', 'TransactionFreqHour_mean_group_minute', 'TransactionFreqHour_max_group_minute', 'TransactionFreqHour_min_group_minute', 'TransactionFreqHour_skew_group_minute',
 'TransactionFreqHour_mean_group_hour', 'TransactionFreqHour_max_group_hour', 'TransactionFreqHour_min_group_hour', 'TransactionFreqHour_skew_group_hour', 'TransactionFreqHour_mean_group_ZipCountry', 'TransactionFreqHour_max_group_ZipCountry', 'TransactionFreqHour_min_group_ZipCountry', 'TransactionFreqHour_skew_group_ZipCountry',
 'TransactionFreqHour_mean_group_UniqCountry', 'TransactionFreqHour_max_group_UniqCountry', 'TransactionFreqHour_min_group_UniqCountry', 'TransactionFreqHour_skew_group_UniqCountry', 'ProductAmtRatio_mean_group_P_emaildomain', 'ProductAmtRatio_max_group_P_emaildomain', 'ProductAmtRatio_min_group_P_emaildomain', 'ProductAmtRatio_skew_group_P_emaildomain',
 'ProductAmtRatio_mean_group_ProductCD', 'ProductAmtRatio_max_group_ProductCD', 'ProductAmtRatio_min_group_ProductCD', 'ProductAmtRatio_skew_group_ProductCD', 'ProductAmtRatio_mean_group_cardnumber', 'ProductAmtRatio_max_group_cardnumber', 'ProductAmtRatio_min_group_cardnumber', 'ProductAmtRatio_skew_group_cardnumber',
 'ProductAmtRatio_mean_group_month', 'ProductAmtRatio_max_group_month', 'ProductAmtRatio_min_group_month', 'ProductAmtRatio_skew_group_month', 'ProductAmtRatio_mean_group_day', 'ProductAmtRatio_max_group_day',
 'ProductAmtRatio_min_group_day', 'ProductAmtRatio_skew_group_day', 'ProductAmtRatio_mean_group_week', 'ProductAmtRatio_max_group_week', 'ProductAmtRatio_min_group_week', 'ProductAmtRatio_skew_group_week', 'ProductAmtRatio_mean_group_minute', 'ProductAmtRatio_max_group_minute',
 'ProductAmtRatio_min_group_minute', 'ProductAmtRatio_skew_group_minute', 'ProductAmtRatio_mean_group_hour', 'ProductAmtRatio_max_group_hour', 'ProductAmtRatio_min_group_hour', 'ProductAmtRatio_skew_group_hour', 'ProductAmtRatio_mean_group_ZipCountry', 'ProductAmtRatio_max_group_ZipCountry',
 'ProductAmtRatio_min_group_ZipCountry', 'ProductAmtRatio_skew_group_ZipCountry', 'ProductAmtRatio_mean_group_UniqCountry', 'ProductAmtRatio_max_group_UniqCountry',
 'ProductAmtRatio_min_group_UniqCountry', 'ProductAmtRatio_skew_group_UniqCountry', 'Limit_mean_group_P_emaildomain', 'Limit_max_group_P_emaildomain', 'Limit_min_group_P_emaildomain', 'Limit_skew_group_P_emaildomain',
 'Limit_mean_group_ProductCD', 'Limit_max_group_ProductCD', 'Limit_min_group_ProductCD', 'Limit_skew_group_ProductCD', 'Limit_mean_group_cardnumber', 'Limit_max_group_cardnumber', 'Limit_min_group_cardnumber', 'Limit_skew_group_cardnumber',
 'Limit_mean_group_month', 'Limit_max_group_month', 'Limit_min_group_month', 'Limit_skew_group_month', 'Limit_mean_group_day', 'Limit_max_group_day', 'Limit_min_group_day', 'Limit_skew_group_day', 'Limit_mean_group_week', 'Limit_max_group_week',
 'Limit_min_group_week', 'Limit_skew_group_week', 'Limit_mean_group_minute', 'Limit_max_group_minute', 'Limit_min_group_minute', 'Limit_skew_group_minute', 'Limit_mean_group_hour', 'Limit_max_group_hour',
 'Limit_min_group_hour', 'Limit_skew_group_hour', 'Limit_mean_group_ZipCountry', 'Limit_max_group_ZipCountry', 'Limit_min_group_ZipCountry', 'Limit_skew_group_ZipCountry', 'Limit_mean_group_UniqCountry', 'Limit_max_group_UniqCountry',
 'Limit_min_group_UniqCountry', 'Limit_skew_group_UniqCountry','isFraud']

In [None]:
# Keep only the columns of train_1 whose names appear in test_columns_names.
train_1 = train_1[train_1.columns.intersection(test_columns_names)]

In [None]:
def get_redundant_pairs(df):
    """Return the set of column-name pairs ``(cols[i], cols[j])`` with ``j <= i``.

    This covers every column paired with itself and one ordering of each pair
    of distinct columns.
    """
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the ``n`` largest absolute pairwise correlations between columns of ``df``.

    The correlation matrix is flattened to a Series keyed by (column, column);
    the pairs listed by ``get_redundant_pairs`` (self-pairs and one ordering of
    each pair) are removed, and the remainder is sorted in descending order.
    """
    flattened = df.corr().abs().unstack()
    redundant = get_redundant_pairs(df)
    ranked = flattened.drop(labels=redundant).sort_values(ascending=False)
    return ranked[0:n]

# Report the 20 strongest absolute correlations among integer columns.
# NOTE(review): only int32/int64 columns are selected; columns that
# reduce_mem_usage downcast to int8/int16 are excluded — confirm this is intended.
print("Top Absolute Correlations !")
print(get_top_abs_correlations(train_1.select_dtypes(include=['int32','int64']), 20))

In [None]:
# Write the engineered training frame to the current directory, without the index.
train_1.to_csv('train_imputasyon_yok.csv', index=False)
# The same operations were applied to the test set.

In [None]:
# Reload the engineered train/test frames.
# NOTE(review): an earlier cell writes 'train_imputasyon_yok.csv' without a
# directory prefix, whereas this cell reads from the parent directory ('../');
# confirm the files are moved there. No cell visible here writes
# 'test_imputasyon_yok.csv'.
train = pd.read_csv('../train_imputasyon_yok.csv')
test = pd.read_csv('../test_imputasyon_yok.csv')

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are narrowed purely on their min/max range. float16 keeps
    only about three significant decimal digits, so values may be rounded even
    though they are "in range".
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Reached when the range check fails for both narrower types,
                # including when min/max are NaN (every comparison is False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
# Downcast numeric columns of the reloaded frames; reduce_mem_usage modifies
# the frame it receives and returns that same object.
train = reduce_mem_usage(train)
test  = reduce_mem_usage(test)

In [None]:
# Number of distinct (non-missing) values in each column of train, in column order.
tekil_parametre_sayisi = [train[column].nunique() for column in train.columns]

In [None]:
# Count distinct values once per column; both splits below read from this.
unique_counts = train.nunique()

# First pass: a column with more than 61 distinct values is numerical,
# anything else is categorical.
numerical = [c for c in train.columns if unique_counts[c] > 61]
categorical = [c for c in train.columns if unique_counts[c] <= 61]

# Manual overrides of the cardinality rule.
# NOTE(review): 'id_30', 'id_31' and 'id_33' are listed twice, and every name in
# this tuple is appended to `categorical` even if the cardinality rule already put
# it there, so `categorical` can contain duplicates. Left as is because later
# cells slice `categorical` by position.
from_numeric_to_categoric = ('cardnumber', 'device_version', "id_30", "id_31", "id_33",  'id_02', 'id_05', 'id_06', 'id_07', 'id_08', 'id_10', 'id_11', 'id_17', 'id_19', 'id_20', 'id_21', 'id_25', 'id_26', 'id_30','id_31', 'id_33', 'date', 'time2', 'time', 'ZipCountry')

from_categoric_to_numeric = ('Limit_min_group_P_emaildomain', 'Limit_max_group_ProductCD', 'Limit_min_group_ProductCD', 'Limit_mean_group_cardnumber', 'Limit_max_group_cardnumber', 'Limit_min_group_cardnumber', 'Limit_max_group_month', 'Limit_min_group_month', 'Limit_max_group_day', 'Limit_min_group_day', 'Limit_max_group_week', 'Limit_min_group_week', 'Limit_max_group_minute', 'Limit_min_group_minute', 'Limit_max_group_hour', 'Limit_min_group_hour','Limit_max_group_UniqCountry')

# Remove the overridden names from the list the rule put them in ...
numerical = [e for e in numerical if e not in from_numeric_to_categoric]
categorical = [e for e in categorical if e not in from_categoric_to_numeric]

# ... and append them, in tuple order, to the end of the other list.
categorical.extend(from_numeric_to_categoric)
numerical.extend(from_categoric_to_numeric)

# The target is not a feature. list.remove raises ValueError if "isFraud" is absent.
categorical.remove("isFraud")

In [None]:
# Positional slices of `categorical` used by the two encoding loops below.
# Entries at positions 175-181 and from 186 onwards are in neither slice.
# NOTE(review): these hard-coded positions depend on the exact order and length
# of `categorical`; confirm they still select the intended columns if the
# feature set changes.
categorical_1 = categorical[0:175]
categorical_2 = categorical[182:186]

In [None]:
# Integer-encode each column in categorical_1 consistently across train and test.
# Missing values become the label 'mis', every value is converted to str, and the
# labels found in train and test together are numbered 0..k-1 in sorted order.
# This is the numbering a label encoder fitted on both frames would give, done
# with pandas only because no LabelEncoder import exists in this notebook.
for col in train[categorical_1].columns:
    train_labels = train[col].fillna('mis').astype(str)
    test_labels = test[col].fillna('mis').astype(str)

    all_labels = sorted(set(train_labels) | set(test_labels))
    label_codes = {label: code for code, label in enumerate(all_labels)}

    train[col] = train_labels.map(label_codes).astype('category')
    test[col] = test_labels.map(label_codes).astype('category')

In [None]:
# Integer-encode each column in categorical_2 consistently across train and test.
# Missing values become the label 'mis', every value is converted to str, and the
# labels found in train and test together are numbered 0..k-1 in sorted order.
# This is the numbering a label encoder fitted on both frames would give, done
# with pandas only because no LabelEncoder import exists in this notebook.
for col in train[categorical_2].columns:
    train_labels = train[col].fillna('mis').astype(str)
    test_labels = test[col].fillna('mis').astype(str)

    all_labels = sorted(set(train_labels) | set(test_labels))
    label_codes = {label: code for code, label in enumerate(all_labels)}

    train[col] = train_labels.map(label_codes).astype('category')
    test[col] = test_labels.map(label_codes).astype('category')

In [None]:
# Columns removed from both frames before saving.
dropped_columns = ['date', 'time2', 'time', 'ZipCountry', "id_20", "id_26", "id_21", "id_25"]
for frame in (train, test):
    frame.drop(columns=dropped_columns, inplace=True)

# The target is stored as a categorical column in train.
train["isFraud"] = train["isFraud"].astype("category")

In [None]:
# Write the encoded frames to the current directory, without the index.
train.to_csv('train_2_09_2019.csv', index=False)
test.to_csv('test_2_09_2019.csv', index=False)

In [None]:
# Reload the encoded frames from CSV.
# NOTE(review): the previous cell writes these files without a directory prefix,
# whereas this cell reads them from the parent directory ('../') — confirm the
# files are moved there.
test = pd.read_csv("../test_2_09_2019.csv")
train = pd.read_csv("../train_2_09_2019.csv")

In [None]:
kategorik = ['ProductCD', 'card4','card6', 'P_emaildomain', 'M2', 'M3', 'M4', 'M5', 'M6', 'V12', 'V13', 'V14', 'V15', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25', 'V26', 'V27', 
             'V28', 'V30', 'V32', 'V33', 'V35', 'V37', 'V38', 'V39', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V49', 'V50', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V59', 
             'V61', 'V62', 'V64', 'V65', 'V66', 'V67', 'V68', 'V70', 'V71', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 
             'V91', 'V93', 'V94', 'V98', 'V100', 'V104', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 
             'V122', 'V123', 'V124', 'V125', 'V281', 'V282', 'V286', 'V287', 'V288', 'V289', 'V290', 'V297', 'V299', 'V300', 'V301', 'V303', 'V304','V305', 'id_03', 'id_04', 'id_09', 
             'id_12', 'id_13', 'id_14', 'id_15', 'id_18', 'id_22', 'id_23', 'id_24', 'id_28', 'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'quarter', 
             'year', 'month', 'day', 'week', 'weekday', 'weekend', 'hour', 'minute', 'second', 'am_pm', 'P_company', 'Country', 'Dot1', 'Dot', 'UniqEmail', 'UniqEmailProb', 'UniqEmailCat', 
             'TrVersion', 'OpSystem', 'UniqCountryCat', 'Limit', 'device_name', 'had_id', 'P_emaildomain_1', 'P_emaildomain_2', 'P_isproton', 'lastest_browser', 'cardnumber_count_last', 
             'first_value_addr1', 'two_value_addr1', 'cardnumber', 'device_version', 'id_30', 'id_31', 'id_33', 'id_02', 'id_05', 'id_06', 'id_07', 'id_08', 'id_10', 'id_11', 'id_17', 
             'id_19', 'id_30', 'id_31', 'id_33']

# Numeric feature columns ("numerik" = numeric).
# A later cell casts every column named here to float32 in both `train` and
# `test`, and the cell after that removes the names found in `silinecekler`
# from this list.
numerik = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'C1', 'C2', 'C4', 'C5', 'C9', 'C13', 'C14', 'D1', 'D2', 'D4', 'D10', 'D15', 'V96', 'V99', 'V101', 
           'V103', 'V127', 'V129', 'V130', 'V131', 'V132', 'V135', 'V136', 'V137', 'V279', 'V280', 'V283', 'V285', 'V291', 'V294', 'V296', 'V306', 'V307', 'V308', 'V309', 'V310', 'V311', 
           'V312', 'V313', 'V314', 'V315', 'V319', 'V320', 'V321', 'Csum', 'Csd', 'Cmean', 'Ckurtosis', 'Cskewness', 'Vsum_g1', 'Vsd_g1', 'Vmean_g1', 'Vkurtosis_g1', 'Vskewnes_g1', 'Vsum_g2', 
           'Vsd_g2', 'Vmean_g2', 'Vkurtosis_g2', 'Vskewnes_g2', 'TransactionFreqDaily', 'TransactionFreqHour', 'TransactionFreqAMPM', 'TransactionFreqMonthly', 'UniqCountry', 'CountryProb', 
           'ProductAmtRatio', 'screen_width', 'screen_height', 'TransactionAmt_to_mean_card1', 'TransactionAmt_to_mean_card4', 'TransactionAmt_to_std_card1', 'TransactionAmt_to_std_card4', 
           'TransactionAmt_to_min_card4', 'TransactionAmt_to_max_card1', 'id_02_to_mean_card1', 'id_02_to_mean_card4', 'id_02_to_std_card1', 'id_02_to_std_card4', 'D15_to_mean_card1', 
           'D15_to_mean_card4', 'D15_to_std_card1', 'D15_to_std_card4', 'D15_to_mean_addr1', 'D15_to_std_addr1', 'TransactionAmt_Log', 'TransactionAmt_decimal', 'cardnumber_mean_last', 
           'cardnumber_min_last', 'cardnumber_max_last', 'cardnumber_std_last', 'addr1_1', 'addr2_2', 'diff_adrr', 'diff_adrr_plus', 'Trans_min_mean', 'Trans_min_max', 'Trans_std_min', 
           'Trans_min_max_2', 'Trans_min_std', 'TransactionAmt_to_mean_card_id', 'TransactionAmt_to_std_card_id', 'TransactionAmt_to_max_card_id', 'TransactionAmt_to_min_card_id', 
           'TransactionAmt_mean_group_P_emaildomain', 'TransactionAmt_max_group_P_emaildomain', 'TransactionAmt_min_group_P_emaildomain', 'TransactionAmt_skew_group_P_emaildomain', 
           'TransactionAmt_mean_group_ProductCD', 'TransactionAmt_max_group_ProductCD', 'TransactionAmt_min_group_ProductCD', 'TransactionAmt_skew_group_ProductCD', 'TransactionAmt_mean_group_cardnumber', 
           'TransactionAmt_max_group_cardnumber', 'TransactionAmt_min_group_cardnumber', 'TransactionAmt_skew_group_cardnumber', 'TransactionAmt_mean_group_month', 'TransactionAmt_max_group_month', 
           'TransactionAmt_min_group_month', 'TransactionAmt_skew_group_month', 'TransactionAmt_mean_group_day', 'TransactionAmt_max_group_day', 'TransactionAmt_min_group_day', 'TransactionAmt_skew_group_day', 
           'TransactionAmt_mean_group_week', 'TransactionAmt_max_group_week', 'TransactionAmt_min_group_week', 'TransactionAmt_skew_group_week', 'TransactionAmt_mean_group_minute', 'TransactionAmt_max_group_minute', 
           'TransactionAmt_min_group_minute', 'TransactionAmt_skew_group_minute', 'TransactionAmt_mean_group_hour', 'TransactionAmt_max_group_hour', 'TransactionAmt_min_group_hour', 'TransactionAmt_skew_group_hour',
           'TransactionAmt_mean_group_ZipCountry', 'TransactionAmt_max_group_ZipCountry', 'TransactionAmt_min_group_ZipCountry', 'TransactionAmt_skew_group_ZipCountry', 'TransactionAmt_mean_group_UniqCountry', 
           'TransactionAmt_max_group_UniqCountry', 'TransactionAmt_min_group_UniqCountry', 'TransactionAmt_skew_group_UniqCountry', 'TransactionFreqDaily_mean_group_P_emaildomain', 
           'TransactionFreqDaily_max_group_P_emaildomain', 'TransactionFreqDaily_min_group_P_emaildomain', 'TransactionFreqDaily_skew_group_P_emaildomain', 'TransactionFreqDaily_mean_group_ProductCD',
           'TransactionFreqDaily_max_group_ProductCD', 'TransactionFreqDaily_min_group_ProductCD', 'TransactionFreqDaily_skew_group_ProductCD', 'TransactionFreqDaily_mean_group_cardnumber', 
           'TransactionFreqDaily_max_group_cardnumber', 'TransactionFreqDaily_min_group_cardnumber', 'TransactionFreqDaily_skew_group_cardnumber', 'TransactionFreqDaily_mean_group_month', 
           'TransactionFreqDaily_max_group_month', 'TransactionFreqDaily_min_group_month', 'TransactionFreqDaily_skew_group_month', 'TransactionFreqDaily_mean_group_day', 'TransactionFreqDaily_max_group_day',
           'TransactionFreqDaily_min_group_day', 'TransactionFreqDaily_skew_group_day', 'TransactionFreqDaily_mean_group_week', 'TransactionFreqDaily_max_group_week', 'TransactionFreqDaily_min_group_week', 
           'TransactionFreqDaily_skew_group_week', 'TransactionFreqDaily_mean_group_minute', 'TransactionFreqDaily_max_group_minute', 'TransactionFreqDaily_min_group_minute', 'TransactionFreqDaily_skew_group_minute', 
           'TransactionFreqDaily_mean_group_hour', 'TransactionFreqDaily_max_group_hour', 'TransactionFreqDaily_min_group_hour', 'TransactionFreqDaily_skew_group_hour', 'TransactionFreqDaily_mean_group_ZipCountry',
           'TransactionFreqDaily_max_group_ZipCountry', 'TransactionFreqDaily_min_group_ZipCountry', 'TransactionFreqDaily_skew_group_ZipCountry', 'TransactionFreqDaily_mean_group_UniqCountry', 
           'TransactionFreqDaily_max_group_UniqCountry', 'TransactionFreqDaily_min_group_UniqCountry', 'TransactionFreqDaily_skew_group_UniqCountry', 'TransactionFreqHour_mean_group_P_emaildomain',
           'TransactionFreqHour_max_group_P_emaildomain', 'TransactionFreqHour_min_group_P_emaildomain', 'TransactionFreqHour_skew_group_P_emaildomain', 'TransactionFreqHour_mean_group_ProductCD', 
           'TransactionFreqHour_max_group_ProductCD', 'TransactionFreqHour_min_group_ProductCD', 'TransactionFreqHour_skew_group_ProductCD', 'TransactionFreqHour_mean_group_cardnumber', 
           'TransactionFreqHour_max_group_cardnumber', 'TransactionFreqHour_min_group_cardnumber', 'TransactionFreqHour_skew_group_cardnumber', 'TransactionFreqHour_mean_group_month', 
           'TransactionFreqHour_max_group_month', 'TransactionFreqHour_min_group_month', 'TransactionFreqHour_skew_group_month', 'TransactionFreqHour_mean_group_day', 'TransactionFreqHour_max_group_day', 
           'TransactionFreqHour_min_group_day', 'TransactionFreqHour_skew_group_day', 'TransactionFreqHour_mean_group_week', 'TransactionFreqHour_max_group_week', 'TransactionFreqHour_min_group_week', 
           'TransactionFreqHour_skew_group_week', 'TransactionFreqHour_mean_group_minute', 'TransactionFreqHour_max_group_minute', 'TransactionFreqHour_min_group_minute', 'TransactionFreqHour_skew_group_minute', 
           'TransactionFreqHour_mean_group_hour', 'TransactionFreqHour_max_group_hour', 'TransactionFreqHour_min_group_hour', 'TransactionFreqHour_skew_group_hour', 'TransactionFreqHour_mean_group_ZipCountry',
           'TransactionFreqHour_max_group_ZipCountry', 'TransactionFreqHour_min_group_ZipCountry', 'TransactionFreqHour_skew_group_ZipCountry', 'TransactionFreqHour_mean_group_UniqCountry', 
           'TransactionFreqHour_max_group_UniqCountry', 'TransactionFreqHour_min_group_UniqCountry', 'TransactionFreqHour_skew_group_UniqCountry', 'ProductAmtRatio_mean_group_P_emaildomain',
           'ProductAmtRatio_max_group_P_emaildomain', 'ProductAmtRatio_min_group_P_emaildomain', 'ProductAmtRatio_skew_group_P_emaildomain', 'ProductAmtRatio_mean_group_ProductCD', 'ProductAmtRatio_max_group_ProductCD',
           'ProductAmtRatio_min_group_ProductCD', 'ProductAmtRatio_skew_group_ProductCD', 'ProductAmtRatio_mean_group_cardnumber', 'ProductAmtRatio_max_group_cardnumber', 'ProductAmtRatio_min_group_cardnumber', 
           'ProductAmtRatio_skew_group_cardnumber', 'ProductAmtRatio_mean_group_month', 'ProductAmtRatio_max_group_month', 'ProductAmtRatio_min_group_month', 'ProductAmtRatio_skew_group_month', 
           'ProductAmtRatio_mean_group_day', 'ProductAmtRatio_max_group_day', 'ProductAmtRatio_min_group_day', 'ProductAmtRatio_skew_group_day', 'ProductAmtRatio_mean_group_week', 'ProductAmtRatio_max_group_week', 
           'ProductAmtRatio_min_group_week', 'ProductAmtRatio_skew_group_week', 'ProductAmtRatio_mean_group_minute', 'ProductAmtRatio_max_group_minute', 'ProductAmtRatio_min_group_minute', 'ProductAmtRatio_skew_group_minute', 
           'ProductAmtRatio_mean_group_hour', 'ProductAmtRatio_max_group_hour', 'ProductAmtRatio_min_group_hour', 'ProductAmtRatio_skew_group_hour', 'ProductAmtRatio_mean_group_ZipCountry', 'ProductAmtRatio_max_group_ZipCountry', 
           'ProductAmtRatio_min_group_ZipCountry', 'ProductAmtRatio_skew_group_ZipCountry', 'ProductAmtRatio_mean_group_UniqCountry', 'ProductAmtRatio_max_group_UniqCountry', 'ProductAmtRatio_min_group_UniqCountry', 
           'ProductAmtRatio_skew_group_UniqCountry', 'Limit_mean_group_P_emaildomain', 'Limit_max_group_P_emaildomain', 'Limit_skew_group_P_emaildomain', 'Limit_skew_group_ProductCD', 'Limit_skew_group_cardnumber', 
           'Limit_mean_group_month', 'Limit_skew_group_month', 'Limit_mean_group_day', 'Limit_skew_group_day', 'Limit_mean_group_week', 'Limit_skew_group_week', 'Limit_mean_group_minute', 'Limit_skew_group_minute',
           'Limit_mean_group_hour', 'Limit_skew_group_hour', 'Limit_mean_group_ZipCountry', 'Limit_max_group_ZipCountry', 'Limit_min_group_ZipCountry', 'Limit_skew_group_ZipCountry', 'Limit_mean_group_UniqCountry', 
           'Limit_min_group_UniqCountry', 'Limit_skew_group_UniqCountry', 'Limit_min_group_P_emaildomain', 'Limit_max_group_ProductCD', 'Limit_min_group_ProductCD', 'Limit_mean_group_cardnumber', 'Limit_max_group_cardnumber', 
           'Limit_min_group_cardnumber', 'Limit_max_group_month', 'Limit_min_group_month', 'Limit_max_group_day', 'Limit_min_group_day', 'Limit_max_group_week', 'Limit_min_group_week', 'Limit_max_group_minute',
           'Limit_min_group_minute', 'Limit_max_group_hour', 'Limit_min_group_hour', 'Limit_max_group_UniqCountry']

In [None]:
# Fill the gaps of a derived column with the value a sibling column holds on
# the same row, for both frames.
#
# `Series.fillna(other_series)` replaces the original chained form
# `frame.col[mask] = frame.other[mask]`. The chained form assigns through a
# temporary Series (SettingWithCopyWarning) and has no effect on the frame when
# pandas copy-on-write is enabled. Both operands come from the same frame, so
# they share one index and the fill is row-for-row.
fallback_columns = {
    # column with gaps -> column that supplies the replacement value
    "ProductAmtRatio_min_group_P_emaildomain": "ProductAmtRatio_min_group_hour",
    "D15_to_std_card4": "D15_to_mean_card4",
}

for frame in (train, test):
    for target_col, source_col in fallback_columns.items():
        frame[target_col] = frame[target_col].fillna(frame[source_col])

In [None]:
# Cast the categorical columns to object and the numeric columns to float32 in
# both frames.
#
# `kategorik` names a few columns more than once (id_30, id_31, id_33).
# Selecting with repeated labels yields repeated columns, and assigning such a
# selection back label-by-label can fail on recent pandas versions. The label
# lists are therefore de-duplicated (first occurrence kept, order preserved)
# before they are used; the shared `kategorik` / `numerik` lists themselves are
# left untouched.
kategorik_cols = list(dict.fromkeys(kategorik))
numerik_cols = list(dict.fromkeys(numerik))

for frame in (train, test):
    frame[kategorik_cols] = frame[kategorik_cols].astype("object")
    frame[numerik_cols] = frame[numerik_cols].astype("float32")


# Column names to discard ("silinecekler" = "the ones to be deleted").
# The next cell removes every name listed here from `numerik`.
# A few names appear twice (e.g. "TransactionFreqHour_mean_group_day",
# "TransactionFreqHour_min_group_ZipCountry", "Limit_max_group_month"); this is
# harmless for a membership test.
silinecekler = ["Limit_mean_group_P_emaildomain", "Limit_mean_group_week","TransactionAmt_mean_group_ProductCD", "V320","TransactionFreqDaily_skew_group_day","TransactionFreqHour_skew_group_day",
                "TransactionAmt_max_group_week","D2","ProductAmtRatio_skew_group_day","TransactionAmt_skew_group_day","Cmean", "TransactionAmt_to_mean_card1","TransactionAmt_to_min_card_id",
                "TransactionFreqHour_skew_group_cardnumber" ,"Limit_mean_group_minute","diff_adrr_plus","Vsum_g1","TransactionFreqHour_mean_group_hour","TransactionFreqDaily_mean_group_ProductCD",
                "TransactionFreqHour_mean_group_day","TransactionFreqHour_min_group_day","TransactionFreqHour_mean_group_day","TransactionFreqDaily_mean_group_month","V103","TransactionAmt_mean_group_ZipCountry",
                "TransactionAmt_to_std_card_id","V306","V127","TransactionAmt_to_std_card4","D15_to_mean_card4","ProductAmtRatio_mean_group_month","TransactionAmt_mean_group_UniqCountry","Vsd_g2", "C2",
                "TransactionFreqHour_min_group_ZipCountry","Limit_min_group_minute","D15_to_mean_addr1","C14","TransactionFreqHour_min_group_ZipCountry","TransactionAmt_mean_group_P_emaildomain",
                "diff_adrr", "Limit_mean_group_ProductCD", "TransactionAmt_mean_group_week","TransactionFreqDaily_max_group_minute","V101","TransactionAmt_to_min_card4",  "Limit_max_group_ZipCountry", 
                "TransactionFreqDaily_skew_group_minute", "V96", "V280", "TransactionFreqDaily_mean_group_minute", "ProductAmtRatio_mean_group_week", "Limit_skew_group_day", "TransactionFreqHour_mean_group_month",
                "TransactionAmt_mean_group_hour", "V308","TransactionFreqDaily_mean_group_P_emaildomain", "ProductAmtRatio_mean_group_hour", "id_02_to_mean_card4", "TransactionAmt_mean_group_minute", "V307", 
                "V132","TransactionFreqDaily_min_group_UniqCountry","TransactionFreqHour_min_group_cardnumber", "C1", "TransactionAmt_mean_group_day", "Limit_max_group_P_emaildomain", "TransactionAmt_to_mean_card4", 
                "D15", "Vsd_g1", "TransactionFreqHour_mean_group_P_emaildomain", "TransactionFreqHour_skew_group_minute", "Limit_max_group_day", "TransactionFreqDaily_min_group_cardnumber", "Vmean_g2",
                "Limit_mean_group_hour", "Limit_mean_group_month", "V279", "Limit_mean_group_day","CountryProb", "Trans_std_min", "Trans_min_max", "Trans_min_mean", "Limit_skew_group_minute", "TransactionAmt_mean_group_month",
                "TransactionAmt_skew_group_minute", "TransactionFreqHour_min_group_minute", "Csum", "Limit_max_group_month", "Limit_max_group_month", "TransactionFreqDaily_min_group_day", "Limit_min_group_month",
                "Limit_min_group_P_emaildomain", "TransactionFreqHour_min_group_month", "Limit_min_group_ProductCD", "TransactionFreqDaily_min_group_ZipCountry", "Limit_max_group_minute", 
                "TransactionFreqHour_min_group_P_emaildomain", "TransactionFreqHour_min_group_ProductCD", "Limit_max_group_ProductCD", "Trans_min_std","Vmean_g1", "Limit_max_group_UniqCountry", 
                "TransactionFreqDaily_min_group_P_emaildomain","Limit_min_group_week", "TransactionFreqDaily_min_group_week", "TransactionFreqHour_min_group_UniqCountry", "TransactionFreqHour",
                "TransactionFreqDaily_min_group_month", "TransactionFreqDaily", "Limit_max_group_week", "TransactionFreqDaily_min_group_ProductCD", "TransactionFreqDaily_min_group_hour","TransactionFreqHour_min_group_week", 
                "TransactionFreqDaily_min_group_minute", "Limit_min_group_hour", "addr1", "addr2", "Limit_max_group_hour", "Limit_min_group_day","TransactionFreqHour_min_group_hour"]

In [None]:
# Keep `numerik` in step with the columns that survive the drop performed in
# the following cell. Besides everything in `silinecekler`, that cell also
# removes the three ProductAmtRatio columns below from both frames; leaving
# them in `numerik` would make any later `frame[numerik]` lookup raise KeyError.
# A set is used so each membership test is O(1).
numerik_haric = set(silinecekler) | {
    "ProductAmtRatio",
    "ProductAmtRatio_mean_group_minute",
    "ProductAmtRatio_skew_group_minute",
}
numerik = [e for e in numerik if e not in numerik_haric]

In [None]:
# Remove the discarded features from both frames.
#
# The drop list is derived from `silinecekler` rather than repeated as two
# hand-maintained literals: the set of dropped columns is exactly what the
# literals contained, i.e. every name in `silinecekler` plus the three
# ProductAmtRatio columns below. `dict.fromkeys` removes the repeated names
# while preserving order.
ek_silinecekler = [
    "ProductAmtRatio",
    "ProductAmtRatio_mean_group_minute",
    "ProductAmtRatio_skew_group_minute",
]
drop_cols = list(dict.fromkeys(list(silinecekler) + ek_silinecekler))

train = train.drop(columns=drop_cols)
# Only the test frame carries the 'Unnamed: 0' column, so it is dropped there
# alone (presumably a stray CSV index column - TODO confirm).
test = test.drop(columns=drop_cols + ['Unnamed: 0'])

In [None]:
# Columns whose missing values are replaced by the column median
# ("median_doldur" = "fill with median").
median_doldur = [
    "V291",
    "Limit_skew_group_P_emaildomain",
    "ProductAmtRatio_skew_group_P_emaildomain",
    "card5",
    "ProductAmtRatio_max_group_P_emaildomain",
    "TransactionFreqHour_skew_group_P_emaildomain",
    "TransactionFreqHour_max_group_P_emaildomain",
    "TransactionFreqDaily_skew_group_P_emaildomain",
    "TransactionFreqDaily_max_group_P_emaildomain",
    "TransactionAmt_skew_group_P_emaildomain",
    "V129",
    "card3",
    "V131",
    "V135",
    "V136",
    "V137",
    "V294",
    "TransactionAmt_to_std_card1",
    "V296",
    "id_02_to_std_card1",
    "V309",
    "V310",
    "V311",
    "V312",
    "V313",
    "V314",
    "V315",
    "V319",
    "V321",
    "D15_to_std_card1",
    "screen_width",
    "screen_height",
]

# Columns whose missing values are replaced by the column mean
# ("mean_doldur" = "fill with mean").
# NOTE(review): "V296" is also listed in median_doldur; if the median pass runs
# first, the mean pass finds nothing left to fill in that column - confirm
# which statistic is intended.
mean_doldur = [
    "ProductAmtRatio_mean_group_P_emaildomain",
    "D15_to_mean_card1",
    "TransactionAmt_min_group_P_emaildomain",
    "id_02_to_std_card4",
    "card2",
    "V99",
    "V130",
    "V285",
    "V296",
    "id_02_to_mean_card1",
    "TransactionAmt_max_group_P_emaildomain",
]

In [None]:
def nan_to_median(df, columns=None):
    """Replace missing values with the column median, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place; the median is computed from
        the very frame that is passed in.
    columns : iterable of str, optional
        Columns to impute. Defaults to the notebook-level `median_doldur` list.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    if columns is None:
        columns = median_doldur
    for col in list(columns):
        df[col] = df[col].fillna(df[col].median())
    return df

def nan_to_mean(df, columns=None):
    """Replace missing values with the column mean, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place; the mean is computed from
        the very frame that is passed in.
    columns : iterable of str, optional
        Columns to impute. Defaults to the notebook-level `mean_doldur` list.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    if columns is None:
        columns = mean_doldur
    for col in list(columns):
        df[col] = df[col].fillna(df[col].mean())
    return df

# Median imputation first, then mean imputation; each frame is imputed with
# statistics computed from that same frame.
train, test = nan_to_median(train), nan_to_median(test)
train, test = nan_to_mean(train), nan_to_mean(test)

# Constant imputation, followed by the D4 <- D10 back-fill, for both frames.
#
# Every fill is written back with an explicit column assignment. The original
# `frame[col].fillna(value, inplace=True)` and `frame.D4[mask] = ...` forms act
# on a temporary Series: pandas warns about them and, with copy-on-write
# enabled, they leave the frame unchanged.
constant_fills = {
    "D15_to_std_addr1": 0,
    "D10": 0,
    "V283": 1.4,              # NOTE(review): origin of 1.4 not shown here - confirm
    "D15_to_std_card4": 0.6,  # NOTE(review): origin of 0.6 not shown here - confirm
    "D1": 0,
    "dist1": 8,               # NOTE(review): origin of 8 not shown here - confirm
}

for frame in (train, test):
    for col, fill_value in constant_fills.items():
        frame[col] = frame[col].fillna(fill_value)

    # D10 was zero-filled just above, so every missing D4 receives a value.
    frame["D4"] = frame["D4"].fillna(frame["D10"])

# Derived features, built identically for both frames.
# The sums use `+` on purpose: a missing operand makes the result missing.
for frame in (train, test):
    v86_plus_v87 = frame["V86"] + frame["V87"]

    frame["V39_V52_cor"] = frame["V39"] + frame["V52"]
    # Written as V44 + (V86 + V87) / V45 + (V86 + V87); see the reassociation
    # note in the test section if bit-exact float sums matter.
    frame["V44_V86_V87_cor"] = frame["V44"] + frame["V86"] + frame["V87"]
    frame["V45_V86_V87_cor"] = frame["V45"] + frame["V86"] + frame["V87"]
    frame["addr1_1_addr2_2"] = frame["addr1_1"] + frame["addr2_2"]

    # Fractional part of the amount, rounded to two decimals.
    amount = frame["TransactionAmt"]
    frame["cents"] = np.round(amount - np.floor(amount), 2)

    del v86_plus_v87

# Compress the V-columns into 10 principal components, fitted on train and
# applied to both frames.

# Every column whose name contains an upper-case "V" is selected, then the last
# 12 matches are cut off.
# NOTE(review): the substring test also matches non-raw columns (e.g. names
# like "TrVersion" or the "*_cor" sums), and the fixed `[:-12]` slice depends on
# the current column order - confirm that exactly the intended columns remain.
v_features = [col for col in train.columns if "V" in col]
v_features = v_features[:-12]

pca = PCA(n_components = 10)
pca.fit(train[v_features])

# The component frames are built on the index of the frame they are attached
# to. Without it they would get a fresh 0..n-1 index and `concat(axis=1)`,
# which aligns on index labels, would misalign the rows (and add all-NaN rows)
# whenever the frame's own index is anything other than 0..n-1.
pca_frame = pd.DataFrame(pca.transform(train[v_features]), index = train.index)
pca_frame.rename(columns = lambda x: "PCA_" + str(x), inplace = True)

train = pd.concat([train, pca_frame], axis = 1)

pca_frame = pd.DataFrame(pca.transform(test[v_features]), index = test.index)
pca_frame.rename(columns = lambda x: "PCA_" + str(x), inplace = True)

test = pd.concat([test, pca_frame], axis = 1)

In [None]:
# Configuration of the first LightGBM model.
# SEED and nfolds are assigned before the parameter dict because the dict
# literal reads SEED while it is being built; in the original order a fresh
# kernel raised NameError (or silently picked up a stale SEED from an earlier
# run).
SEED = 543210
nfolds = 20

lgb_param = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.5,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':800,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                }
# Stratified K-fold training of the first model: out-of-fold predictions go to
# `mvalid`, fold-averaged test predictions to `predictions`.
folds = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=SEED)

feature_importance_df = np.zeros((train.shape[1], nfolds))
mvalid = np.zeros(len(train))
predictions  = np.zeros(len(test))
aucs = list()

# Stratify on the label vector `y`. The original passed the whole feature
# matrix as the target (`folds.split(train.values, train.values)`), which
# StratifiedKFold cannot stratify on (it expects a 1-D binary/multiclass target).
#
# NOTE(review): `y`, `params` and `num_round` are not defined in this part of
# the notebook; `lgb_param` from the cell above looks like the intended
# parameter set - confirm.
# NOTE(review): 'isFraud' is only removed from `train` in a later cell; if the
# column is still present here the model is trained on its own label - confirm.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, y)):
    print('----')
    print("fold n°{}".format(fold_))
    
    x0,y0 = train.iloc[trn_idx], y[trn_idx]
    x1,y1 = train.iloc[val_idx], y[val_idx]
    
    trn_data = lgb.Dataset(x0, label= y0); val_data = lgb.Dataset(x1, label= y1)
    
    clf = lgb.train(params, trn_data, 
                    num_round, valid_sets = [trn_data, val_data], 
                    verbose_eval=2000, 
                    early_stopping_rounds = 1000)
    
    # Out-of-fold prediction for the rows held out in this fold.
    mvalid[val_idx] = clf.predict(x1, num_iteration=clf.best_iteration)
    
    feature_importance_df[:, fold_] = clf.feature_importance()
    
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
    # 'valid_1' is the second entry of valid_sets, i.e. the validation fold.
    aucs.append(clf.best_score['valid_1']['auc'])
print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(aucs), np.std(aucs)))


In [None]:
# Bar chart of the per-feature importance averaged over the folds, most
# important feature first.
ximp = pd.DataFrame({
    'feature': train.columns,
    'importance': feature_importance_df.mean(axis = 1),
})
ximp_sorted = ximp.sort_values(by="importance", ascending=False)

plt.figure(figsize=(10,120))
sns.barplot(x="importance", y="feature", data=ximp_sorted)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()

In [None]:
# OUR 2ND MODEL (these two models were later blended with prediction data
# obtained from outside and uploaded to the site).
# `pop` returns the label column and removes it from `train` in one step.
y_train = train.pop('isFraud')

In [None]:
# Configuration of the second LightGBM model; SEED is assigned first because
# the parameter set reads it.
SEED = 4321

lgb_param = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.5,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=800,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [None]:
# K-fold training of the second model (unshuffled, contiguous folds):
# out-of-fold predictions go to `oof`, fold-averaged test predictions to
# `y_pred`, per-fold feature importances to `importance`.
N = 10
kf = KFold(n_splits=N)

importance = pd.DataFrame(np.zeros((train.shape[1], N)), columns=['Fold_{}'.format(i) for i in range(1, N + 1)], index=train.columns)
scores = []
y_pred = np.zeros(test.shape[0])
oof = np.zeros(train.shape[0])

for fold, (trn_idx, val_idx) in enumerate(kf.split(train, y_train), 1):
    print('Fold {}'.format(fold))

    # Materialise the fold arrays once; the validation arrays are reused for
    # the Dataset, the prediction and the score.
    x_trn, y_trn = train.iloc[trn_idx, :].values, y_train.iloc[trn_idx].values
    x_val, y_val = train.iloc[val_idx, :].values, y_train.iloc[val_idx].values

    trn_data = lgb.Dataset(x_trn, label=y_trn)
    val_data = lgb.Dataset(x_val, label=y_val)

    # NOTE(review): lgb_param also carries 'n_estimators' and
    # 'early_stopping_rounds'; LightGBM is expected to prefer those over the
    # 10000 / 500 passed here - confirm which values are intended.
    clf = lgb.train(lgb_param, trn_data, 10000, valid_sets=[trn_data, val_data], verbose_eval=500, early_stopping_rounds=500)

    # A loop-local name is used for the validation predictions. The original
    # reused and then deleted the global `predictions`, which destroyed the
    # first model's averaged test predictions stored under that name.
    val_pred = clf.predict(x_val)
    importance.iloc[:, fold - 1] = clf.feature_importance()
    oof[val_idx] = val_pred

    score = roc_auc_score(y_val, val_pred)
    scores.append(score)
    print('Fold {} ROC AUC Score {}\n'.format(fold, score))

    y_pred += clf.predict(test) / N

    # Free the per-fold copies before the next fold is materialised.
    del trn_data, val_data, x_trn, y_trn, x_val, y_val, val_pred
    gc.collect()

print('Average ROC AUC Score {} [STD:{}]'.format(np.mean(scores), np.std(scores)))

In [None]:
# Mean importance per feature across the N folds, plotted most important first.
# Only the 'Fold_*' columns are summed, so re-running this cell does not fold
# the previously computed 'Mean_Importance' column into the new mean.
fold_columns = [col for col in importance.columns if str(col).startswith('Fold_')]
importance['Mean_Importance'] = importance[fold_columns].sum(axis=1) / N
importance.sort_values(by='Mean_Importance', inplace=True, ascending=False)

plt.figure(figsize=(15, 120))
sns.barplot(x='Mean_Importance', y=importance.index, data=importance)

plt.xlabel('')
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=15)
plt.title('Mean Feature Importance Between Folds', size=15)

plt.show()