データの読み込みと前処理を行うためのnotebookです。  
モデルの学習と予測にはここで処理をかけたデータを利用するようにして下さい。

## 必要なライブラリのimport

In [31]:
import warnings
import time
import sys
import datetime

import numpy as np
import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

## データの読み込み

In [32]:
def reduce_mem_usage(df, verbose=True):
    """
    データフレームのメモリ使用量を減らす。

    Parameters
    ----------
    df : pd.DataFrame
        メモリ使用量を削減したいデータフレーム。
    verbose : bool, optional
        メモリ使用量の削減結果を出力するかどうか（デフォルトは True）。

    Returns
    -------
    pd.DataFrame
        メモリ使用量が削減されたデータフレーム。
    """

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def binarize(df):
    """
    指定された列を二値化する。

    Parameters
    ----------
    df : pd.DataFrame
        二値化対象のデータフレーム。

    Returns
    -------
    pd.DataFrame
        二値化されたデータフレーム。
    """

    for col in ['authorized_flag', 'category_1']:
        df[col] = df[col].map({'Y': 1, 'N': 0})
    return df


def read_data(input_file):
    """
    指定されたファイルからデータを読み込み、前処理を行う。

    Parameters
    ----------
    input_file : str
        読み込むデータファイルのパス。

    Returns
    -------
    pd.DataFrame
        前処理されたデータフレーム。
    """
    
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (pd.Timestamp('2018-02-01') - df['first_active_month']).dt.days
    return df

In [33]:
train = read_data('../data/raw/train.csv')
test = read_data('../data/raw/test.csv')

new_transactions = pd.read_csv('../data/raw/new_merchant_transactions.csv',
                               parse_dates=['purchase_date'])

historical_transactions = pd.read_csv('../data/raw/historical_transactions.csv',
                                      parse_dates=['purchase_date'])

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [None]:

df_merge = pd.merge(train, history, on='card_id', how='left')

new_transactions = pd.read_csv('../data/raw/new_merchant_transactions.csv',
                               parse_dates=['purchase_date'])

historical_transactions = pd.read_csv('../data/raw/historical_transactions.csv',
                                      parse_dates=['purchase_date'])

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

## 特徴量作成

In [34]:
def calculate_month_diff(transactions):
    """
    purchase_dateとmonth_lagを基にmonth_diffを計算する。
    
    Parameters
    ----------
    transactions : pd.DataFrame
        取引データのデータフレーム。
    
    Returns
    -------
    pd.DataFrame
        month_diff列が追加されたデータフレーム。
    """
    current_date = pd.Timestamp(datetime.datetime.today())
    transactions['month_diff'] = ((current_date - transactions['purchase_date']).dt.days) // 30
    transactions['month_diff'] += transactions['month_lag']
    return transactions


def encode_categorical_columns(df, columns):
    """
    指定されたカテゴリカル列をワンホットエンコーディングする。
    
    Parameters
    ----------
    df : pd.DataFrame
        エンコード対象のデータフレーム。
    columns : list of str
        エンコードするカテゴリカル列のリスト。
    
    Returns
    -------
    pd.DataFrame
        ワンホットエンコードされたデータフレーム。
    """
    return pd.get_dummies(df, columns=columns)


def reduce_mem_usage(df, verbose=True):
    """
    データフレームのメモリ使用量を減らす。

    Parameters
    ----------
    df : pd.DataFrame
        メモリ使用量を削減したいデータフレーム。
    verbose : bool, optional
        メモリ使用量の削減結果を出力するかどうか（デフォルトは True）。

    Returns
    -------
    pd.DataFrame
        メモリ使用量が削減されたデータフレーム。
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


def aggregate_transactions(history):
    """
    取引データを集計する。
    
    Parameters
    ----------
    history : pd.DataFrame
        取引データのデータフレーム。
    
    Returns
    -------
    pd.DataFrame
        集計されたデータフレーム。
    """
    history.loc[:, 'purchase_date'] = pd.DatetimeIndex(history['purchase_date']).astype(np.int64) * 1e-9
    
    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        'purchase_day': ['mean', 'max', 'min', 'std'], # 購入日
        'purchase_dayofweek': ['mean', 'max', 'min', 'std'], # 購入曜日
        'is_weekend': ['mean', 'max', 'min', 'std'], # 週末購入フラグ
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }
    
    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)
    
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))
    
    agg_history = pd.merge(df, agg_history, on='card_id', how='left')
    
    return agg_history


def aggregate_per_month(history):
    """
    月ごとの取引データを集計する。
    
    Parameters
    ----------
    history : pd.DataFrame
        取引データのデータフレーム。
    
    Returns
    -------
    pd.DataFrame
        月ごとに集計されたデータフレーム。
    """
    grouped = history.groupby(['card_id', 'month_lag'])

    agg_func = {
        'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'std'],
        'installments': ['count', 'sum', 'mean', 'min', 'max', 'std'],
    }

    intermediate_group = grouped.agg(agg_func)
    intermediate_group.columns = ['_'.join(col).strip() for col in intermediate_group.columns.values]
    intermediate_group.reset_index(inplace=True)

    final_group = intermediate_group.groupby('card_id').agg(['mean', 'std'])
    final_group.columns = ['_'.join(col).strip() for col in final_group.columns.values]
    final_group.reset_index(inplace=True)
    
    return final_group


def successive_aggregates(df, field1, field2):
    """
    指定されたフィールドを基に連続集計を行う。
    
    Parameters
    ----------
    df : pd.DataFrame
        取引データのデータフレーム。
    field1 : str
        集計の基準となるフィールド。
    field2 : str
        集計されるフィールド。
    
    Returns
    -------
    pd.DataFrame
        連続集計されたデータフレーム。
    """
    t = df.groupby(['card_id', field1])[field2].mean()
    u = pd.DataFrame(t).reset_index().groupby('card_id')[field2].agg(['mean', 'min', 'max', 'std'])
    u.columns = [field1 + '_' + field2 + '_' + col for col in u.columns.values]
    u.reset_index(inplace=True)
    return u


In [35]:
def add_time_features(df, date_col):
    # df = df['card_id']
    df['purchase_month'] = df[date_col].dt.month
    df['purchase_day'] = df[date_col].dt.day
    df['purchase_dayofweek'] = df[date_col].dt.dayofweek
    df['is_weekend'] = (df[date_col].dt.weekday >= 5).astype(int)
    return df

# データ準備
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])

# authorized_flagに基づいてデータを分割
authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]

# 時間関連の特徴量作成
historical_transactions = add_time_features(historical_transactions, 'purchase_date')
new_transactions = add_time_features(new_transactions, 'purchase_date')
authorized_transactions = add_time_features(authorized_transactions, 'purchase_date')

In [36]:
# 月の差を計算
historical_transactions = calculate_month_diff(historical_transactions)
new_transactions = calculate_month_diff(new_transactions)

# カテゴリカル列をワンホットエンコーディング
historical_transactions = encode_categorical_columns(historical_transactions, ['category_2', 'category_3'])
new_transactions = encode_categorical_columns(new_transactions, ['category_2', 'category_3'])

# メモリ使用量の削減
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

# authorized_flagの平均を計算
agg_fun = {'authorized_flag': ['mean']}
auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

# authorized_flagに基づいてデータを分割
authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]

# purchase_month列を追加
historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
authorized_transactions['purchase_month'] = authorized_transactions['purchase_date'].dt.month
new_transactions['purchase_month'] = new_transactions['purchase_date'].dt.month

# データの集計
history = aggregate_transactions(historical_transactions)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]

authorized = aggregate_transactions(authorized_transactions)
authorized.columns = ['auth_' + c if c != 'card_id' else c for c in authorized.columns]

new = aggregate_transactions(new_transactions)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]

# 月ごとのデータの集計
final_group = aggregate_per_month(authorized_transactions)

# 連続集計
additional_fields = successive_aggregates(new_transactions, 'category_1', 'purchase_amount')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'installments', 'purchase_amount'), on='card_id', how='left')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'city_id', 'purchase_amount'), on='card_id', how='left')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'category_1', 'installments'), on='card_id', how='left')


Mem. usage decreased to 144.02 Mb (55.9% reduction)
Mem. usage decreased to 93.60 Mb (60.9% reduction)


In [37]:
# データの結合
train = pd.merge(train, history, on='card_id', how='left')
test = pd.merge(test, history, on='card_id', how='left')

train = pd.merge(train, authorized, on='card_id', how='left')
test = pd.merge(test, authorized, on='card_id', how='left')

train = pd.merge(train, new, on='card_id', how='left')
test = pd.merge(test, new, on='card_id', how='left')

train = pd.merge(train, final_group, on='card_id', how='left')
test = pd.merge(test, final_group, on='card_id', how='left')

train = pd.merge(train, auth_mean, on='card_id', how='left')
test = pd.merge(test, auth_mean, on='card_id', how='left')

train = pd.merge(train, additional_fields, on='card_id', how='left')
test = pd.merge(test, additional_fields, on='card_id', how='left')

In [38]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,elapsed_time,hist_transactions_count,hist_category_1_sum,hist_category_1_mean,...,installments_purchase_amount_max,installments_purchase_amount_std,city_id_purchase_amount_mean,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,245,13.0,0.0,0.0,...,-0.575822,,-0.459,-0.606574,-0.296143,0.155779,0.0,0.0,0.0,
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,396,11.0,2.0,0.181818,...,-0.725911,,-0.725911,-0.725911,-0.725911,,1.0,1.0,1.0,
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,549,2.0,0.0,0.0,...,-0.700195,,-0.700195,-0.700195,-0.700195,,0.0,0.0,0.0,
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,153,,,,...,-0.566895,0.080908,-0.664185,-0.665283,-0.663086,0.001554,0.833333,0.666667,1.0,0.235702
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,92,5.0,3.0,0.6,...,-0.175903,0.199291,-0.534717,-0.671143,-0.326765,0.150666,1.220588,0.941176,1.5,0.395148


In [39]:
train.shape

(201917, 200)

In [40]:
test.shape

(123623, 199)

## 前処理終了後のデータの保存
- 基本的にモデルの学習・ハイパーパラメータチューニングを行う際にはここで作成した同じデータを使い回して下さい。
- 適宜前処理を変更した場合はファイル名を変えるなどして管理して下さい。

In [41]:
# データの保存
train.to_csv('../data/processed/processed20240626_train.csv',index=None)
test.to_csv('../data/processed/processed20240626_test.csv',index=None)

In [42]:
# 追加の特徴量[1]
# authorized_flag=0だけでmonth_diffの平均を取る
# authorized_flag=0のフィルターを作成
filter_authorized = historical_transactions[historical_transactions['authorized_flag'] == "N"].copy()
# 現在の日時を取得
current_date = pd.Timestamp(datetime.datetime.today())
filter_authorized['purchase_date'] = pd.to_datetime(filter_authorized['purchase_date'])

# 月差分（month_diff）を計算し、新しい列として追加
filter_authorized['fil0_month_diff'] = (((current_date - filter_authorized['purchase_date']).dt.days) // 30).astype('int32')

# 月の異常な取引を算出
filter_authorized['fil0_month_diff'] += filter_authorized['month_lag']
# card_idごとにmonth_diffの平均値をとる
month_diff_mean = filter_authorized.groupby('card_id')['fil0_month_diff'].agg(['mean', 'min', 'max']).astype('float32').reset_index()
month_diff = month_diff_mean.rename(columns={
    'mean': 'fill0_month_diff_mean',
    'min': 'fill0_month_diff_min',
    'max': 'fill0_month_dfll_max'
}).reset_index()
# month_diff.head()

# 追加の特徴量[2]
# 最後に購入した日の購入額
# 最後に購入した日
last_purchase_date = historical_transactions.groupby('card_id')['purchase_date'].max().reset_index()

# 最終購入日の金額を取得する
merge_last_purchase = pd.merge(last_purchase_date, historical_transactions[['card_id', 'purchase_date', 'purchase_amount']], on=['card_id', 'purchase_date'], how='inner')

# card_id, purchase_dateごとのpurchase_amountの最大値、最小値、中央値、平均値を算出する。
merge_last_purchase_agg = merge_last_purchase.groupby(['card_id', 'purchase_date'])['purchase_amount'].agg(['mean', 'min', 'max']).astype('float32').reset_index()

# カラム名を変更する
re_last_purchase = merge_last_purchase_agg.rename(columns={
    'mean': 'last_purchase_amount_mean',
    'min': 'last_purchase_amount_min',
    'max': 'last_purchase_amount_max'
}).reset_index()
df_last_purchase = re_last_purchase[['card_id', 'last_purchase_amount_mean', 'last_purchase_amount_min', 'last_purchase_amount_max']]

# 追加の特徴量[3]
# 一番取引額が多い
max_amount = historical_transactions.groupby('card_id').agg({'purchase_amount': 'max'}).astype('float32').reset_index()
max_amount.rename(columns={'purchase_amount': 'max_purchase_amount'}, inplace=True)

# 追加の特徴量[4]
# 最高購入金額 - 最小購入金額
amount_diff = historical_transactions.groupby('card_id')['purchase_amount'].agg({'max', 'min'}).reset_index()
amount_diff['max_min_amount_dff'] = amount_diff['max'] - amount_diff['min']
diff_amount = amount_diff[['card_id', 'max_min_amount_dff']]

# 追加の特徴量[5]
# purchase_date を日付型に変換
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])

# 今日の日付を取得
current_date = pd.Timestamp(datetime.datetime.today())

# first_last_dates を作成
first_last_dates = historical_transactions.groupby('card_id').agg({
    'purchase_date': ['min', 'max']
}).reset_index()

# 取引期間を計算
first_last_dates.columns = ['card_id', 'first_purchase_date', 'last_purchase_date']
first_last_dates['transaction_days'] = (first_last_dates['last_purchase_date'] - first_last_dates['first_purchase_date']).dt.days

# month_lag はそのまま使う
df_his = historical_transactions[['card_id', 'purchase_date', 'month_lag']]

# 不要な列を削除
del historical_transactions

# 月差分（month_diff）を計算し、新しい列として追加
df_his['month_diff'] = (((current_date - df_his['purchase_date']).dt.days) // 30).astype('int16')
df_his['month_diff'] += df_his['month_lag']

# 不要な列を削除
df_his.drop(columns=['purchase_date', 'month_lag'], inplace=True)

# card_id ごとに month_diff の平均値を計算
df_his_mean = df_his.groupby('card_id').agg({'month_diff': 'mean'}).astype('float32').reset_index()

# killer_feature を作成
killer_feature = pd.merge(df_his_mean, first_last_dates, on='card_id', how='inner')
killer_feature['kil_feature'] = killer_feature['month_diff'] / killer_feature['transaction_days']
killer_feature['kil_feature'] = killer_feature['kil_feature'].astype('float32')

# 不要な列を削除
killer_feature.drop(columns=['first_purchase_date', 'month_diff', 'last_purchase_date', 'transaction_days'], inplace=True)

In [43]:
# データの結合
train = pd.merge(train, month_diff, on='card_id', how='left')
test = pd.merge(test, month_diff, on='card_id', how='left')

train = pd.merge(train, df_last_purchase, on='card_id', how='left')
test = pd.merge(test, df_last_purchase, on='card_id', how='left')

train = pd.merge(train, max_amount, on='card_id', how='left')
test = pd.merge(test, max_amount, on='card_id', how='left')

train = pd.merge(train, diff_amount, on='card_id', how='left')
test = pd.merge(test, diff_amount, on='card_id', how='left')

train = pd.merge(train, killer_feature, on='card_id', how='left')
test = pd.merge(test, killer_feature, on='card_id', how='left')

In [44]:
train.shape

(201917, 210)

In [45]:
test.shape

(123623, 209)

In [46]:
# データの保存
train.to_csv('../data/processed/processed20240626_train.csv',index=None)
test.to_csv('../data/processed/processed20240626_test.csv',index=None)