データの読み込みと前処理を行うためのnotebookです。  
モデルの学習と予測にはここで処理をかけたデータを利用するようにして下さい。

## 必要なライブラリのimport

In [43]:
%pip install pandas
%pip install sqlalchemy
%pip install scikit-learn
%pip install imblearn
# %pip install matplotlib
# %pip install seaborn
# %pip install lightgbm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [44]:
import warnings
import time
import sys
import datetime

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta

warnings.simplefilter(action='ignore', category=FutureWarning)

## データの読み込み

In [45]:
def reduce_mem_usage(df, verbose=True):
    """
    Shrink a DataFrame's memory footprint by downcasting numeric columns.

    Every column whose dtype is int16/int32/int64/float16/float32/float64 is
    cast to the smallest dtype of the same kind whose range contains the
    column's minimum and maximum. Other columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. Its columns are replaced in place.
    verbose : bool, optional
        Print the resulting size and the percentage saved (default True).

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Float columns are chosen by value range only, so a column can end up as
    float16 and lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a column that exactly reaches the limit of
                # a dtype (e.g. -128..127) still fits in that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Neither float16 nor float32 covers the range (or min/max is NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def binarize(df):
    """
    Convert the 'Y'/'N' flag columns to 1/0.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the ``authorized_flag`` and ``category_1`` columns.
        Both columns are overwritten in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with 'Y' mapped to 1 and 'N' mapped to 0 in
        both columns. Any other value has no entry in the mapping and
        becomes NaN.
    """
    yes_no = {'Y': 1, 'N': 0}
    flag_columns = ('authorized_flag', 'category_1')
    for name in flag_columns:
        flags = df[name]
        df[name] = flags.map(yes_no)
    return df


def read_data(input_file, reference_date='2018-02-01'):
    """
    Read a card-level CSV file and derive the ``elapsed_time`` feature.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read. It must contain a
        ``first_active_month`` column.
    reference_date : datetime-like, optional
        Date that ``elapsed_time`` is measured against
        (default '2018-02-01').

    Returns
    -------
    pd.DataFrame
        The file contents with ``first_active_month`` parsed to datetime and
        an added ``elapsed_time`` column: the number of days from
        ``first_active_month`` to ``reference_date``.
    """
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (pd.Timestamp(reference_date) - df['first_active_month']).dt.days
    return df

In [46]:
# Card-level tables; read_data also derives elapsed_time from first_active_month.
train = read_data('../data/raw/train.csv')
test = read_data('../data/raw/test.csv')

# Transaction logs: purchase_date is parsed while reading, then the 'Y'/'N'
# flag columns are turned into 1/0 by binarize.
new_transactions = binarize(
    pd.read_csv('../data/raw/new_merchant_transactions.csv', parse_dates=['purchase_date'])
)
historical_transactions = binarize(
    pd.read_csv('../data/raw/historical_transactions.csv', parse_dates=['purchase_date'])
)

## 特徴量作成

In [47]:
def calculate_month_diff(transactions, reference_date=None):
    """
    Add a ``month_diff`` column derived from ``purchase_date`` and ``month_lag``.

    ``month_diff`` is the number of whole 30-day periods between each
    purchase and the reference date, plus the row's ``month_lag``.

    NOTE: a later cell of this notebook defines another function with the
    same name and a different return value; once that cell has run, this
    definition is no longer the one bound to the name.

    Parameters
    ----------
    transactions : pd.DataFrame
        Transaction rows with a datetime ``purchase_date`` column and a
        numeric ``month_lag`` column. Modified in place.
    reference_date : datetime-like, optional
        Date the elapsed time is measured against. Defaults to the current
        date and time, which makes the result depend on when the code runs;
        pass a fixed value to get reproducible features.

    Returns
    -------
    pd.DataFrame
        The same ``transactions`` object with ``month_diff`` added.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    current_date = pd.Timestamp(reference_date)

    elapsed_days = (current_date - transactions['purchase_date']).dt.days
    transactions['month_diff'] = elapsed_days // 30 + transactions['month_lag']
    return transactions


def encode_categorical_columns(df, columns):
    """
    One-hot encode the given categorical columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to encode.
    columns : list of str
        Names of the columns to replace with indicator (dummy) columns.

    Returns
    -------
    pd.DataFrame
        A new frame in which each listed column is replaced by one
        ``<column>_<value>`` indicator column per distinct value.
    """
    encoded = pd.get_dummies(data=df, columns=columns)
    return encoded


def reduce_mem_usage(df, verbose=True):
    """
    Shrink a DataFrame's memory footprint by downcasting numeric columns.

    Every column whose dtype is int16/int32/int64/float16/float32/float64 is
    cast to the smallest dtype of the same kind whose range contains the
    column's minimum and maximum. Other columns are left untouched.

    NOTE: a function with this name is also defined in the data-loading
    cell; once this cell has run, this later definition is the one in
    effect.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. Its columns are replaced in place.
    verbose : bool, optional
        Print the resulting size and the percentage saved (default True).

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Float columns are chosen by value range only, so a column can end up as
    float16 and lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a column that exactly reaches the limit of
                # a dtype (e.g. -128..127) still fits in that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Neither float16 nor float32 covers the range (or min/max is NaN).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def aggregate_transactions(history):
    """
    Aggregate transaction rows into one row of statistics per ``card_id``.

    Parameters
    ----------
    history : pd.DataFrame
        Transaction rows. Besides ``card_id`` it must contain every column
        named in the aggregation spec below (including the one-hot
        ``category_2_*`` / ``category_3_*`` columns, ``purchase_month`` and
        ``month_diff``). The caller's frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ``card_id`` with a ``transactions_count`` column followed
        by the aggregated columns, each named ``<column>_<statistic>``.
    """
    # Work on a shallow copy: assigning purchase_date below rebinds the column
    # on the copy only, so the caller keeps its datetime column and the other
    # columns are not duplicated in memory.
    history = history.copy(deep=False)
    # Integer timestamp scaled by 1e-9 so ptp/min/max operate on plain floats
    # (seconds, assuming nanosecond-resolution datetimes).
    history['purchase_date'] = np.asarray(pd.DatetimeIndex(history['purchase_date']).astype(np.int64)) * 1e-9

    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }

    agg_history = history.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)

    # Number of transaction rows per card.
    df = (history.groupby('card_id').size().reset_index(name='transactions_count'))

    agg_history = pd.merge(df, agg_history, on='card_id', how='left')

    return agg_history


def aggregate_per_month(history):
    """
    Summarise monthly transaction statistics for each card.

    The rows are first aggregated per (``card_id``, ``month_lag``) pair, then
    the mean and standard deviation of those monthly values are taken per
    ``card_id``.

    Parameters
    ----------
    history : pd.DataFrame
        Transaction rows with ``card_id``, ``month_lag``,
        ``purchase_amount`` and ``installments`` columns.

    Returns
    -------
    pd.DataFrame
        One row per ``card_id``. Columns are named
        ``<monthly column>_mean`` / ``<monthly column>_std``; ``month_lag``
        itself is among the summarised monthly columns.
    """
    stats = ['count', 'sum', 'mean', 'min', 'max', 'std']

    # Stage 1: statistics per card and month.
    monthly = history.groupby(['card_id', 'month_lag']).agg(
        {'purchase_amount': stats, 'installments': stats}
    )
    monthly.columns = ['_'.join(pair).strip() for pair in monthly.columns.values]
    monthly = monthly.reset_index()

    # Stage 2: mean / std of the monthly values per card.
    per_card = monthly.groupby('card_id').agg(['mean', 'std'])
    per_card.columns = ['_'.join(pair).strip() for pair in per_card.columns.values]
    return per_card.reset_index()


def successive_aggregates(df, field1, field2):
    """
    Aggregate ``field2`` in two steps: per (card, ``field1``), then per card.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction rows containing ``card_id``, ``field1`` and ``field2``.
    field1 : str
        Column used as the secondary grouping key in the first step.
    field2 : str
        Column whose values are aggregated.

    Returns
    -------
    pd.DataFrame
        One row per ``card_id`` with the mean, min, max and std of the
        per-``field1`` means of ``field2``, in columns named
        ``<field1>_<field2>_<statistic>``.
    """
    # Step 1: mean of field2 for every (card_id, field1) pair.
    group_means = df.groupby(['card_id', field1])[field2].mean().reset_index()
    # Step 2: spread of those means within each card.
    summary = group_means.groupby('card_id')[field2].agg(['mean', 'min', 'max', 'std'])
    summary.columns = ['_'.join((field1, field2, stat)) for stat in summary.columns.values]
    return summary.reset_index()


In [48]:
# Data preparation
# Make sure purchase_date is a datetime column
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])

# Compute the month difference
historical_transactions = calculate_month_diff(historical_transactions)
new_transactions = calculate_month_diff(new_transactions)

# One-hot encode the categorical columns
historical_transactions = encode_categorical_columns(historical_transactions, ['category_2', 'category_3'])
new_transactions = encode_categorical_columns(new_transactions, ['category_2', 'category_3'])

# Reduce memory usage
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

# Add purchase_month while each frame is still whole. Doing this before the
# authorized / unauthorized split avoids assigning a column to a filtered
# selection of another frame (the SettingWithCopyWarning case); both halves
# of the split inherit the column.
historical_transactions['purchase_month'] = historical_transactions['purchase_date'].dt.month
new_transactions['purchase_month'] = new_transactions['purchase_date'].dt.month

# Mean of authorized_flag per card (computed before the split below)
agg_fun = {'authorized_flag': ['mean']}
auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)

# Split on authorized_flag. From here on historical_transactions holds only
# the rows with authorized_flag == 0.
authorized_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]
historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 0]

# Aggregate each transaction set per card and prefix the feature names
history = aggregate_transactions(historical_transactions)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]

authorized = aggregate_transactions(authorized_transactions)
authorized.columns = ['auth_' + c if c != 'card_id' else c for c in authorized.columns]

new = aggregate_transactions(new_transactions)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]

# Per-month aggregation of the authorized transactions
final_group = aggregate_per_month(authorized_transactions)

# Two-step aggregates on the new-merchant transactions
additional_fields = successive_aggregates(new_transactions, 'category_1', 'purchase_amount')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'installments', 'purchase_amount'), on='card_id', how='left')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'city_id', 'purchase_amount'), on='card_id', how='left')
additional_fields = additional_fields.merge(successive_aggregates(new_transactions, 'category_1', 'installments'), on='card_id', how='left')


Mem. usage decreased to 1332.66 Mb (57.1% reduction)
Mem. usage decreased to 86.12 Mb (58.9% reduction)


In [49]:
# Left-join every per-card feature table onto train and test, in this order
for feature_table in (history, authorized, new, final_group, auth_mean, additional_fields):
    train = pd.merge(train, feature_table, on='card_id', how='left')
    test = pd.merge(test, feature_table, on='card_id', how='left')

In [50]:
# train.shape

(201917, 164)

In [51]:
# Reload the raw transaction files. The frames used so far were changed by the
# earlier steps (flags binarized, one-hot columns added, historical_transactions
# narrowed to authorized_flag == 0), so the following features start again from
# the files on disk.
new_transactions = pd.read_csv('../data/raw/new_merchant_transactions.csv', parse_dates=['purchase_date'])

historical_transactions = pd.read_csv('../data/raw/historical_transactions.csv', parse_dates=['purchase_date'])

In [52]:
# Downcast the numeric columns of the reloaded frames to save memory.
# NOTE(review): reduce_mem_usage picks float dtypes by value range only, so
# purchase_amount may be stored as float16/float32 from here on — confirm the
# precision is acceptable for the amount-based features computed below.
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)


In [53]:

def calculate_month_diff(df, reference_date=None):
    """
    Summarise, per card, the months elapsed between purchases and a reference date.

    For every row the number of complete calendar months between
    ``purchase_date`` and the reference date is computed (years included),
    ``month_lag`` is added to it, and the result is stored in a new
    ``month_elapsed`` column of ``df``. The mean, minimum and maximum of that
    column are then returned for each ``card_id``.

    NOTE: this definition rebinds the name of the earlier function in this
    notebook that adds a ``month_diff`` column and returns the full frame.

    Parameters
    ----------
    df : pd.DataFrame
        Needs ``card_id``, ``purchase_date`` and ``month_lag`` columns.
        Modified in place: ``purchase_date`` is converted with
        ``pd.to_datetime`` and ``month_elapsed`` is added.
    reference_date : datetime-like, optional
        Date the elapsed months are measured against. Defaults to the current
        date and time, which makes the result depend on when the code runs;
        pass a fixed value to get reproducible features.

    Returns
    -------
    pd.DataFrame
        Columns ``card_id``, ``month_elapsed_mean``, ``month_elapsed_min``
        and ``month_elapsed_max`` (statistics stored as float32).
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    current_date = pd.Timestamp(reference_date)

    # Make sure purchase_date is a datetime column
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_date = df['purchase_date']

    # Calendar-month distance, counting the year difference as 12 months each
    # (taking only the month component would wrap around every 12 months).
    month_gap = ((current_date.year - purchase_date.dt.year) * 12
                 + (current_date.month - purchase_date.dt.month))

    # The purchase's day and time of day moved into the reference month, with
    # the day clipped to that month's length. Comparing it with the reference
    # date tells whether the last counted month is actually complete.
    day_in_month = purchase_date.dt.day.clip(upper=current_date.days_in_month)
    month_start = current_date.normalize().replace(day=1)
    anniversary = (month_start
                   + pd.to_timedelta(day_in_month - 1, unit='D')
                   + (purchase_date - purchase_date.dt.normalize()))

    # Count complete months only (truncate toward zero in both directions).
    is_past = purchase_date <= current_date
    not_reached = (anniversary > current_date).astype('int64')
    already_passed = (anniversary < current_date).astype('int64')
    df['month_elapsed'] = month_gap + np.where(is_past, -not_reached, already_passed)

    # Offset by month_lag
    df['month_elapsed'] += df['month_lag']

    # Per-card statistics of month_elapsed
    month_diff_mean = df.groupby('card_id')['month_elapsed'].agg(['mean', 'min', 'max']).astype('float32').reset_index()

    # Rename the statistic columns and return
    month_diff_result = month_diff_mean.rename(columns={
        'mean': 'month_elapsed_mean',
        'min': 'month_elapsed_min',
        'max': 'month_elapsed_max'
    })

    return month_diff_result

In [54]:
# For historical_transactions, only the rows whose authorized_flag is "N" are used.
# The explicit .copy() matters: calculate_month_diff adds/overwrites columns on its input.
fill0_history = historical_transactions[historical_transactions['authorized_flag'] == "N"].copy()

history_month_diff = calculate_month_diff(fill0_history)
# new_month_diff = calculate_month_diff(new_transactions)

# Rename the columns with a history_ prefix (assigned by position, so the
# list must follow the order of the returned columns)
history_month_diff.columns = ['card_id', 'history_month_elapsed_mean', 'history_month_elapsed_min', 'history_month_elapsed_max']
# new_month_diff.columns = ['card_id', 'new_month_elapsed_mean', 'new_month_elapsed_min', 'new_month_elapsed_max']
# new_month_diff.head()

In [55]:
# Left-join the month-elapsed features onto train and test
train = train.merge(history_month_diff, on='card_id', how='left')
test = test.merge(history_month_diff, on='card_id', how='left')

# train = pd.merge(train, new_month_diff, on='card_id', how='left')
# test = pd.merge(test, new_month_diff, on='card_id', how='left')

In [56]:
# 追加の特徴量[2]
# 最後に購入した日の購入額
# 最後に購入した日

def last_purcase_amount(df):
    """
    Return, per card, the mean purchase amount at its latest purchase timestamp.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction rows with ``card_id``, ``purchase_date`` and
        ``purchase_amount`` columns.

    Returns
    -------
    pd.DataFrame
        One row per ``card_id`` with a ``purchase_amount`` column holding the
        mean amount of the rows stamped with that card's maximum
        ``purchase_date``.
    """
    latest = df.groupby('card_id')['purchase_date'].max().reset_index()
    amounts = df[['card_id', 'purchase_date', 'purchase_amount']]

    # The inner join keeps only the rows stamped with each card's latest purchase_date
    at_latest = latest.merge(amounts, on=['card_id', 'purchase_date'], how='inner')

    # Several rows can share that timestamp, so they are averaged
    return at_latest.groupby('card_id')['purchase_amount'].mean().reset_index()
# re_last_purchase.drop(columns=['purcahse_date'], inplace=True)

In [57]:
# Mean purchase amount at each card's latest purchase timestamp (historical data only)
history_amount = last_purcase_amount(historical_transactions)
# new_amount = last_purcase_amount(new_transactions)

# Drop unneeded columns (disabled)
# history_amount = history_amount.drop(columns=['purchase_date'])
# new_amount = new_amount.drop(columns=['purchase_date'])

# Rename the columns (assigned by position)
history_amount.columns = ['card_id', 'history_last_amount']
# new_amount.columns = ['card_id', 'new_last_amount']

In [58]:
# Left-join the last-purchase amount onto train and test
train = train.merge(history_amount, on='card_id', how='left')
test = test.merge(history_amount, on='card_id', how='left')

# train = pd.merge(train, new_amount, on='card_id', how='left')
# test = pd.merge(test, new_amount, on='card_id', how='left')

In [59]:
# card_idの重複確認
# duplicate_cards = re_last_purchase[train.duplicated('card_id')]
# print("Duplicate card_ids:\n", duplicate_cards['card_id'].unique())

In [1]:
# 追加の特徴量[3]
# 一番取引額が多い金額
def max_amount(df):
    """
    Return the largest and smallest purchase amount of each card.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction rows with ``card_id`` and ``purchase_amount`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``card_id``, ``max_purchase_amount`` and
        ``min_purchase_amount`` (float32), always in that order.
    """
    # The statistics are given as a list, not a set: a set has no defined
    # order, so the resulting column order could differ between runs.
    most_amount = df.groupby('card_id')['purchase_amount'].agg(['max', 'min']).astype('float32').reset_index()
    most_amount.rename(columns={'max': 'max_purchase_amount', 'min': 'min_purchase_amount'}, inplace=True)
    return most_amount

In [61]:
history_amount = max_amount(historical_transactions)
# new_max_amount = max_amount(new_transactions)

# Rename by column name instead of by position, then fix the column order
# explicitly, so the max/min labels cannot be swapped if the columns come
# back in a different order.
history_amount = history_amount.rename(columns={
    'max_purchase_amount': 'history_max_purchase_amount',
    'min_purchase_amount': 'history_min_purchase_amount',
})[['card_id', 'history_max_purchase_amount', 'history_min_purchase_amount']]
# new_max_amount.columns = ['card_id', 'new_max_purchase_amount', 'new_max_purchase_amount']

In [62]:
# Left-join the max / min purchase amounts onto train and test
train = train.merge(history_amount, on='card_id', how='left')
test = test.merge(history_amount, on='card_id', how='left')

# train = pd.merge(train, new_max_amount, on='card_id', how='left')
# test = pd.merge(test, new_max_amount, on='card_id', how='left')

In [63]:
# 最高購入金額 - 最小購入金額

def purchase_amount_diff(df):
    """
    Return, per card, the gap between its largest and smallest purchase amount.

    Parameters
    ----------
    df : pd.DataFrame
        Transaction rows with ``card_id`` and ``purchase_amount`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``card_id`` and ``max_min_amount_dff`` (maximum minus
        minimum ``purchase_amount`` of the card).
    """
    # The statistics are given as a list rather than a set so the
    # intermediate column order is deterministic.
    amount_diff = df.groupby('card_id')['purchase_amount'].agg(['max', 'min']).reset_index()
    amount_diff['max_min_amount_dff'] = amount_diff['max'] - amount_diff['min']
    diff_amount = amount_diff[['card_id', 'max_min_amount_dff']]
    return diff_amount

In [64]:
# Max-minus-min purchase amount per card (historical data only)
history_amount_diff = purchase_amount_diff(historical_transactions)
# new_amount_diff = purchase_amount_diff(new_transactions)

# Rename the columns (assigned by position)
history_amount_diff.columns = ['card_id', 'history_max_min_amount']
# new_amount_diff.columns = ['card_id', 'new_max_min_amount']

In [65]:
# Left-join the max-minus-min amount onto train and test
train = train.merge(history_amount_diff, on='card_id', how='left')
test = test.merge(history_amount_diff, on='card_id', how='left')

# train = pd.merge(train, new_amount_diff, on='card_id', how='left')
# test = pd.merge(test, new_amount_diff, on='card_id', how='left')

In [66]:
# Additional feature [5]
# Make sure purchase_date is a datetime column
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])

# Reference point for the elapsed-time computation.
# NOTE(review): this is the run date, so month_diff (and kil_feature) changes
# with the day the notebook is executed — consider a fixed date.
current_date = pd.Timestamp(datetime.datetime.today())

# First and last purchase timestamp of each card
first_last_dates = historical_transactions.groupby('card_id').agg({
    'purchase_date': ['min', 'max']
}).reset_index()

# Length of the transaction period in days
first_last_dates.columns = ['card_id', 'first_purchase_date', 'last_purchase_date']
first_last_dates['transaction_days'] = (first_last_dates['last_purchase_date'] - first_last_dates['first_purchase_date']).dt.days

# Keep only the columns needed below. The explicit copy makes df_his an
# independent frame, so adding and dropping columns on it does not trigger
# SettingWithCopyWarning.
df_his = historical_transactions[['card_id', 'purchase_date', 'month_lag']].copy()

# Free the full transaction table
del historical_transactions

# Whole 30-day periods between each purchase and the reference date
df_his['month_diff'] = (((current_date - df_his['purchase_date']).dt.days) // 30).astype('int16')
# Offset by month_lag
df_his['month_diff'] += df_his['month_lag']

# Drop the columns that are no longer needed
df_his.drop(columns=['purchase_date', 'month_lag'], inplace=True)

# Mean month_diff per card
df_his_mean = df_his.groupby('card_id').agg({'month_diff': 'mean'}).astype('float32').reset_index()

# Build kil_feature = mean month_diff / transaction period in days.
# NOTE(review): transaction_days is 0 when a card's first and last purchase
# fall within the same day, which makes this division yield inf (or NaN) —
# confirm the downstream model handles that.
killer_feature = pd.merge(df_his_mean, first_last_dates, on='card_id', how='inner')
killer_feature['kil_feature'] = killer_feature['month_diff'] / killer_feature['transaction_days']
killer_feature['kil_feature'] = killer_feature['kil_feature'].astype('float32')

# Drop the intermediate columns
killer_feature.drop(columns=['first_purchase_date', 'month_diff', 'last_purchase_date', 'transaction_days'], inplace=True)

In [67]:
# Left-join the added feature onto train and test
train = train.merge(killer_feature, on='card_id', how='left')
test = test.merge(killer_feature, on='card_id', how='left')

In [68]:
# # card_idの重複確認
# duplicate_cards = train[train.duplicated('card_id')]
# print("Duplicate card_ids:\n", duplicate_cards['card_id'].unique())

In [69]:
# train.shape

In [70]:
# historical_transactions = pd.read_csv('../data/raw/historical_transactions.csv', parse_dates=['purchase_date'])
# historical_transactions = reduce_mem_usage(historical_transactions)

In [71]:
# def add_weekday_flags(data):
#     # 曜日を数字に変換
#     data['purchase_date'] = pd.to_datetime(data['purchase_date'])
#     data['purchase_dayofweek'] = data['purchase_date'].dt.dayofweek.astype('int32')
#     # 曜日フラグを立てる
#     # 月曜日フラグ
#     data = (data
#         .assign(flag_monday=(data['purchase_dayofweek'] == 0).astype('int32')))
#     # # 火曜日フラグ
#     data = (data
#         .assign(flag_tuesday=(data['purchase_dayofweek'] == 1).astype('int32')))
#     # # 水曜日フラグ
#     data = (data
#         .assign(flag_wednesday=(data['purchase_dayofweek'] == 2).astype('int32')))
#     # # 木曜日フラグ
#     data = (data
#         .assign(flag_thursday=(data['purchase_dayofweek'] == 3).astype('int32')))
#     # # 金曜日フラグ
#     data = (data
#         .assign(flag_friday=(data['purchase_dayofweek'] == 4).astype('int32')))
#     # 土曜日フラグ
#     data = (data
#         .assign(flag_saturday=(data['purchase_dayofweek'] == 5).astype('int32')))
#     # 日曜日フラグ
#     data = (data
#         .assign(flag_sunday=(data['purchase_dayofweek'] == 6).astype('int32')))
#     return data

# df = historical_transactions[['card_id', 'purchase_date']]

# # 関数を適用して曜日フラグを追加
# week = add_weekday_flags(df)
# df_week_flag = week[['card_id','flag_monday', 'flag_tuesday', 'flag_wednesday', 'flag_thursday', 'flag_friday', 'flag_saturday', 'flag_sunday']]
# df_week_flag.head()
# mode_week = week.groupby('card_id')['purchase_dayofweek'].agg(lambda x: x.mode().iloc[0]).reset_index()
# week = pd.merge(week, mode_week, on='card_id', how='left')

# 結果確認
# week[['card_id', 'purchase_date', 'flag_monday', 'flag_tuesday', 'flag_wednesday', 'flag_thursday', 'flag_friday', 'flag_saturday', 'flag_sunday']].head(10)


In [72]:
# df_week_flag = df_week_flag.groupby('card_id').agg(
#     sum_flag_monday=('flag_monday', 'sum'),
#     sum_flag_tuesday=('flag_tuesday', 'sum'),
#     sum_flag_wednesday=('flag_wednesday', 'sum'),
#     sum_flag_thursday=('flag_thursday', 'sum'),
#     sum_flag_friday=('flag_friday', 'sum'),
#     sum_flag_saturday=('flag_saturday', 'sum'),
#     sum_flag_sunday=('flag_sunday', 'sum')
# ).reset_index()
# df_week_flag.head(30)

In [73]:
# train = pd.merge(train, df_week_flag, on='card_id', how='left')
# test = pd.merge(test, df_week_flag, on='card_id', how='left')

In [74]:
train.shape

(201917, 172)

## 前処理終了後のデータの保存
- 基本的にモデルの学習・ハイパーパラメータチューニングを行う際にはここで作成した同じデータを使い回して下さい。
- 適宜前処理を変更した場合はファイル名を変えるなどして管理して下さい。

In [77]:
# Save the processed data. index=False (the documented boolean form) keeps
# the row index out of the CSV files.
train.to_csv('../data/processed/processed20240626_2_train.csv', index=False)
test.to_csv('../data/processed/processed20240626_2_test.csv', index=False)