# Data Filtering

In [1]:
import pandas as pd

# Read every raw input table from the local data/ directory.
# Transactions and user logs each come as two files; later cells combine them.
df_member = pd.read_csv(filepath_or_buffer='data/members_v3.csv')
df_transaction = pd.read_csv(filepath_or_buffer='data/transactions.csv')
df_transaction2 = pd.read_csv(filepath_or_buffer='data/transactions_v2.csv')
df_logs = pd.read_csv(filepath_or_buffer='data/user_logs.csv')
df_logs2 = pd.read_csv(filepath_or_buffer='data/user_logs_v2.csv')

In [2]:
# Stack both transaction files row-wise into a single frame.
# Row labels are carried over from each input (the index is not reset),
# so the combined index can contain repeated labels.
transaction_parts = [df_transaction, df_transaction2]
all_df_transaction = pd.concat(transaction_parts)
all_df_transaction.shape

(22978755, 9)

In [4]:
# Print the (rows, columns) shape of each raw table, one per line.
print(
    f"members: {df_member.shape}",
    f"transactions: {all_df_transaction.shape}",
    f"logs: {df_logs.shape}",
    f"logs2: {df_logs2.shape}",
    sep="\n",
)

members: (6769473, 6)
transactions: (22978755, 9)
logs: (392106543, 9)
logs2: (18396362, 9)


In [3]:
# Build `filtered_msno`: users whose transaction history contains both
# is_cancel == 0 and is_cancel == 1, minus users whose most recent
# transaction (largest transaction_date) has is_cancel == 1.

# Step 1 - users whose distinct is_cancel values are exactly {0, 1}.
# Built-in group reductions are used instead of aggregating a Python `set`
# per user, which runs at Python speed for every group and keeps one set
# object per user alive. The three conditions together hold only when the
# distinct values are exactly 0 and 1: with two distinct values counted
# (missing values included), min == 0 and max == 1 leaves no room for
# anything else.
cancel_by_user = all_df_transaction.groupby('msno')['is_cancel']
has_both_cancel_states = (
    (cancel_by_user.nunique(dropna=False) == 2)
    & (cancel_by_user.min() == 0)
    & (cancel_by_user.max() == 1)
)
filtered_msno_initial = has_both_cancel_states.index[has_both_cancel_states].to_list()

# Step 2 - latest transaction per user: sort newest-first within each msno
# and keep the first row of every user.
# NOTE(review): if a user has several rows on the latest transaction_date,
# which of them is kept depends on their order after sorting - confirm this
# tie handling is acceptable.
sorted_df = all_df_transaction.sort_values(
    by=['msno', 'transaction_date'],
    ascending=[True, False],
)
latest_transactions = sorted_df.drop_duplicates(subset=['msno'], keep='first')
latest_is_cancel = latest_transactions[latest_transactions['is_cancel'] == 1]
users_to_exclude = latest_is_cancel['msno'].to_list()

# Step 3 - remove users whose latest transaction is a cancellation.
# sorted() gives the list a deterministic order; the iteration order of a
# set of strings is not guaranteed to be the same from one kernel session
# to the next.
filtered_msno_set = set(filtered_msno_initial)
users_to_exclude_set = set(users_to_exclude)
filtered_msno = sorted(filtered_msno_set - users_to_exclude_set)
print(len(filtered_msno))

282531


In [5]:
# Draw two fixed-size, seeded samples of members - one from users listed in
# filtered_msno and one from everyone else - and stack them.
# The membership mask is computed once and negated for the second group.
in_filtered_msno = df_member['msno'].isin(filtered_msno)

filtered_df_member = df_member[in_filtered_msno].sample(n=150000, random_state=42)
filtered_df_member2 = df_member[~in_filtered_msno].sample(n=150000, random_state=42)

member_samples = [filtered_df_member, filtered_df_member2]
filtered_df_member_all = pd.concat(member_samples)
filtered_df_member_all

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
1108581,2dwuk0KQUnCnRvFnDGFyIslHLqG8lFHkyOvfRNwc5XI=,5,0,female,9,20120309
2860696,Bx13MoR+nVJVVDsatg3fxInFupmO1jSzdkFKMA976t8=,13,33,female,7,20130422
685587,2ouWnOhbWxdFyjEy8qx/8lr15OzZ0PX9tr6xZP4Ihns=,1,0,,7,20140629
3381701,QvZtxnHz7yGh07QiFuq0RN1hL5WrzcbKlQTUiroEEyE=,13,0,,9,20110528
5145811,CUFR22oaKwIwzj4VMey3Mj76B5zawuo2PxsWheNfQ6Q=,5,42,male,9,20141128
...,...,...,...,...,...,...
4459643,u2ZFhHBKeaNkXR5IWV88Id6zd5U6tc7Oxv/kVZ1ZQ/s=,1,0,,4,20161020
5464972,Z3QAHHe2PwgA1TZM7aN7yIO4M6OoT2xdvccutBe6SmE=,1,0,,4,20170311
705634,Ut82CPYSqaueUjOrv47f9knp2qKcGGUTj3h+oXFBCy8=,22,24,female,9,20130110
3134719,S84wwJdls1Zzv7+dKkhVgU1bbL6Laoo/tPNdFmzHlaA=,1,0,,4,20151022


In [6]:
# Keep only transactions that belong to a sampled member, then drop rows
# that are exact duplicates across all columns.
belongs_to_sampled_member = all_df_transaction['msno'].isin(filtered_df_member_all['msno'])
transactions_of_sampled = all_df_transaction[belongs_to_sampled_member]
filtered_df_transaction = transactions_of_sampled.drop_duplicates()
filtered_df_transaction.shape

(3406383, 9)

In [8]:
# Persist the filtered transactions and the sampled members as CSV files.
# With to_csv defaults the frame index is written as the first, unnamed column.
# NOTE(review): if nothing downstream reads that column, index=False would
# keep it out of the files - confirm against the consumers before changing.
filtered_df_transaction.to_csv(path_or_buf='data/transactions_filtered.csv')
filtered_df_member_all.to_csv(path_or_buf='data/members_filtered.csv')

In [9]:
# Restrict both log tables to the sampled members, stack them row-wise and
# drop rows that are exact duplicates across all columns.
sampled_msno = filtered_df_member_all['msno']

filtered_df_logs = df_logs[df_logs['msno'].isin(sampled_msno)]
filtered_df_logs2 = df_logs2[df_logs2['msno'].isin(sampled_msno)]

stacked_logs = pd.concat([filtered_df_logs, filtered_df_logs2])
all_df_logs = stacked_logs.drop_duplicates()
all_df_logs.shape

(61325062, 9)

In [10]:
# Persist the filtered, de-duplicated user logs as CSV.
# With to_csv defaults the frame index is written as the first, unnamed column.
all_df_logs.to_csv(path_or_buf='data/user_logs_filtered.csv')

In [11]:
# Print the (rows, columns) shape of each filtered table, one per line.
print(
    f"members: {filtered_df_member_all.shape}",
    f"transactions: {filtered_df_transaction.shape}",
    f"logs: {all_df_logs.shape}",
    sep="\n",
)

members: (300000, 6)
transactions: (3406383, 9)
logs: (61325062, 9)
