In [97]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import re

In [98]:
# Load the first 500,000 training rows. Forward slashes keep the relative path
# valid on Windows as well as on POSIX systems (backslashes only resolve on Windows).
df_train = pd.read_csv(
    '../../data/general_datasets/train.csv',
    encoding='latin1',
    low_memory=False,
    nrows=500000,
)

In [99]:
# Load the first 10,000 test rows. Forward slashes keep the relative path
# valid on Windows as well as on POSIX systems (backslashes only resolve on Windows).
df_test = pd.read_csv(
    '../../data/general_datasets/test.csv',
    encoding='latin1',
    low_memory=False,
    nrows=10000,
)

In [100]:
# Stack train on top of test with a fresh 0..n-1 index so that both parts go
# through the same preprocessing; train rows come first.
frames_to_stack = [df_train, df_test]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [101]:
# Print column names, non-null counts and dtypes of the combined train + test frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510000 entries, 0 to 509999
Data columns (total 43 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   transaction_id                510000 non-null  uint64 
 1   is_fraud                      500000 non-null  float64
 2   created_at                    510000 non-null  object 
 3   is_subscription               510000 non-null  bool   
 4   transaction_type              510000 non-null  object 
 5   currency_amount               510000 non-null  float64
 6   currency_id                   510000 non-null  int64  
 7   amount_scaled                 510000 non-null  int64  
 8   merchant_customer_id          438631 non-null  object 
 9   merchant_customer_email       510000 non-null  object 
 10  merchant_customer_phone       15434 non-null   object 
 11  merchant_customer_first_name  72951 non-null   object 
 12  merchant_customer_last_name   52367 non-null

In [102]:
# Columns removed from the frame before feature engineering.
unused_columns = [
    'merchant_customer_phone',
    'merchant_customer_first_name',
    'transaction_source',
    'traffic_source',
    'merchant_customer_last_name',
    'merchant_city',
    'order_number',
    'is_verified',
    'browser',
    'browser_version',
    'operating_system',
    'operating_system_version',
    'device',
    'card_holder_first_name',
    'card_holder_last_name',
]
df = df.drop(columns=unused_columns)

In [103]:
# Derive calendar features from the creation timestamp, then discard the raw column.
created_ts = pd.to_datetime(df['created_at'])
df['month'] = created_ts.dt.month
df['week_day'] = created_ts.dt.dayofweek  # Monday = 0 ... Sunday = 6
df = df.drop(columns='created_at')

In [104]:
# Replace missing values with the column median (computed over the combined train + test frame).
card_exp_median = df['card_exp_relative'].median()
df['card_exp_relative'] = df['card_exp_relative'].fillna(card_exp_median)

In [105]:
# Print the column summary after the column drops, date features and median fill above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510000 entries, 0 to 509999
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           510000 non-null  uint64 
 1   is_fraud                 500000 non-null  float64
 2   is_subscription          510000 non-null  bool   
 3   transaction_type         510000 non-null  object 
 4   currency_amount          510000 non-null  float64
 5   currency_id              510000 non-null  int64  
 6   amount_scaled            510000 non-null  int64  
 7   merchant_customer_id     438631 non-null  object 
 8   merchant_customer_email  510000 non-null  object 
 9   merchant_country         510000 non-null  object 
 10  merchant_language        403779 non-null  object 
 11  ip_address               510000 non-null  object 
 12  platform                 510000 non-null  object 
 13  merchant_id              510000 non-null  uint64 
 14  merc

In [106]:
# Same summary call as the previous cell; no transformation runs between the two.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510000 entries, 0 to 509999
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           510000 non-null  uint64 
 1   is_fraud                 500000 non-null  float64
 2   is_subscription          510000 non-null  bool   
 3   transaction_type         510000 non-null  object 
 4   currency_amount          510000 non-null  float64
 5   currency_id              510000 non-null  int64  
 6   amount_scaled            510000 non-null  int64  
 7   merchant_customer_id     438631 non-null  object 
 8   merchant_customer_email  510000 non-null  object 
 9   merchant_country         510000 non-null  object 
 10  merchant_language        403779 non-null  object 
 11  ip_address               510000 non-null  object 
 12  platform                 510000 non-null  object 
 13  merchant_id              510000 non-null  uint64 
 14  merc

In [107]:
# Same summary call as the two previous cells; no transformation runs in between.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510000 entries, 0 to 509999
Data columns (total 29 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           510000 non-null  uint64 
 1   is_fraud                 500000 non-null  float64
 2   is_subscription          510000 non-null  bool   
 3   transaction_type         510000 non-null  object 
 4   currency_amount          510000 non-null  float64
 5   currency_id              510000 non-null  int64  
 6   amount_scaled            510000 non-null  int64  
 7   merchant_customer_id     438631 non-null  object 
 8   merchant_customer_email  510000 non-null  object 
 9   merchant_country         510000 non-null  object 
 10  merchant_language        403779 non-null  object 
 11  ip_address               510000 non-null  object 
 12  platform                 510000 non-null  object 
 13  merchant_id              510000 non-null  uint64 
 14  merc

In [108]:
# Columns that must not be outlier-clipped: the target plus identifier columns.
# Identifiers have no meaningful magnitude, and quantile-clipping them can alter
# the ids themselves (transaction_id is exported with the test data later on).
non_clipped_cols = ['is_fraud', 'currency_id', 'merchant_shop_id', 'transaction_id', 'merchant_id']
num_cols = df.drop(non_clipped_cols, axis=1).select_dtypes(include=["number"]).columns

# Clip every remaining numeric column to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [109]:
def parse_user_agent(user_agent):
    """Split a User-Agent string into the components matched by fixed regexes.

    The first space-separated token is treated as the legacy token and removed
    from the front of the string; all other components are searched for in the
    remainder only.

    Parameters
    ----------
    user_agent : str or None
        Raw User-Agent header value.

    Returns
    -------
    dict
        Keys ``legacy_token``, ``operating_system``, ``browser_rendering_engine``,
        ``compatible_rendering_engine``, ``actual_browser`` and
        ``compatible_browser``. A component that is not found is ``None``.
        For an empty or non-string input every value is ``None``.
    """
    parsed = {
        "legacy_token": None,
        "operating_system": None,
        "browser_rendering_engine": None,
        "compatible_rendering_engine": None,
        "actual_browser": None,
        "compatible_browser": None,
    }
    # Empty strings, None and other non-string values cannot be parsed.
    if not isinstance(user_agent, str) or not user_agent:
        return parsed

    legacy_token = user_agent.split(" ")[0]
    # Strip only the leading token; a global replace would also delete later
    # occurrences of the same text and hide them from the searches below.
    remainder = user_agent[len(legacy_token):]

    def name_and_version(pattern):
        """Return 'name/version' for the first match of `pattern`, else None."""
        match = re.search(pattern, remainder)
        return match.group(1) + '/' + match.group(2) if match else None

    os_match = re.search(r'\(([^;]+; [^\)]+)\)', remainder)
    compatible_engine_match = re.search(r'(KHTML|Presto|Blink)', remainder)

    parsed["legacy_token"] = legacy_token
    parsed["operating_system"] = os_match.group(1) if os_match else None
    parsed["browser_rendering_engine"] = name_and_version(r'(AppleWebKit|Gecko|Trident)/([\d\.]+)')
    parsed["compatible_rendering_engine"] = (
        '(' + compatible_engine_match.group(1) + ', LIKE GECKO)' if compatible_engine_match else None
    )
    parsed["actual_browser"] = name_and_version(r'(Mozilla|Chrome|Firefox|Edge|Opera)/([\d\.]+)')
    parsed["compatible_browser"] = name_and_version(r'(Safari)/([\d\.]+)')
    return parsed

In [110]:
# Rows without a user agent get the '#' placeholder and are skipped by the parser.
df['user_agent'] = df['user_agent'].fillna('#')
ua_present = df.loc[df['user_agent'] != '#', 'user_agent']

# Keys returned by parse_user_agent, in the order they become columns.
ua_fields = ['legacy_token', 'operating_system', 'browser_rendering_engine',
             'compatible_rendering_engine', 'actual_browser', 'compatible_browser']

# Build the parsed frame with one constructor call instead of `.apply(pd.Series)`,
# which creates a separate Series object for every row. Passing `columns`
# guarantees the same schema even when no row has a user agent.
df_parsed = pd.DataFrame(
    ua_present.map(parse_user_agent).tolist(),
    index=ua_present.index,
    columns=ua_fields,
)

# Index-aligned join: rows that were skipped get NaN in the new columns.
df = df.join(df_parsed)

In [111]:
# The raw string is no longer needed once its parsed components have been joined in.
df = df.drop(columns=['user_agent'])

In [112]:
# Dtypes are read from a one-row slice, which avoids copying the whole frame.
object_columns = list(df.iloc[:1].select_dtypes(include=['object']).columns)

# Use '#' as the placeholder for missing values in every object-typed column.
df[object_columns] = df[object_columns].fillna('#')

In [113]:
# Keep only the part of the language code that precedes the first underscore.
df['merchant_language'] = df['merchant_language'].map(lambda code: code.partition('_')[0])

In [114]:
# Categorical columns whose rare values are collapsed into a single 'other' bucket.
unbalanced_categories = ['merchant_country', 'merchant_language', 'ip_country', 'platform', 'cardbrand', 'cardcountry']
length = len(df)
# A value is kept only if it occurs in more than 0.1% of all rows.
min_count = 0.001 * length

for categ in unbalanced_categories:
    category_counts = df[categ].value_counts()
    # Vectorised lookup of each row's category frequency; replaces a per-row
    # Python lambda that indexed the counts Series once per row.
    row_counts = df[categ].map(category_counts)
    df[categ] = df[categ].where(row_counts > min_count, 'other')

In [115]:
# Replace every object-typed column with integer codes; a fresh encoder is
# fitted per column and is not kept afterwards.
for column in object_columns:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column])
    df[column] = encoded_values

In [116]:
# Boolean flags become 0/1 integers.
bool_decoder = {False: 0, True: 1}
bool_columns = list(df.iloc[:1].select_dtypes(include=['bool']).columns)
df = df.assign(**{flag: df[flag].map(bool_decoder) for flag in bool_columns})

In [117]:
# Print the column summary after the encoding cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510000 entries, 0 to 509999
Data columns (total 34 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   transaction_id               510000 non-null  uint64 
 1   is_fraud                     500000 non-null  float64
 2   is_subscription              510000 non-null  int64  
 3   transaction_type             510000 non-null  int32  
 4   currency_amount              510000 non-null  float64
 5   currency_id                  510000 non-null  int64  
 6   amount_scaled                510000 non-null  float64
 7   merchant_customer_id         510000 non-null  int32  
 8   merchant_customer_email      510000 non-null  int32  
 9   merchant_country             510000 non-null  int32  
 10  merchant_language            510000 non-null  int32  
 11  ip_address                   510000 non-null  int32  
 12  platform                     510000 non-null  int32  
 13 

In [118]:
# The first len(df_train) rows of the combined frame are the training rows.
length_train = len(df_train)
train_rows = df.iloc[:length_train]
# Write every column except the first one (by position).
train_rows.iloc[:, 1:].to_csv(r'preprocessed_train_data.csv', index=False)

In [119]:
# Everything after the training rows is the test part; all columns are written.
test_rows = df.iloc[length_train:, :]
test_rows.to_csv(r'preprocessed_test_data.csv', index=False)