In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [80]:
# Load the first 500,000 rows of the training set. The path is relative to the
# working directory and written with forward slashes so that it resolves on
# Windows as well as on POSIX systems (a backslash-separated path only works on
# Windows). low_memory=False makes pandas infer each column's dtype from the
# whole column instead of chunk by chunk.
df = pd.read_csv(
    '../../data/general_datasets/train.csv',
    encoding='latin1',
    low_memory=False,
    nrows=500000,
)

In [81]:
# Drop the columns that are not carried forward into the preprocessed frame.
df.drop(columns = ['transaction_id', 'merchant_customer_phone', 'merchant_customer_first_name',
                   'merchant_customer_last_name', 'merchant_city', 'order_number', 'is_verified', 'browser',
                   'browser_version', 'operating_system', 'operating_system_version', 'device',
                   'card_holder_first_name', 'card_holder_last_name'], inplace = True)

# Replace the raw timestamp with two calendar features.
df['created_at'] = pd.to_datetime(df['created_at'])
df['month'] = df['created_at'].dt.month
df['week_day'] = df['created_at'].dt.dayofweek  # Monday=0 ... Sunday=6
df.drop('created_at', axis=1, inplace=True)

# Rows without card_exp_relative are discarded; every other missing value is
# replaced by the '#' placeholder.
df.dropna(subset = ['card_exp_relative'], inplace = True)
df.fillna('#', inplace = True)

# Keep only the part of merchant_language that precedes the first underscore.
df['merchant_language'] = df['merchant_language'].apply(lambda lang: lang.split('_')[0])

# Collapse rare levels: any value that does not occur in more than
# RARE_CATEGORY_SHARE of the rows is replaced by 'other'. The per-row counts
# are looked up with a vectorized map instead of a Python-level lambda, and the
# threshold is computed once instead of once per row.
RARE_CATEGORY_SHARE = 0.001
unbalanced_categories = ['merchant_country', 'merchant_language', 'ip_country', 'platform', 'cardbrand', 'cardcountry']
length = len(df)
min_category_count = RARE_CATEGORY_SHARE * length
for categ in unbalanced_categories:
    category_counts = df[categ].value_counts()
    is_frequent = df[categ].map(category_counts) > min_category_count
    df[categ] = df[categ].where(is_frequent, 'other')

# Numeric columns to clip; is_fraud, currency_id and merchant_shop_id are
# excluded from the selection.
num_cols = df.drop(['is_fraud', 'currency_id', 'merchant_shop_id'], axis=1).select_dtypes(include=["number"]).columns

# Clip each selected column to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower_bound, upper_bound)

In [82]:
import re

def parse_user_agent(user_agent: str):
    """Split a User-Agent string into a dict of named components.

    The first space-separated token is taken as the "legacy token" and removed
    from the front of the string; the remaining components are then located
    with regular expressions in what is left.

    Parameters
    ----------
    user_agent : str
        Raw User-Agent header value.

    Returns
    -------
    dict
        Keys "legacy token", "operating system", "browser rendering engine",
        "compatible rendering engine", "actual browser" and
        "compatible browser". A component that is not found is None. For an
        empty or non-string input every value is None.
    """
    if not isinstance(user_agent, str) or not user_agent:
        # Nothing to parse: report every component as missing instead of
        # failing on str.replace(None, ...) or on a non-string value.
        return {
            "legacy token": None,
            "operating system": None,
            "browser rendering engine": None,
            "compatible rendering engine": None,
            "actual browser": None,
            "compatible browser": None
        }

    # Patterns for extracting information
    legacy_token = user_agent.split(" ")[0]
    # Strip the token from the front only; str.replace would also delete any
    # later occurrence of the same text and corrupt the rest of the string.
    user_agent = user_agent[len(legacy_token):]

    # First parenthesised group that contains "; " is reported as the OS info.
    os_match = re.search(r'\(([^;]+; [^\)]+)\)', user_agent)
    os_info = os_match.group(1) if os_match else None

    # Leftmost "<name>/<version>" among the listed browser names.
    # NOTE(review): Chromium-based Edge and Opera usually announce themselves
    # as "Edg/" and "OPR/", which this pattern does not match - confirm whether
    # that matters for the data at hand.
    browser_match = re.search(r'(Mozilla|Chrome|Firefox|Edge|Opera)/([\d\.]+)', user_agent)
    actual_browser = browser_match.group(1) + '/' + browser_match.group(2) if browser_match else None

    engine_match = re.search(r'(AppleWebKit|Gecko|Trident)/([\d\.]+)', user_agent)
    rendering_engine = engine_match.group(1) + '/' + engine_match.group(2) if engine_match else None

    # Only the engine name is captured; the surrounding text is a fixed template.
    compatible_engine_match = re.search(r'(KHTML|Presto|Blink)', user_agent)
    compatible_engine = '(' + compatible_engine_match.group(1) + ', LIKE GECKO)' if compatible_engine_match else None

    compatible_browser_match = re.search(r'(Safari)/([\d\.]+)', user_agent)
    compatible_browser = compatible_browser_match.group(1) + '/' + compatible_browser_match.group(2) if compatible_browser_match else None

    return {
        "legacy token": legacy_token,
        "operating system": os_info,
        "browser rendering engine": rendering_engine,
        "compatible rendering engine": compatible_engine,
        "actual browser": actual_browser,
        "compatible browser": compatible_browser
    }


In [83]:
# Parse every real User-Agent string; rows that hold the '#' placeholder are
# skipped. The resulting dicts are expanded into one column per component with
# a single DataFrame constructor call, which avoids .apply(pd.Series) creating
# a separate Series object for every row.
parsed_agents = df.loc[df['user_agent'] != '#', 'user_agent'].apply(parse_user_agent)
df_parsed = pd.DataFrame(parsed_agents.tolist(), index=parsed_agents.index)
# Index-aligned left join: skipped rows get NaN in the new columns.
df = df.join(df_parsed)

In [84]:
# Replace every missing value that is still present in df with the '#' placeholder.
df.fillna(value='#', inplace=True)

In [85]:
# Remove the raw user_agent column from df.
df.drop(columns=['user_agent'], inplace=True)

In [86]:
# Integer-encode every object-dtype column. One LabelEncoder is fitted per
# column and kept in label_encoders (keyed by column name) so that the same
# mapping can be applied to other data or inverted later.
categ_features = df.select_dtypes(include = ['object']).columns.to_list()
label_encoders = {}
for feat in categ_features:
    encoder = LabelEncoder()
    df[feat] = encoder.fit_transform(df[feat])
    label_encoders[feat] = encoder

In [87]:
# Write the preprocessed frame to the working directory without the row index.
df.to_csv(path_or_buf='preprocessed_data_mini.csv', index=False)

In [88]:
# Print the column names, non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 499982 entries, 0 to 499999
Data columns (total 35 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   is_fraud                     499982 non-null  int64  
 1   is_subscription              499982 non-null  bool   
 2   transaction_type             499982 non-null  int32  
 3   currency_amount              499982 non-null  float64
 4   currency_id                  499982 non-null  int64  
 5   amount_scaled                499982 non-null  int64  
 6   merchant_customer_id         499982 non-null  int32  
 7   merchant_customer_email      499982 non-null  int32  
 8   merchant_country             499982 non-null  int32  
 9   merchant_language            499982 non-null  int32  
 10  ip_address                   499982 non-null  int32  
 11  platform                     499982 non-null  int32  
 12  merchant_id                  499982 non-null  uint64 
 13  merc

In [89]:
# Print the column names, non-null counts and dtypes of df.
# NOTE(review): same call as the previous cell - this cell looks redundant.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 499982 entries, 0 to 499999
Data columns (total 35 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   is_fraud                     499982 non-null  int64  
 1   is_subscription              499982 non-null  bool   
 2   transaction_type             499982 non-null  int32  
 3   currency_amount              499982 non-null  float64
 4   currency_id                  499982 non-null  int64  
 5   amount_scaled                499982 non-null  int64  
 6   merchant_customer_id         499982 non-null  int32  
 7   merchant_customer_email      499982 non-null  int32  
 8   merchant_country             499982 non-null  int32  
 9   merchant_language            499982 non-null  int32  
 10  ip_address                   499982 non-null  int32  
 11  platform                     499982 non-null  int32  
 12  merchant_id                  499982 non-null  uint64 
 13  merc