In [1]:
import pandas as pd
import numpy as np
import gdown

# Загрузка и чтение данных

Подгрузим данные с гугл диска. Данные содержат часть исходных данных (релевантную случайную выборку) для эксперементирования и построения базовых моделей.

Файл с данными о курсе валют

In [2]:
file_id = "1TIDBDewZasdjdeKQ9KC8hgji55WbY7jS"  # Замените на ваш ID
url = f"https://drive.google.com/uc?id={file_id}"
output = "currency.csv"  # Имя файла после скачивания

gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1TIDBDewZasdjdeKQ9KC8hgji55WbY7jS
To: /content/currency.csv
100%|██████████| 4.17k/4.17k [00:00<00:00, 6.97MB/s]


'currency.csv'

Файл с выборкой исходных данных о транзакциях

In [3]:
file_id = "1CW7Eax-0nJZImy9V4xqUziD-82jOHWTP"
url = f"https://drive.google.com/uc?id={file_id}"
output = "df_sample.csv"
gdown.download(url, output, quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1CW7Eax-0nJZImy9V4xqUziD-82jOHWTP
To: /content/df_sample.csv
100%|██████████| 10.8M/10.8M [00:00<00:00, 48.3MB/s]


'df_sample.csv'

In [4]:
df = pd.read_csv('df_sample.csv')

In [5]:
df_currency = pd.read_csv('currency.csv')

# Подготовка данных

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# ненужные колонки
drop_cols = [
    "Unnamed: 0",
    "transaction_id",
    "customer_id",
    "card_number",
    "device_fingerprint",
    "ip_address",
    "date",  # будем использовать timestamp
]
df = df.drop(columns=drop_cols)

In [8]:
df

Unnamed: 0,timestamp,vendor_category,vendor_type,vendor,amount,currency,country,city,city_size,card_type,is_card_present,device,channel,is_outside_home_country,is_high_risk_vendor,is_weekend,is_fraud,amount_rub,amount_dollars
0,2024-09-30 00:02:26.629727,Travel,booking,Booking.com,9501.29,EUR,France,Unknown City,medium,Premium Debit,False,Edge,web,True,True,False,True,998661.360322,10608.963498
1,2024-09-30 00:05:14.816250,Education,supplies,University Bookstore,15.49,BRL,Brazil,Unknown City,medium,Basic Credit,True,NFC Payment,pos,False,False,False,True,268.302819,2.850230
2,2024-09-30 00:06:34.909762,Travel,transport,Enterprise Rent-A-Car,153.91,USD,USA,San Jose,medium,Gold Credit,False,Edge,web,True,True,False,False,14488.123180,153.910000
3,2024-09-30 00:06:46.866084,Restaurant,fast_food,Burger King,4345.22,JPY,Japan,Unknown City,medium,Basic Debit,False,iOS App,mobile,False,False,False,False,2868.923429,30.477102
4,2024-09-30 00:11:10.603389,Grocery,online,Instacart,539.43,SGD,Singapore,Unknown City,medium,Gold Credit,False,Android App,mobile,False,False,False,False,39665.918171,421.378352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37472,2024-10-30 23:54:02.824651,Entertainment,events,LiveNation,137795.50,RUB,Russia,Unknown City,medium,Platinum Credit,False,Android App,mobile,True,True,False,True,137795.500000,1413.265979
37473,2024-10-30 23:55:05.639997,Gas,local,Truck Stop,528.66,CAD,Canada,Unknown City,medium,Premium Debit,False,Edge,web,False,False,False,False,37057.108570,380.067207
37474,2024-10-30 23:55:21.516052,Education,supplies,Chegg,1432.63,MXN,Mexico,Unknown City,medium,Basic Debit,False,Android App,mobile,False,False,False,False,6967.191346,71.457301
37475,2024-10-30 23:58:14.723787,Grocery,online,Amazon Fresh,306.98,AUD,Australia,Unknown City,medium,Basic Debit,False,Android App,mobile,False,False,False,False,19662.617302,201.664845


In [9]:
# Преобразуем timestamp в datetime + создадим новые признаки
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["hour"] = df["timestamp"].dt.hour
df["day_of_week"] = df["timestamp"].dt.dayofweek
df = df.drop(columns=["timestamp"])

In [10]:
df.columns

Index(['vendor_category', 'vendor_type', 'vendor', 'amount', 'currency',
       'country', 'city', 'city_size', 'card_type', 'is_card_present',
       'device', 'channel', 'is_outside_home_country', 'is_high_risk_vendor',
       'is_weekend', 'is_fraud', 'amount_rub', 'amount_dollars', 'hour',
       'day_of_week'],
      dtype='object')

In [11]:
# Целевая переменная
y = df["is_fraud"].astype(int)  # переводим bool → int
X = df.drop(columns=["is_fraud"])

# Определим типы признаков
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "bool"]).columns.tolist()

# Кодируем булевы в int (чтобы их можно было нормализовать или закодировать)
for col in cat_features:
    if X[col].dtype == bool:
        X[col] = X[col].astype(int)

In [12]:
# обновим списки
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

# пайплайн для преобразования
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]
)

In [13]:
df

Unnamed: 0,vendor_category,vendor_type,vendor,amount,currency,country,city,city_size,card_type,is_card_present,device,channel,is_outside_home_country,is_high_risk_vendor,is_weekend,is_fraud,amount_rub,amount_dollars,hour,day_of_week
0,Travel,booking,Booking.com,9501.29,EUR,France,Unknown City,medium,Premium Debit,False,Edge,web,True,True,False,True,998661.360322,10608.963498,0,0
1,Education,supplies,University Bookstore,15.49,BRL,Brazil,Unknown City,medium,Basic Credit,True,NFC Payment,pos,False,False,False,True,268.302819,2.850230,0,0
2,Travel,transport,Enterprise Rent-A-Car,153.91,USD,USA,San Jose,medium,Gold Credit,False,Edge,web,True,True,False,False,14488.123180,153.910000,0,0
3,Restaurant,fast_food,Burger King,4345.22,JPY,Japan,Unknown City,medium,Basic Debit,False,iOS App,mobile,False,False,False,False,2868.923429,30.477102,0,0
4,Grocery,online,Instacart,539.43,SGD,Singapore,Unknown City,medium,Gold Credit,False,Android App,mobile,False,False,False,False,39665.918171,421.378352,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37472,Entertainment,events,LiveNation,137795.50,RUB,Russia,Unknown City,medium,Platinum Credit,False,Android App,mobile,True,True,False,True,137795.500000,1413.265979,23,2
37473,Gas,local,Truck Stop,528.66,CAD,Canada,Unknown City,medium,Premium Debit,False,Edge,web,False,False,False,False,37057.108570,380.067207,23,2
37474,Education,supplies,Chegg,1432.63,MXN,Mexico,Unknown City,medium,Basic Debit,False,Android App,mobile,False,False,False,False,6967.191346,71.457301,23,2
37475,Grocery,online,Amazon Fresh,306.98,AUD,Australia,Unknown City,medium,Basic Debit,False,Android App,mobile,False,False,False,False,19662.617302,201.664845,23,2


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37477 entries, 0 to 37476
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   vendor_category          37477 non-null  object 
 1   vendor_type              37477 non-null  object 
 2   vendor                   37477 non-null  object 
 3   amount                   37477 non-null  float64
 4   currency                 37477 non-null  object 
 5   country                  37477 non-null  object 
 6   city                     37477 non-null  object 
 7   city_size                37477 non-null  object 
 8   card_type                37477 non-null  object 
 9   is_card_present          37477 non-null  bool   
 10  device                   37477 non-null  object 
 11  channel                  37477 non-null  object 
 12  is_outside_home_country  37477 non-null  bool   
 13  is_high_risk_vendor      37477 non-null  bool   
 14  is_weekend            

In [15]:
bool_cols = df.select_dtypes(include=["bool"]).columns
df[bool_cols] = df[bool_cols].astype(int)

# OneHot для строковых колонок
cat_cols = df.select_dtypes(include=["object"]).columns

encoder = OneHotEncoder(drop=None, sparse_output=False, handle_unknown="ignore")
encoded_array = encoder.fit_transform(df[cat_cols])

# Создаём DataFrame с новыми колонками
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index
)

# Заменяем: удаляем старые строковые и добавляем новые
df.drop(columns=cat_cols, inplace=True)
df[encoded_df.columns] = encoded_df

  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded_df.columns] = encoded_df
  df[encoded

In [16]:
df

Unnamed: 0,amount,is_card_present,is_outside_home_country,is_high_risk_vendor,is_weekend,is_fraud,amount_rub,amount_dollars,hour,day_of_week,...,device_Chrome,device_Edge,device_Firefox,device_Magnetic Stripe,device_NFC Payment,device_Safari,device_iOS App,channel_mobile,channel_pos,channel_web
0,9501.29,0,1,1,0,1,998661.360322,10608.963498,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,15.49,1,0,0,0,1,268.302819,2.850230,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,153.91,0,1,1,0,0,14488.123180,153.910000,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4345.22,0,0,0,0,0,2868.923429,30.477102,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,539.43,0,0,0,0,0,39665.918171,421.378352,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37472,137795.50,0,1,1,0,1,137795.500000,1413.265979,23,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
37473,528.66,0,0,0,0,0,37057.108570,380.067207,23,2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
37474,1432.63,0,0,0,0,0,6967.191346,71.457301,23,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
37475,306.98,0,0,0,0,0,19662.617302,201.664845,23,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
df.to_csv('df_for_ml')