# Техническое задание Маркетинг

 Интернет-магазин собирает историю покупателей, проводит рассылки предложений и 
планирует будущие продажи. Для оптимизации процессов надо выделить пользователей, 
которые готовы совершить покупку в ближайшее время.

## Подключение библиотек

In [1]:
!pip install phik
!pip install optuna
!pip install optuna-integration[sklearn]



In [2]:
import json

import pandas as pd
import phik

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import TargetEncoder

from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier

from optuna.integration import OptunaSearchCV
from optuna import distributions

from sklearn.metrics import roc_auc_score

## Загрузка данных

In [3]:
purchases_data = pd.read_csv('filtered_data/apparel-purchases.csv')
purchases_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202208 entries, 0 to 202207
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   client_id     202208 non-null  int64  
 1   quantity      202208 non-null  int64  
 2   price         202208 non-null  float64
 3   category_ids  202208 non-null  object 
 4   date          202208 non-null  object 
 5   message_id    202208 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 9.3+ MB


In [4]:
target_data = pd.read_csv('filtered_data/apparel-target_binary.csv')
target_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49849 entries, 0 to 49848
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   client_id  49849 non-null  int64
 1   target     49849 non-null  int64
dtypes: int64(2)
memory usage: 779.0 KB


In [5]:
message_data = pd.read_csv('filtered_data/apparel-messages.csv')
message_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12739798 entries, 0 to 12739797
Data columns (total 7 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   bulk_campaign_id  int64 
 1   client_id         int64 
 2   message_id        object
 3   event             object
 4   channel           object
 5   date              object
 6   created_at        object
dtypes: int64(2), object(5)
memory usage: 680.4+ MB


## Слияние данных

Необходимо соедининить таблицы с продажами и с рассылками

In [6]:
full_data = purchases_data.merge(target_data, on=['client_id'])
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202208 entries, 0 to 202207
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   client_id     202208 non-null  int64  
 1   quantity      202208 non-null  int64  
 2   price         202208 non-null  float64
 3   category_ids  202208 non-null  object 
 4   date          202208 non-null  object 
 5   message_id    202208 non-null  object 
 6   target        202208 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 10.8+ MB


In [7]:
full_data = full_data.merge(message_data, how='left', on=['client_id', 'message_id'], suffixes=('_purchase', '_message'))
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862248 entries, 0 to 862247
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   client_id         862248 non-null  int64  
 1   quantity          862248 non-null  int64  
 2   price             862248 non-null  float64
 3   category_ids      862248 non-null  object 
 4   date_purchase     862248 non-null  object 
 5   message_id        862248 non-null  object 
 6   target            862248 non-null  int64  
 7   bulk_campaign_id  805820 non-null  float64
 8   event             805820 non-null  object 
 9   channel           805820 non-null  object 
 10  date_message      805820 non-null  object 
 11  created_at        805820 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 78.9+ MB


Далее следует посчитать промежуток между оповещением и покупкой

In [8]:
full_data['date_purchase'] = pd.to_datetime(full_data['date_purchase'])
full_data['date_message'] = pd.to_datetime(full_data['date_message'])
full_data['timedelta'] = (full_data['date_purchase'] - full_data['date_message']).dt.days.fillna(0)
full_data['timedelta'].describe()

count    862248.000000
mean          0.979731
std          22.967414
min        -591.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         519.000000
Name: timedelta, dtype: float64

In [9]:
(full_data['timedelta'] < 0).value_counts()

timedelta
False    822448
True      39800
Name: count, dtype: int64

Имеются покупки, сделанные независимо от оповещения, их меньше 10% - можно удалить, потому что эти покупки были сделаны независимо от рекламы

In [10]:
full_data = full_data[full_data['timedelta'] >= 0]
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 822448 entries, 0 to 862247
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   client_id         822448 non-null  int64         
 1   quantity          822448 non-null  int64         
 2   price             822448 non-null  float64       
 3   category_ids      822448 non-null  object        
 4   date_purchase     822448 non-null  datetime64[ns]
 5   message_id        822448 non-null  object        
 6   target            822448 non-null  int64         
 7   bulk_campaign_id  766020 non-null  float64       
 8   event             766020 non-null  object        
 9   channel           766020 non-null  object        
 10  date_message      766020 non-null  datetime64[ns]
 11  created_at        766020 non-null  object        
 12  timedelta         822448 non-null  float64       
dtypes: datetime64[ns](2), float64(3), int64(3), object(5)
memory usa

In [11]:
len(full_data['category_ids'].unique())

933

In [12]:
category_sets = []

def add_category_in_arr(category: str) -> None:
    new_set = set(eval(category))
    for category_set in category_sets:
        if new_set.issubset(category_set):
            return
    category_sets.append(new_set)

def update_category(category: str) -> str:
    new_set = set(eval(category))
    for category_set in category_sets:
        if new_set.issubset(category_set):
            new_set = category_set
    return str(list(new_set))

full_data['category_ids'].apply(add_category_in_arr)
full_data['category_ids'] = full_data['category_ids'].apply(update_category)
full_data['category_ids'].unique()

array(["['4', '28', '431', '57']", "['258', '2', '1569', '18']",
       "['4', '28', '124', '1602']", "['4', '28', '657', '62']",
       "['4', '28', '244', '432']", "['4', '28', '49', '413']",
       "['441', '258', '2', '18']", "['243', '2', '18', '440']",
       "['2', '18', '341', '1311']", "['667', '5549', '5562', '5631']",
       "['678', '5649', '5630', '5562']", "['4', '28', '44', '490']",
       "['443', '267', '2', '18']", "['4', '28', '417', '213']",
       "['5584', '1786', '5589', '5562']",
       "['5584', '5589', '5562', '1795']",
       "['550', '5699', '5630', '5562']",
       "['1006', '5665', '5683', '5562']", "['445', '2', '344', '18']",
       "['2', '451', '18', '123']", "['17', '330', '387', '2', '467']",
       "['4', '840', '27', '353']", "['1314', '5683', '5562', '5572']",
       "['4', '28', '343', '425']", "['4', '31', '505', '326']",
       "['4', '28', '260', '420']", "['5622', '560', '5562', '5631']",
       "['608', '5549', '5562', '5631']", "['290', '4'

In [13]:
len(full_data['category_ids'].unique())

916

In [14]:
filtered_data = full_data.drop(
    columns=[
        'client_id', 'message_id', 'bulk_campaign_id',
        'date_message', 'date_purchase', 'created_at',
    ]
)
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 822448 entries, 0 to 862247
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   quantity      822448 non-null  int64  
 1   price         822448 non-null  float64
 2   category_ids  822448 non-null  object 
 3   target        822448 non-null  int64  
 4   event         766020 non-null  object 
 5   channel       766020 non-null  object 
 6   timedelta     822448 non-null  float64
dtypes: float64(2), int64(2), object(3)
memory usage: 50.2+ MB


In [15]:
filtered_data.describe()

Unnamed: 0,quantity,price,target,timedelta
count,822448.0,822448.0,822448.0,822448.0
mean,1.007106,1201.176505,0.036661,2.003203
std,0.174767,1367.690221,0.187929,19.106953
min,1.0,1.0,0.0,0.0
25%,1.0,384.0,0.0,0.0
50%,1.0,999.0,0.0,0.0
75%,1.0,1699.0,0.0,1.0
max,30.0,85499.0,1.0,519.0


In [16]:
filtered_data.phik_matrix(
    interval_cols=['quantity', 'price', 'timedelta']
)

Unnamed: 0,quantity,price,category_ids,target,event,channel,timedelta
quantity,1.0,0.0,0.222585,0.001991,0.0,0.009131,0.0
price,0.0,1.0,0.725513,0.008114,0.002,0.029799,0.0
category_ids,0.222585,0.725513,1.0,0.227324,0.176569,0.424098,0.38539
target,0.001991,0.008114,0.227324,1.0,0.016302,0.039506,0.018502
event,0.0,0.002,0.176569,0.016302,1.0,0.310695,0.05581
channel,0.009131,0.029799,0.424098,0.039506,0.310695,1.0,0.077541
timedelta,0.0,0.0,0.38539,0.018502,0.05581,0.077541,1.0


In [17]:
encode_columns = [
    'event', 'channel', 'category_ids',
]

interval_columns = [
    'quantity', 'price', 'timedelta'
]

data_preprocessor = ColumnTransformer([
    ('encode', TargetEncoder(), encode_columns),
    ('num', MinMaxScaler(), interval_columns),
], remainder='passthrough')

In [18]:
RANDOM_STATE = 42

X = filtered_data.drop(
    columns=['target']
)
y = filtered_data['target']

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 822448 entries, 0 to 862247
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   quantity      822448 non-null  int64  
 1   price         822448 non-null  float64
 2   category_ids  822448 non-null  object 
 3   event         766020 non-null  object 
 4   channel       766020 non-null  object 
 5   timedelta     822448 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 43.9+ MB


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y_train,
)

In [20]:
pipeline_classifier = Pipeline([
    ('preprocessor', data_preprocessor),
    ('model', GradientBoostingClassifier())
])

params = {
    'model__loss': distributions.CategoricalDistribution(('log_loss', 'exponential')),
    'model__max_depth': distributions.IntDistribution(2, 10),
    'model__min_samples_split': distributions.IntDistribution(2, 10),
    'model__min_samples_leaf': distributions.IntDistribution(2, 10),
    'model__criterion': distributions.CategoricalDistribution(('friedman_mse', 'squared_error')),
    'preprocessor__num': distributions.CategoricalDistribution((StandardScaler(), MinMaxScaler(), 'passthrough')),
}



In [21]:
classifier_search = OptunaSearchCV(
    pipeline_classifier,
    params,
    cv=5,
    scoring='roc_auc',
    random_state=RANDOM_STATE,
)

  classifier_search = OptunaSearchCV(


In [22]:
classifier_search.fit(X_train, y_train)

print(f'Лучшая модель и ее параметры: {classifier_search.best_estimator_}')

[I 2025-02-16 14:05:01,762] A new study created in memory with name: no-name-4dbb9660-b2e7-49f5-944d-0d8abc907868
[I 2025-02-16 14:08:20,261] Trial 0 finished with value: 0.7167168883786229 and parameters: {'model__loss': 'exponential', 'model__max_depth': 2, 'model__min_samples_split': 4, 'model__min_samples_leaf': 6, 'model__criterion': 'friedman_mse', 'preprocessor__num': 'passthrough'}. Best is trial 0 with value: 0.7167168883786229.
[I 2025-02-16 14:14:56,795] Trial 1 finished with value: 0.7592794904899082 and parameters: {'model__loss': 'exponential', 'model__max_depth': 6, 'model__min_samples_split': 4, 'model__min_samples_leaf': 5, 'model__criterion': 'squared_error', 'preprocessor__num': StandardScaler()}. Best is trial 1 with value: 0.7592794904899082.
[I 2025-02-16 14:25:05,558] Trial 2 finished with value: 0.7911426629959971 and parameters: {'model__loss': 'exponential', 'model__max_depth': 9, 'model__min_samples_split': 2, 'model__min_samples_leaf': 6, 'model__criterion':

Лучшая модель и ее параметры: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encode', TargetEncoder(),
                                                  ['event', 'channel',
                                                   'category_ids']),
                                                 ('num', 'passthrough',
                                                  ['quantity', 'price',
                                                   'timedelta'])])),
                ('model',
                 GradientBoostingClassifier(loss='exponential', max_depth=10,
                                            min_samples_leaf=3,
                                            min_samples_split=10))])


In [23]:
predictions = classifier_search.best_estimator_.predict_proba(X_val)[:,1]
print(f'roc_auc лучшей модели на валидационной выборке: {roc_auc_score(y_val, predictions)}')

roc_auc лучшей модели на валидационной выборке: 0.8043565443390801


In [24]:
pipeline_dummy_classifier = Pipeline([
    ('preprocessor', data_preprocessor),
    ('model', DummyClassifier())
])

params = {
    'preprocessor__num': distributions.CategoricalDistribution((StandardScaler(), MinMaxScaler(), 'passthrough')),
}



In [25]:
dummy_classifier_search = OptunaSearchCV(
    pipeline_dummy_classifier,
    params,
    cv=5,
    scoring='roc_auc',
    random_state=RANDOM_STATE,
)

  dummy_classifier_search = OptunaSearchCV(


In [26]:
dummy_classifier_search.fit(X_train, y_train)
print(f'Лучшая модель и ее параметры: {dummy_classifier_search.best_estimator_}')

[I 2025-02-16 15:14:58,709] A new study created in memory with name: no-name-50e8deaf-5df0-4c3e-92ba-e76ccba83bc9
[I 2025-02-16 15:15:04,428] Trial 0 finished with value: 0.5 and parameters: {'preprocessor__num': MinMaxScaler()}. Best is trial 0 with value: 0.5.
[I 2025-02-16 15:15:09,819] Trial 1 finished with value: 0.5 and parameters: {'preprocessor__num': MinMaxScaler()}. Best is trial 0 with value: 0.5.
[I 2025-02-16 15:15:15,401] Trial 2 finished with value: 0.5 and parameters: {'preprocessor__num': StandardScaler()}. Best is trial 0 with value: 0.5.
[I 2025-02-16 15:15:20,469] Trial 3 finished with value: 0.5 and parameters: {'preprocessor__num': 'passthrough'}. Best is trial 0 with value: 0.5.
[I 2025-02-16 15:15:25,848] Trial 4 finished with value: 0.5 and parameters: {'preprocessor__num': StandardScaler()}. Best is trial 0 with value: 0.5.
[I 2025-02-16 15:15:31,126] Trial 5 finished with value: 0.5 and parameters: {'preprocessor__num': 'passthrough'}. Best is trial 0 with va

Лучшая модель и ее параметры: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encode', TargetEncoder(),
                                                  ['event', 'channel',
                                                   'category_ids']),
                                                 ('num', MinMaxScaler(),
                                                  ['quantity', 'price',
                                                   'timedelta'])])),
                ('model', DummyClassifier())])


In [27]:
predictions = dummy_classifier_search.best_estimator_.predict_proba(X_val)[:,1]
print(f'roc_auc лучшей модели на валидационной выборке: {roc_auc_score(y_val, predictions)}')

roc_auc лучшей модели на валидационной выборке: 0.5


In [28]:
predictions = dummy_classifier_search.best_estimator_.predict_proba(X_test)[:,1]
print(f'roc_auc лучшей модели на тестовой выборке: {roc_auc_score(y_test, predictions)}')

roc_auc лучшей модели на тестовой выборке: 0.5


Модель себя показала лучше Dummy-модели

## Тестирование модели

In [29]:
predictions = classifier_search.best_estimator_.predict_proba(X_test)[:,1]
print(f'roc_auc лучшей модели на тестовой выборке: {roc_auc_score(y_test, predictions)}')

roc_auc лучшей модели на тестовой выборке: 0.8062216338476744


## Отбор потенциальных клиентов

In [30]:
results = classifier_search.best_estimator_.predict(full_data)
full_data['result'] = results

In [31]:
result_data = full_data[full_data['result'] == 1]
result_data['client_id'].unique()

array([1515915625489048744, 1515915625491214396, 1515915625491750681,
       1515915625557821791, 1515915625479406520, 1515915625491226228,
       1515915625500742715, 1515915625580957556, 1515915625592099647,
       1515915625468108882, 1515915625468130156, 1515915625468173924,
       1515915625490110609, 1515915625589704898, 1515915625579469689,
       1515915625502985040, 1515915625473533863, 1515915625471192820,
       1515915625489371358, 1515915625490334876, 1515915625490737665,
       1515915625500551889, 1515915625536060309, 1515915625489130836,
       1515915625502215189, 1515915625490038596, 1515915625475255066,
       1515915625468078191, 1515915625764103190, 1515915625577756131,
       1515915625489021270, 1515915625508189362, 1515915625470860261,
       1515915625746462163, 1515915625490580778, 1515915625471373850,
       1515915625487445691, 1515915625500232103, 1515915625472764618,
       1515915625503565704, 1515915625488505627, 1515915625503481786,
       1515915625551

Клиенты для рассылки выбраны