In [1]:
# https://pf.mgcom.ru/task/1433882

In [2]:
# Step 1: imports
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime, timedelta
from decimal import Decimal

In [16]:
# Step 4: run configuration
# Reporting window as 'YYYY-MM-DD' strings; the date filter keeps both the start and the end day.
START_DATE = '2024-05-01'
END_DATE = '2024-05-06'

# BZ files: AM event and installation exports (CSV)
PATH_AM_BZ_EVENTS = r'D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\BZ\Events\AM_bz_events_01.04-06.05.csv'
PATH_AM_BZ_INSTALLS = r'D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\BZ\Installs\AM_bz_installations_01.04-06.05.csv'


# Rigla files: AM event and installation exports (CSV)
PATH_AM_RIGLA_EVENTS = r'D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\Rigla\Events\AM_rigla_events_01.04-06.05.csv'
PATH_AM_RIGLA_INSTALLS = r'D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\Rigla\Installs\AM_rigla_installations_01.04-06.05.csv'

# Output directory for the generated CSV files
PATH_OUTPUT = r'D:\Work\Rigla__Bud_Zdorov\2024-09-12\Output'

In [17]:
# Step 2: helper functions
def save_df_to_csv(df, dir_to_save, name, separate_by='\t', dec=','):
    """Write ``df`` to a timestamped CSV file and return the file's path.

    The file is named ``<YYYY-MM-DD_HHMMSS>_<name>.csv`` and is placed in
    ``dir_to_save``, which is created (including parents) when it is missing.

    Args:
        df: DataFrame to export; its index is not written.
        dir_to_save: Target directory. An empty string resolves to the
            current directory.
        name: File-name suffix placed after the timestamp.
        separate_by: Field delimiter passed to ``DataFrame.to_csv``.
        dec: Decimal separator passed to ``DataFrame.to_csv``.

    Returns:
        Path of the written CSV file.
    """
    target_dir = os.path.normpath(dir_to_save)
    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(target_dir, exist_ok=True)

    ts = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    csv_path = os.path.join(target_dir, ts + '_' + name + '.csv')
    df.to_csv(csv_path, index=False, sep=separate_by, decimal=dec)
    print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: dataFrame saved to {csv_path}")
    return csv_path

def print_ts(message):
    """Print ``message`` prefixed with the current local time (``YYYY-MM-DD HH:MM:SS``)."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{stamp}: {message}")

# Resolve the install source of every unique appmetrica_device_id
def get_install_sources(df):
    """Return the earliest install per brand / publisher / tracker / device.

    Args:
        df: Install rows with at least the columns ``brand``,
            ``publisher_name``, ``tracker_name``, ``appmetrica_device_id``,
            ``install_datetime`` (``%Y-%m-%d %H:%M:%S`` strings or datetimes)
            and ``is_reinstallation``. The frame is not modified.

    Returns:
        A new DataFrame with the columns ``brand_install``,
        ``publisher_name``, ``tracker_name``, ``appmetrica_device_id``,
        ``install_datetime`` and ``is_reinstallation``. It holds the rows whose
        ``install_datetime`` equals the minimum of their group; missing
        publishers are labelled ``'organic'``.
    """
    group_cols = ['brand', 'publisher_name', 'tracker_name', 'appmetrica_device_id']
    key_cols = group_cols + ['install_datetime']

    # Parse timestamps (e.g. 2024-02-01 00:18:25) on a new frame so the
    # caller's data is left untouched.
    installs = df.assign(
        install_datetime=pd.to_datetime(df['install_datetime'], format='%Y-%m-%d %H:%M:%S')
    )

    # Earliest install per group; dropna=False keeps groups whose publisher or
    # tracker is missing.
    first_installs = (
        installs
        .groupby(group_cols, as_index=False, dropna=False)
        .agg({'install_datetime': 'min'})
    )

    # The inner merge keeps only the rows that carry the group's minimum timestamp.
    sources = installs.merge(first_installs, on=key_cols, how='inner')
    sources = sources[key_cols + ['is_reinstallation']].rename(columns={'brand': 'brand_install'})

    # Installs without a publisher are labelled as organic. Plain assignment is
    # used because a chained in-place fillna does not update the frame under
    # pandas Copy-on-Write.
    sources['publisher_name'] = sources['publisher_name'].fillna('organic')
    return sources

# Expand a JSON string column into separate columns
def expand_json(xdf, json_field):
    """Parse the JSON strings in ``json_field`` and append their fields as columns.

    Nested objects are flattened by ``pandas.json_normalize``. Rows whose
    payload is missing (None/NaN/``pd.NA``) or blank contribute no fields,
    so their expanded columns are NaN.

    Args:
        xdf: Source DataFrame; it is not modified.
        json_field: Name of the column that holds the JSON strings.

    Returns:
        A deep copy of ``xdf`` with its index reset to 0..n-1 and the expanded
        JSON columns joined on that index.
    """
    def _parse(raw):
        # json.loads raises on non-string input and on empty text, so missing
        # and blank payloads are mapped to an empty record instead.
        if raw is None or raw is pd.NA or (isinstance(raw, float) and raw != raw):
            return {}
        if isinstance(raw, str) and not raw.strip():
            return {}
        return json.loads(raw)

    # The index is reset so it lines up with the positional index of the
    # normalised records.
    df = xdf.copy(deep=True).reset_index(drop=True)
    df_json = pd.json_normalize([_parse(raw) for raw in df[json_field]])
    return df.join(df_json)


# Read the data, drop duplicate rows and tag the brand
def get_csv_raw_data(csv_file, sep=';', brand=None):
    """Load a CSV file as strings, remove duplicate rows and tag its origin.

    Args:
        csv_file: Path of the CSV file (UTF-8).
        sep: Field delimiter of the file.
        brand: Value written to the ``brand`` column of every row.

    Returns:
        DataFrame with every source column read as ``object``, exact
        duplicate rows removed, plus the columns ``file`` (the source path)
        and ``brand``.
    """
    print_ts('Loading CSV - ' + csv_file)
    loaded = pd.read_csv(csv_file, sep=sep, encoding='utf8', dtype=object)
    # Duplicates are removed before the tag columns are added.
    tagged = loaded.drop_duplicates().assign(file=csv_file, brand=brand)
    print_ts('Done!')
    return tagged

# Restrict the data to the requested date window
def filter_data_to_date_bounds(xdf, event_dt_column, start_date, end_date):
    """Keep the rows whose timestamp lies between ``start_date`` and ``end_date``.

    Both bounds are ``YYYY-MM-DD`` strings and both days are included: the
    upper limit is the midnight that follows ``end_date``, exclusive. Row
    counts and the covered time range are logged before and after filtering.

    Args:
        xdf: Source DataFrame; it is not modified.
        event_dt_column: Name of the timestamp column
            (``%Y-%m-%d %H:%M:%S`` strings or datetimes).
        start_date: First day to keep.
        end_date: Last day to keep.

    Returns:
        A filtered copy with ``event_dt_column`` parsed to datetime and the
        index reset.
    """
    summary = 'Field:{0}, {1} events from {2} to {3}'

    print_ts('Checking dates in {0}'.format(event_dt_column))
    parsed = xdf.copy()
    parsed[event_dt_column] = pd.to_datetime(parsed[event_dt_column], format='%Y-%m-%d %H:%M:%S')
    stamps = parsed[event_dt_column]
    rows_before = parsed.shape[0]
    print_ts(summary.format(event_dt_column, rows_before, stamps.min(), stamps.max()))

    lower = datetime.strptime(start_date, "%Y-%m-%d")
    # One extra day so that the whole of end_date is covered.
    upper = datetime.strptime(end_date, "%Y-%m-%d") + timedelta(days=1)
    within_bounds = (stamps >= lower) & (stamps < upper)
    kept = parsed[within_bounds]

    print_ts('Filtering...')
    kept_stamps = kept[event_dt_column]
    print_ts(summary.format(event_dt_column, kept.shape[0], kept_stamps.min(), kept_stamps.max()))
    print_ts('{0} events filtered'.format(rows_before - kept.shape[0]))
    return kept.reset_index(drop=True)

# Load the AppMetrica data
def get_am_data(bz_events, bz_installs, rigla_events, rigla_installs,
                start_date=None, end_date=None):
    """Load, filter and combine AppMetrica events and installs for both brands.

    Steps: read the four CSV exports, restrict them to the date window,
    derive install sources per device, expand ``event_json`` into columns,
    attach the install sources to the events and concatenate both brands.

    Args:
        bz_events: Path of the budzdorov.ru events CSV.
        bz_installs: Path of the budzdorov.ru installs CSV.
        rigla_events: Path of the rigla.ru events CSV.
        rigla_installs: Path of the rigla.ru installs CSV.
        start_date: First day to keep (``YYYY-MM-DD``); defaults to the
            notebook-level ``START_DATE``.
        end_date: Last day to keep (``YYYY-MM-DD``); defaults to the
            notebook-level ``END_DATE``.

    Returns:
        One DataFrame of events for both brands. Events without a matching
        install get ``publisher_name == 'PUBLISHER_UNKNOWN'``. The columns
        ``event_date`` and ``event_month`` are derived from ``event_datetime``.
    """
    # Fall back to the notebook-level reporting window when no bounds are given.
    start_date = START_DATE if start_date is None else start_date
    end_date = END_DATE if end_date is None else end_date

    # Read the raw exports
    am_events_bz = get_csv_raw_data(bz_events, sep=',', brand='budzdorov.ru')
    am_installs_bz = get_csv_raw_data(bz_installs, sep=',', brand='budzdorov.ru')
    am_events_rigla = get_csv_raw_data(rigla_events, sep=',', brand='rigla.ru')
    am_installs_rigla = get_csv_raw_data(rigla_installs, sep=',', brand='rigla.ru')

    # Restrict everything to the date window
    am_events_bz = filter_data_to_date_bounds(am_events_bz, 'event_datetime', start_date, end_date)
    am_installs_bz = filter_data_to_date_bounds(am_installs_bz, 'install_datetime', start_date, end_date)
    am_events_rigla = filter_data_to_date_bounds(am_events_rigla, 'event_datetime', start_date, end_date)
    am_installs_rigla = filter_data_to_date_bounds(am_installs_rigla, 'install_datetime', start_date, end_date)

    # Resolve the install source of every unique appmetrica_device_id
    print_ts('Получаем истоники для каждого уникального appmetrica_device_id')
    bz_install_sources = get_install_sources(am_installs_bz)
    rigla_install_sources = get_install_sources(am_installs_rigla)

    # Expand the JSON payload into columns
    print_ts('Получаем значения из JSON')
    bz_json = expand_json(am_events_bz, 'event_json')
    rigla_json = expand_json(am_events_rigla, 'event_json')

    # Attach publishers to the events.
    # NOTE(review): install sources are grouped per brand / publisher /
    # tracker / device, so a device with several publishers or trackers
    # produces several rows per event after this merge. Confirm this is intended.
    print_ts('Добавляем паблишеров')
    bz_publishers = bz_json.merge(bz_install_sources, how='left', on='appmetrica_device_id')
    rigla_publishers = rigla_json.merge(rigla_install_sources, how='left', on='appmetrica_device_id')

    # Combine both brands
    print_ts('Объединяем данные')
    df = pd.concat([bz_publishers, rigla_publishers]).reset_index(drop=True)
    # Plain assignment is used because a chained in-place fillna does not
    # update the frame under pandas Copy-on-Write.
    df['publisher_name'] = df['publisher_name'].fillna('PUBLISHER_UNKNOWN')

    df['event_datetime'] = pd.to_datetime(df['event_datetime'], format='%Y-%m-%d %H:%M:%S')
    df['event_date'] = pd.to_datetime(df['event_datetime'].dt.date)
    df['event_month'] = df['event_datetime'].dt.to_period('M')
    return df

# Select AM orders by the minimum order timestamp within each month
# (the variant we consider correct)
def get_am_orders_min_event_datetime(df_orders):
    """Keep the rows with the earliest ``event_datetime`` per brand, month and device.

    Args:
        df_orders: Events with at least the columns ``brand``,
            ``event_month``, ``appmetrica_device_id`` and ``event_datetime``.
            The frame is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index that holds only the rows
        whose ``event_datetime`` equals the minimum of their
        (brand, event_month, appmetrica_device_id) group. Rows that tie on
        that minimum are all kept. The boolean column
        ``is_suitable_order_in_month`` is added and is True on every returned row.
    """
    group_cols = ['brand', 'event_month', 'appmetrica_device_id']

    # Earliest event per brand / month / device; dropna=False keeps groups
    # with a missing key.
    suitable_orders = (
        df_orders
        .groupby(group_cols, as_index=False, dropna=False)
        .agg({'event_datetime': 'min'})
    )
    suitable_orders['is_suitable_order_in_month'] = True

    # Flag the matching rows of the source frame so the selection can be checked later.
    df = df_orders.merge(suitable_orders, how='left', on=group_cols + ['event_datetime'])
    # Unmatched rows hold NaN after the left merge; notna() yields a real
    # boolean flag. A chained in-place fillna would be a no-op under pandas
    # Copy-on-Write and leave NaN in the mask.
    df['is_suitable_order_in_month'] = df['is_suitable_order_in_month'].notna()

    # Keep only the suitable orders
    return df[df['is_suitable_order_in_month']].reset_index(drop=True)

# Create the folder layout for convenience
def util_create_dirs_adhoc(root):
    """Create the ``AM/<BZ|Rigla>/<Events|Installs>`` folders under ``root``.

    Folders that already exist are left alone; each newly created folder is logged.
    """
    for brand_dir in ('BZ', 'Rigla'):
        for kind_dir in ('Events', 'Installs'):
            folder = os.path.join(root, 'AM', brand_dir, kind_dir)
            if os.path.exists(folder):
                continue
            print_ts(f'Creating {folder}')
            os.makedirs(folder)

In [18]:
# Step 3: create the AM/<brand>/<Events|Installs> folder tree under the given root.
# NOTE(review): this root differs from the D:\Work\... paths in the config cell; confirm it is the intended location.
util_create_dirs_adhoc(r'C:\Rigla_BZ\2024 June\2024-06-09')

In [19]:
# Step 5: load, filter and combine the AM data for both brands
am_data = get_am_data(PATH_AM_BZ_EVENTS, PATH_AM_BZ_INSTALLS, PATH_AM_RIGLA_EVENTS, PATH_AM_RIGLA_INSTALLS)

2024-09-12 16:46:01: Loading CSV - D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\BZ\Events\AM_bz_events_01.04-06.05.csv
2024-09-12 16:46:12: Done!
2024-09-12 16:46:12: Loading CSV - D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\BZ\Installs\AM_bz_installations_01.04-06.05.csv
2024-09-12 16:46:20: Done!
2024-09-12 16:46:20: Loading CSV - D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\Rigla\Events\AM_rigla_events_01.04-06.05.csv
2024-09-12 16:46:28: Done!
2024-09-12 16:46:28: Loading CSV - D:\Work\Rigla__Bud_Zdorov\2024-05-07\AM\Rigla\Installs\AM_rigla_installations_01.04-06.05.csv
2024-09-12 16:46:37: Done!
2024-09-12 16:46:37: Checking dates in event_datetime
2024-09-12 16:46:38: Field:event_datetime, 651413 events from 2024-04-01 00:00:15 to 2024-05-06 23:59:47
2024-09-12 16:46:38: Filtering...
2024-09-12 16:46:38: Field:event_datetime, 98009 events from 2024-05-01 00:00:04 to 2024-05-06 23:59:47
2024-09-12 16:46:38: 553404 events filtered
2024-09-12 16:46:39: Checking dates in install_datetime
2024-09-1

In [20]:
# Inspect the columns, non-null counts and dtypes of the combined frame.
am_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161892 entries, 0 to 161891
Data columns (total 54 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   application_id           161892 non-null  object        
 1   app_build_number         161892 non-null  object        
 2   ios_ifa                  15066 non-null   object        
 3   ios_ifv                  55196 non-null   object        
 4   android_id               0 non-null       object        
 5   google_aid               98802 non-null   object        
 6   profile_id               154749 non-null  object        
 7   os_name                  161892 non-null  object        
 8   os_version               161892 non-null  object        
 9   device_manufacturer      161892 non-null  object        
 10  device_model             161892 non-null  object        
 11  device_type              161892 non-null  object        
 12  device_locale   

In [21]:
# Keep the orders with the minimum event_datetime
am_orders_min_event_datetime = get_am_orders_min_event_datetime(am_data)
# Constant counter column: every kept row is marked with 1.
am_orders_min_event_datetime['am_unique_orders'] = int(1)
#print('Contains only unique orders at brand level: {0}'.format(~am_orders_min_event_datetime.duplicated(subset=['brand', 'order_id']).values.any()))

In [22]:
# Rows without an install match have no is_reinstallation value; label them 'false'.
# Plain assignment is used because a chained in-place fillna does not update
# the frame under pandas Copy-on-Write.
am_orders_min_event_datetime['is_reinstallation'] = (
    am_orders_min_event_datetime['is_reinstallation'].fillna('false')
)

In [23]:
# Export the selected orders as a comma-separated CSV with '.' as the decimal mark;
# the returned path is read back in a later cell.
result_path = save_df_to_csv(am_orders_min_event_datetime, PATH_OUTPUT, 'am_orders_for_dwh_matching_' + START_DATE + '__' + END_DATE, separate_by=',', dec='.')

2024-09-12 16:51:22: dataFrame saved to D:\Work\Rigla__Bud_Zdorov\2024-09-12\Output\2024-09-12_165117_am_orders_for_dwh_matching_2024-05-01__2024-05-06.csv


In [37]:
##########################################

In [24]:
# Read the exported file back (all columns as strings) and show the covered event_datetime range.
xdf = pd.read_csv(result_path, dtype=object)
pd.to_datetime(xdf['event_datetime']).agg(['min','max'])

min   2024-05-01 00:00:04
max   2024-05-06 23:59:52
Name: event_datetime, dtype: datetime64[ns]

In [25]:
# Preview the first rows of the re-read export.
xdf.head()

Unnamed: 0,application_id,app_build_number,ios_ifa,ios_ifv,android_id,google_aid,profile_id,os_name,os_version,device_manufacturer,...,product_list,brand_install,publisher_name,tracker_name,install_datetime,is_reinstallation,event_date,event_month,is_suitable_order_in_month,am_unique_orders
0,3412045,20242,,0F623236-6A47-47BF-81C3-88077B41B1F3,,,3782344,ios,17.3.1,Apple,...,"[{'name': 'Новокаин амп. 0,5% 5мл №10', 'sku':...",,PUBLISHER_UNKNOWN,,,False,2024-05-03 00:00:00,2024-05,True,1
1,3412045,20254,53F95A78-14BA-4589-9C62-DE6260E7706B,F297C9B3-29F9-4299-B0D1-7918F310FA25,,,624209,ios,17.4.1,Apple,...,"[{'name': 'Пиаскледин капс. 300мг №30', 'sku':...",,PUBLISHER_UNKNOWN,,,False,2024-05-03 00:00:00,2024-05,True,1
2,3412045,20242,,,,01443c01-7b26-4eef-9e55-a4ffeca0ed25,4413493,android,12,Redmi,...,"[{'name': 'Ксефокам таб.п/о плен. 8мг №10', 's...",,PUBLISHER_UNKNOWN,,,False,2024-05-03 00:00:00,2024-05,True,1
3,3412045,20256,,,,6deafc6f-47e0-43b4-8038-42ec72151858,1305597,android,12,INOI,...,"[{'name': 'Эликвис таб.п/о плен. 5мг №20', 'sk...",,PUBLISHER_UNKNOWN,,,False,2024-05-03 00:00:00,2024-05,True,1
4,3412045,20254,,,,6605cb01-2ebb-4254-99b5-e63a57c822ba,3462007,android,13,Samsung,...,"[{'name': 'Амоксициллин-АКОС таб. 500мг №20', ...",,PUBLISHER_UNKNOWN,,,False,2024-05-03 00:00:00,2024-05,True,1


In [26]:
# List the columns present in the exported file.
xdf.columns

Index(['application_id', 'app_build_number', 'ios_ifa', 'ios_ifv',
       'android_id', 'google_aid', 'profile_id', 'os_name', 'os_version',
       'device_manufacturer', 'device_model', 'device_type', 'device_locale',
       'device_ipv6', 'app_version_name', 'app_package_name', 'event_name',
       'event_json', 'event_datetime', 'event_timestamp',
       'event_receive_datetime', 'event_receive_timestamp', 'connection_type',
       'operator_name', 'original_device_model', 'mcc', 'mnc',
       'country_iso_code', 'city', 'appmetrica_device_id', 'installation_id',
       'session_id', 'windows_aid', 'file', 'brand', 'delivery', 'total',
       'value_category', 'promo_code_applied', 'bonuses_utilized', 'order_id',
       'quantity', 'basket_id', 'pharmacy', 'bonuses_amount',
       'loyalty_program', 'product_list', 'brand_install', 'publisher_name',
       'tracker_name', 'install_datetime', 'is_reinstallation', 'event_date',
       'event_month', 'is_suitable_order_in_month', 'am_u

In [27]:
# Print every field of the first exported row.
print(xdf.iloc[0])

application_id                                                          3412045
app_build_number                                                          20242
ios_ifa                                                                     NaN
ios_ifv                                    0F623236-6A47-47BF-81C3-88077B41B1F3
android_id                                                                  NaN
google_aid                                                                  NaN
profile_id                                                              3782344
os_name                                                                     ios
os_version                                                               17.3.1
device_manufacturer                                                       Apple
device_model                                                          iPhone 11
device_type                                                               phone
device_locale                           