In [96]:
import sys
import pandas as pd
import numpy as np
import datetime
import math

# Input CSV files that are loaded just below.
train_path = './data/train_v2.csv'
store_path = './data/store.csv'

# Output CSV files written by the later steps of this script.
mean_features_path = './train/mean.csv'
mean_features_by_day_path = './train/mean_by_day.csv'
final_train_data = './train/train_dataset.csv'

# Load both inputs as comma-separated tables.
train_df = pd.read_csv(train_path, sep=',')
store_df = pd.read_csv(store_path, sep=',')

# Getting features

Получаем средние значения отношений продаж к покупателям по каждому магазину

In [89]:
def get_mean_ratio_sales_customers_by_store(df):
    """Return the mean Sales/Customers ratio of every store.

    Only rows where the store is open (``Open == 1``) and had at least one
    customer contribute to the mean.  Rows with zero customers are skipped
    because their ratio is 0/0 or x/0, which would turn the whole store mean
    into NaN or inf.

    Args:
        df: DataFrame with 'Store', 'Open', 'Sales' and 'Customers' columns.

    Returns:
        DataFrame with the columns 'Store' and 'Mean', sorted by store, with
        one row per store that has at least one usable row.  Stores without
        any usable row are omitted.
    """
    usable = df.loc[(df['Open'] == 1) & (df['Customers'] > 0)]
    ratios = usable['Sales'] / usable['Customers']
    # Grouping by the 'Store' series names the resulting index 'Store', so
    # reset_index() yields exactly the columns ['Store', 'Mean'].
    return ratios.groupby(usable['Store']).mean().rename('Mean').reset_index()
        
# Compute the per-store mean Sales/Customers ratio and save it as CSV.
# The DataFrame index is written too, since to_csv is called with its defaults.
mean_features_df = get_mean_ratio_sales_customers_by_store(train_df)
mean_features_df.to_csv(mean_features_path, sep=',')

Получаем средние значения отношений продаж к покупателям по каждому магазину и дню недели

In [90]:
def get_mean_ratio_sales_customers_by_store_and_date(df):
    """Return the mean Sales/Customers ratio per store and day of week.

    Only rows where the store is open (``Open == 1``) and had at least one
    customer contribute.  The ratios are taken from the rows of the
    (store, day-of-week) group itself, and every day-of-week value present in
    the data is covered rather than a fixed 1..6 range.

    Args:
        df: DataFrame with 'Store', 'DayOfWeek', 'Open', 'Sales' and
            'Customers' columns.

    Returns:
        DataFrame with the columns 'Store', 'DayOfWeek' and 'Mean', sorted by
        store and then day of week, with one row per combination that has at
        least one usable row.  Combinations without usable rows are omitted.
    """
    usable = df.loc[(df['Open'] == 1) & (df['Customers'] > 0)]
    ratios = usable['Sales'] / usable['Customers']
    grouped = ratios.groupby([usable['Store'], usable['DayOfWeek']])
    # The group keys become index levels named 'Store' and 'DayOfWeek';
    # reset_index() turns them back into ordinary columns.
    return grouped.mean().rename('Mean').reset_index()

# Compute the mean Sales/Customers ratio per store and day of week and save it
# as CSV (index included, since to_csv is called with its defaults).
mean_features_by_day_df = get_mean_ratio_sales_customers_by_store_and_date(train_df)
mean_features_by_day_df.to_csv(mean_features_by_day_path, sep=',')

# Getting train dataset

Собираем все фичи в один датасет: включаем полученные выше средние значения и добавляем фичи, вычисленные из даты: количество дней проведения акции подряд, проводится ли акция Promo2 в настоящий момент, количество дней до следующей акции Promo2 (если она сейчас не проводится) и количество дней с момента запуска Promo2.

Данных мало - заменяем дату на порядковый номер дня в месяце

In [None]:
def monthToNum(month):
    """Translate an abbreviated month name into its number (1-12).

    The accepted spellings are three-letter English abbreviations, except
    September, which is spelled 'Sept'.  Any other value raises KeyError.
    """
    abbreviations = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec',
    )
    numbers = {name: number for number, name in enumerate(abbreviations, start=1)}
    return numbers[month]

def _is_holiday(value):
    """Return True unless *value* is the "no holiday" marker.

    The marker can be the string '0' or the number 0 depending on how pandas
    inferred the column, so the text form of the value is compared.
    """
    return str(value).strip() not in ('0', '0.0')


def _days_to_next_promo2(current_datetime, promo2_months):
    """Return the days until the first day of the next Promo2 month.

    Next year's months are considered as well, so a date after the last
    Promo2 month of the year still has a successor.  Returns None when
    *promo2_months* is empty.
    """
    starts = (
        datetime.datetime(year, int(month), 1)
        for year in (current_datetime.year, current_datetime.year + 1)
        for month in promo2_months
    )
    return min(
        ((start - current_datetime).days for start in starts if start > current_datetime),
        default=None,
    )


def get_train_dataset(train_df, store_df, mean_features_df, mean_features_by_day):
    """Build the training dataset: one output row per open store day.

    For every store found in ``train_df`` the rows are walked in reverse file
    order and running counters are kept for consecutive open days with
    Promo, a state holiday or a school holiday.
    NOTE(review): this presumably relies on the file being sorted newest
    first, so that the walk is chronological - confirm against the data.

    Args:
        train_df: per-day data with the columns 'Store', 'Date' (string in
            ``YYYY-MM-DD`` form), 'DayOfWeek', 'Open', 'Sales', 'Customers',
            'Promo', 'StateHoliday' and 'SchoolHoliday'.
        store_df: per-store data with the columns 'Store', 'StoreType',
            'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceYear',
            'CompetitionOpenSinceMonth', 'Promo2', 'Promo2SinceYear',
            'Promo2SinceWeek' and 'PromoInterval'.
        mean_features_df: columns 'Store' and 'Mean'.
        mean_features_by_day: columns 'Store', 'DayOfWeek' and 'Mean'.

    Returns:
        DataFrame with one row per open day, ordered by store and then by
        reversed input order.  'Mean' and 'MeanByDay' are 0 when no matching
        feature row exists.

    Raises:
        IndexError: if a store of ``train_df`` has no row in ``store_df``.
    """
    # Build the lookups once instead of filtering the feature frames per row.
    # drop_duplicates keeps the first match, like the former ``iloc[0]``.
    mean_by_store = (
        mean_features_df.drop_duplicates(subset='Store')
        .set_index('Store')['Mean']
        .to_dict()
    )
    mean_by_store_day = (
        mean_features_by_day.drop_duplicates(subset=['Store', 'DayOfWeek'])
        .set_index(['Store', 'DayOfWeek'])['Mean']
        .to_dict()
    )

    train = []
    for store_id, data in train_df.groupby('Store', sort=True):
        day_after_close = 0
        promo_days = 0
        stateholiday_days = 0
        schoolholiday_days = 0
        info = store_df.loc[store_df['Store'] == store_id].iloc[0]

        # Promo2 settings of this store.
        promo2_is_enabled = info['Promo2']
        promo2_months = []
        promo2_datetime = None
        if promo2_is_enabled == 1:
            promo2_months = [monthToNum(name) for name in info['PromoInterval'].split(',')]
            # Monday of the week in which the store joined Promo2.
            promo2_datetime = datetime.datetime.strptime(
                f"{int(info['Promo2SinceYear'])}-W{int(info['Promo2SinceWeek'])}-1",
                "%Y-W%W-%w",
            )

        # Competitor opening date; usable only when year AND month are known.
        competition_datetime = None
        competition_year = info['CompetitionOpenSinceYear']
        competition_month = info['CompetitionOpenSinceMonth']
        if not (pd.isna(competition_year) or pd.isna(competition_month)):
            competition_datetime = datetime.datetime(
                int(competition_year), int(competition_month), 1
            )

        for row in reversed(data.to_dict('records')):
            store_is_open = row['Open']
            if store_is_open != 1:
                # Remember the closure so the next open day can be flagged.
                day_after_close = 1
                continue

            current_datetime = datetime.datetime.strptime(row['Date'], "%Y-%m-%d")
            day_of_month = current_datetime.day

            # Running streaks; any open day without the flag resets them.
            stateholiday_days = stateholiday_days + 1 if _is_holiday(row['StateHoliday']) else 0
            schoolholiday_days = schoolholiday_days + 1 if _is_holiday(row['SchoolHoliday']) else 0
            promo_days = promo_days + 1 if row['Promo'] == 1 else 0

            # Promo2 features.
            promo2_is_active = 0
            promo2_days = 0
            promo2_days_to_next = 0
            promo2_all_days = 0
            if promo2_is_enabled == 1:
                # NOTE: neither value is checked against the Promo2 start
                # date, so this count is negative for earlier days.
                promo2_all_days = (current_datetime - promo2_datetime).days
                if current_datetime.month in promo2_months:
                    promo2_is_active = 1
                    promo2_days = day_of_month
                else:
                    days_to_next = _days_to_next_promo2(current_datetime, promo2_months)
                    # Only the last two weeks before a Promo2 month are reported.
                    if days_to_next is not None and days_to_next < 15:
                        promo2_days_to_next = days_to_next

            # Competitor features stay 0 until the competitor has opened.
            competition_days = 0
            competition_distance = 0
            if competition_datetime is not None and current_datetime > competition_datetime:
                competition_days = (current_datetime - competition_datetime).days
                competition_distance = info['CompetitionDistance']

            # Everything else is copied as is.
            train.append(
                {
                    "Store": store_id,
                    "DayOfWeek": row['DayOfWeek'],
                    "DayOfMonth": day_of_month,
                    "Open": store_is_open,
                    "Sales": row['Sales'],
                    "Customers": row['Customers'],
                    "Promo": row['Promo'],
                    "PromoDays": promo_days,
                    "StateHoliday": row['StateHoliday'],
                    "StateHolidayDays": stateholiday_days,
                    "SchoolHoliday": row['SchoolHoliday'],
                    "SchoolHolidayDays": schoolholiday_days,
                    "StoreType": info['StoreType'],
                    "Assortment": info['Assortment'],
                    "CompetitionDistance": competition_distance,
                    "CompetitionDays": competition_days,
                    "Promo2": promo2_is_enabled,
                    "Promo2Active": promo2_is_active,
                    "Promo2Days": promo2_days,
                    "Promo2DaysToNext": promo2_days_to_next,
                    "Promo2AllDays": promo2_all_days,
                    "DayAfterClose": day_after_close,
                    "Mean": mean_by_store.get(store_id, 0),
                    "MeanByDay": mean_by_store_day.get((store_id, row['DayOfWeek']), 0),
                }
            )
            day_after_close = 0
    return pd.DataFrame(train)
                                                         
# Assemble the final training dataset from the loaded tables and the mean
# features computed above, then save it as CSV (index included by default).
df = get_train_dataset(train_df, store_df, mean_features_df, mean_features_by_day_df)
df.to_csv(final_train_data, sep=',')