In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [None]:
# Read the train / test parquet tables and the dish_name -> item_category lookup CSV.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/kaggle/input/international-cp/data/train_dataset_hackaton2023_train.gzip',
        '/kaggle/input/international-cp/data/hackaton2023_test.gzip',
    )
)
category = pd.read_csv('/kaggle/input/international-cp/parsing_categories.csv')

In [None]:
# Lookup table: dish_name -> item_category.
category_by_dish = category.set_index('dish_name')['item_category']
category_dict = category_by_dish.to_dict()

# Replace the raw dish names with their category in both frames
# (names that are absent from the lookup become NaN).
for frame in (train, test):
    frame['dish_name'] = frame['dish_name'].map(category_dict)

In [None]:
# Distinct (already category-mapped) dish names, in order of first appearance in train.
uniq_categ = train['dish_name'].unique().tolist()

# Positional index <-> category label, in both directions.
ind_to_label = dict(enumerate(uniq_categ))
label_to_ind = {label: ind for ind, label in ind_to_label.items()}

In [None]:
# customer_id -> target value lookups. When a customer has several rows,
# the value from the last row wins (plain dict semantics).
train_by_customer = train.set_index('customer_id')
dates_dict = train_by_customer['date_diff_post'].to_dict()
buys_dict = train_by_customer['buy_post'].to_dict()
del train_by_customer  # do not keep a second copy of the training frame alive

In [None]:
from joblib import Parallel, delayed

def process_datetime_group(datetime_group):
    """Summarise the rows that share a single ``startdatetime``.

    Returns a 3-tuple ``(revenue_sum, format_name, dish_names)``:
    the summed ``revenue`` column, the ``format_name`` of the first row,
    and all ``dish_name`` values as a list in row order.
    """
    return (
        datetime_group['revenue'].sum(),
        datetime_group['format_name'].iloc[0],
        datetime_group['dish_name'].tolist(),
    )

def process_customer_group(customer_id, customer_group):
    """Collapse one customer's rows into per-visit lists.

    Rows are grouped by ``startdatetime`` in order of first appearance
    (``sort=False``) and every group is summarised with
    ``process_datetime_group``.

    Returns ``(customer_id, record)`` where ``record`` maps
    ``'unique_datetimes'``, ``'revenue'``, ``'format_name'`` and
    ``'dish_name'`` to lists holding one entry per distinct ``startdatetime``.
    """
    visits = customer_group.groupby('startdatetime', sort=False)

    per_visit = []
    for _, visit_rows in visits:
        per_visit.append(process_datetime_group(visit_rows))

    # Transpose [(revenue, format, dishes), ...] into three parallel lists.
    revenue, format_name, dish_name = (list(field) for field in zip(*per_visit))

    record = {
        'unique_datetimes': visits.size().index.tolist(),  # keeps the full timestamp
        'revenue': revenue,
        'format_name': format_name,
        'dish_name': dish_name,
    }
    return customer_id, record


In [None]:
# --- train: one row per customer, each cell holding a per-visit list ---
train['startdatetime'] = pd.to_datetime(train['startdatetime'])
customer_groups = train.groupby('customer_id')
processed_data = Parallel(n_jobs=-1)(
    delayed(process_customer_group)(cust_id, cust_rows)
    for cust_id, cust_rows in tqdm(customer_groups)
)
final_train = dict(processed_data)  # (customer_id, record) pairs -> {customer_id: record}

# Outer dict keys become columns, so transpose to index the frame by customer_id.
new_train = pd.DataFrame(final_train).T

# --- test: same pipeline ---
test['startdatetime'] = pd.to_datetime(test['startdatetime'])
customer_groups = test.groupby('customer_id')
processed_data = Parallel(n_jobs=-1)(
    delayed(process_customer_group)(cust_id, cust_rows)
    for cust_id, cust_rows in tqdm(customer_groups)
)
final_test = dict(processed_data)

new_test = pd.DataFrame(final_test).T

In [None]:

# `new_train` is built as pd.DataFrame(final_train).T, so the customer ids live in
# its index; there is no 'customer_id' column and indexing by that name raises
# KeyError. Map the targets over the index instead.
customer_ids = new_train.index.to_series()
new_train['date_diff_post'] = customer_ids.map(dates_dict)
new_train['buy_post'] = customer_ids.map(buys_dict)

In [None]:
import pandas as pd

def extract_datetime_stats(timestamps):
    """Break a sequence of visit timestamps into raw calendar components.

    Returns a ``pd.Series`` with four list-valued entries:

    * ``days_of_week`` - ``weekday()`` of every timestamp
    * ``hours``        - hour of every timestamp
    * ``minutes``      - minute of every timestamp
    * ``days_diff``    - gap in (fractional) days between each pair of
      neighbouring timestamps, so it is one element shorter than the input
    """
    seconds_per_day = 60 * 60 * 24
    gaps_in_days = [
        (later - earlier).total_seconds() / seconds_per_day
        for earlier, later in zip(timestamps, timestamps[1:])
    ]

    return pd.Series({
        'days_of_week': [ts.weekday() for ts in timestamps],
        'hours': [ts.hour for ts in timestamps],
        'minutes': [ts.minute for ts in timestamps],
        'days_diff': gaps_in_days,
    })

# Expand every customer's visit timestamps into list-valued calendar features
# (train first, then test; each shows its own progress bar).
features_time_train, features_time_test = (
    customer_frame['unique_datetimes'].progress_apply(extract_datetime_stats)
    for customer_frame in (new_train, new_test)
)

In [None]:
import statistics
def safe_mode(lst):
    """Return the most frequent value in ``lst``.

    Ties are resolved in favour of the value that occurs first in ``lst``
    (the same result ``statistics.mode`` gives on Python >= 3.8).
    An empty input returns ``nan`` instead of raising.

    The previous ``try/except StatisticsError`` fallback could never succeed:
    on Python >= 3.8 ``statistics.mode`` only raises for empty data, for which
    ``min(statistics.multimode(lst))`` then fails with ``ValueError``; on older
    versions ``statistics.multimode`` does not exist.
    """
    modes = statistics.multimode(lst)  # all most-common values, in first-seen order
    return modes[0] if modes else float('nan')

In [None]:
def compute_statistics_for_column(column):
    """Summarise every list stored in ``column``.

    Parameters
    ----------
    column : pd.Series
        Series whose cells are lists of numbers.

    Returns
    -------
    pd.DataFrame
        One row per cell with the columns ``len``, ``max``, ``min``, ``mean``
        and ``mode``. For an empty list ``len`` is 0 and the other statistics
        are NaN.
    """
    stat_names = ['len', 'max', 'min', 'mean', 'mode']

    def summarise(values):
        count = len(values)
        if count == 0:
            # Happens e.g. for `days_diff` of a customer with a single visit:
            # max()/min() and the division below would raise on an empty list.
            nan = float('nan')
            return pd.Series([0, nan, nan, nan, nan], index=stat_names)
        return pd.Series(
            [count, max(values), min(values), sum(values) / count, safe_mode(values)],
            index=stat_names,
        )

    return column.progress_apply(summarise)

def compute_statistics_for_dataframe(df):
    """Append ``<column>_<stat>`` summary columns for every column of ``df``.

    Each column present on entry is passed to ``compute_statistics_for_column``
    and the resulting statistics are written back as new, prefixed columns.
    ``df`` is modified in place and is also returned.
    """
    # Snapshot of the columns present on entry; the ones added below are not revisited.
    source_columns = list(df.columns)

    for col in source_columns:
        col_stats = compute_statistics_for_column(df[col])

        # New columns carry the source column name as a prefix.
        for stat, stat_values in col_stats.items():
            df[f'{col}_{stat}'] = stat_values

    return df

In [None]:
# List-valued helper columns that are removed once their summary statistics exist.
raw_time_columns = ['days_of_week', 'hours', 'minutes', 'days_diff']

# --- train ---
features_time_full_train = compute_statistics_for_dataframe(features_time_train)
features_time_full_train.drop(columns=raw_time_columns, inplace=True)
full_train = pd.concat([new_train, features_time_full_train], axis=1)

# --- test ---
features_time_full_test = compute_statistics_for_dataframe(features_time_test)
features_time_full_test.drop(columns=raw_time_columns, inplace=True)
full_test = pd.concat([new_test, features_time_full_test], axis=1)

In [None]:
revenue_feature_names = ['revenue_count', 'revenue_total', 'revenue_max', 'revenue_min', 'revenue_mode']


def _revenue_summary(visit_revenues):
    """Count, total, max, min and mode of one customer's per-visit revenues."""
    return pd.Series([
        len(visit_revenues),
        sum(visit_revenues),
        max(visit_revenues),
        min(visit_revenues),
        safe_mode(visit_revenues),
    ])


# --- train ---
features_revenue_train = full_train['revenue'].progress_apply(_revenue_summary)
features_revenue_train.columns = revenue_feature_names
# --- test ---
features_revenue_test = full_test['revenue'].progress_apply(_revenue_summary)
features_revenue_test.columns = revenue_feature_names

In [None]:
# Attach the revenue summaries column-wise.
# NOTE: the cell rebinds its own inputs, so running it twice duplicates the revenue columns.
full_train = pd.concat(objs=[full_train, features_revenue_train], axis='columns')
full_test = pd.concat(objs=[full_test, features_revenue_test], axis='columns')

In [None]:
# Display (rows, columns) of the test and train feature frames as a quick sanity check.
full_test.shape, full_train.shape

# Add TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def generate_tfidf_features(df, col, vectorizer=None):
    """Turn a column of item lists into a dense TF-IDF feature frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``col`` cells are iterables; the items of each cell are
        converted with ``str`` and joined with spaces into one document.
    col : str
        Name of the column to vectorize; also used in the output column names.
    vectorizer : TfidfVectorizer, optional
        Vectorizer to use. When omitted, a fresh ``TfidfVectorizer`` is fit on
        this column (the original behaviour). A vectorizer that has not been
        fit yet is fit here; an already fitted one is only applied, so passing
        the same instance for train and then test yields identical columns
        and IDF weights for both.

    Returns
    -------
    pd.DataFrame
        One ``tfidf_<col>_<term>`` column per vocabulary term, indexed like
        ``df`` so that a column-wise ``pd.concat`` with ``df`` lines rows up.
    """
    corpus = [' '.join(map(str, items)) for items in df[col]]

    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    if not hasattr(vectorizer, 'vocabulary_'):
        # `vocabulary_` only exists after fitting.
        vectorizer.fit(corpus)

    return pd.DataFrame(
        # `.toarray()` instead of the `.A` shortcut, which newer SciPy releases removed.
        vectorizer.transform(corpus).toarray(),
        # Reuse the caller's index: a default RangeIndex would misalign the rows
        # when the result is concatenated with `df` along axis=1.
        index=df.index,
        columns=[f'tfidf_{col}_{k}' for k in vectorizer.get_feature_names_out()],
    )

In [None]:
# TF-IDF frames for the two list-valued text columns.
# NOTE(review): every call fits its own TfidfVectorizer, so the train and test
# frames can end up with different columns and IDF weights - confirm this is
# acceptable before feeding both to one model.
tfidf_format_name_train, tfidf_dish_name_train = (
    generate_tfidf_features(full_train, text_col)
    for text_col in ('format_name', 'dish_name')
)

tfidf_format_name_test, tfidf_dish_name_test = (
    generate_tfidf_features(full_test, text_col)
    for text_col in ('format_name', 'dish_name')
)

In [None]:
# Append both TF-IDF blocks column-wise (format_name first, then dish_name).
full_data_train = pd.concat(
    [full_train, tfidf_format_name_train, tfidf_dish_name_train],
    axis=1,
)

full_data_test = pd.concat(
    [full_test, tfidf_format_name_test, tfidf_dish_name_test],
    axis=1,
)

In [None]:
# The list-valued source columns have been summarised above; keep only the derived features.
list_valued_columns = ['format_name', 'dish_name', 'unique_datetimes', 'revenue']
new_data_train = full_data_train.drop(columns=list_valued_columns)
new_data_test = full_data_test.drop(columns=list_valued_columns)

In [None]:
import joblib
# Persist the final feature frames for the modelling notebook.
joblib.dump(value=new_data_train, filename='/kaggle/working/new_train_features.pkl')
joblib.dump(value=new_data_test, filename='/kaggle/working/new_test_features.pkl')