In [4]:
import pandas as pd

items = pd.read_csv("synthetic_data/items.csv")
receipts = pd.read_csv("synthetic_data/receipts.csv")

In [5]:
receipts = receipts.drop(columns=['date'])
df = pd.merge(items, receipts, on="receipt_id")

In [6]:
df.to_csv('final_dataset.csv', index=False)

In [23]:
mask = (df['category'] == 'алкоголь') & (df['date'].dt.to_period('M') == '2025-12')
total_qty = df.loc[mask, 'qty'].sum()
print(total_qty)

35


In [21]:
df.describe()


Unnamed: 0,item_id,date,price,qty,line_total,n_items,discount,total
count,6964.0,6964,6964.0,6964.0,6964.0,6964.0,6964.0,6964.0
mean,3482.5,2025-01-07 16:55:29.121194752,99.898502,1.251723,124.331798,4.3139,2.597846,535.737346
min,1.0,2024-01-01 00:00:00,12.0,1.0,12.0,1.0,0.0,12.09
25%,1741.75,2024-07-14 00:00:00,25.2675,1.0,31.9375,3.0,0.0,301.35
50%,3482.5,2025-01-03 00:00:00,66.05,1.0,72.81,5.0,0.0,494.03
75%,5223.25,2025-07-16 00:00:00,157.5875,2.0,177.935,6.0,0.0,730.95
max,6964.0,2025-12-29 00:00:00,374.93,2.0,746.46,6.0,213.77,1932.84
std,2010.47797,,90.430623,0.434034,126.765303,1.493623,14.146403,320.616539


In [62]:
df.groupby('category')['line_total'].sum().sort_values(ascending=False)


category
товари для тварин    322279.05
бакалія              144176.09
молочні              106380.24
напої                 78828.30
м'ясо                 49556.05
солодощі              43036.37
злічки                39230.59
алкоголь              25062.14
фрукти                19207.02
сніданки              16024.24
молоко                12536.48
хліб                   9530.07
Name: line_total, dtype: float64

In [41]:
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
spend_by_month = df.groupby(df['month'])['line_total'].sum().sort_values(ascending=False)
spend_by_month

month
8     82907.79
11    82234.55
10    76832.69
12    76368.99
9     74417.75
5     72372.28
1     71316.39
7     69284.64
2     69226.11
3     67112.01
4     62930.24
6     60843.20
Name: line_total, dtype: float64

In [None]:
spend_for_beer = df[df["name"].str.contains('пиво', case=False)]['line_total'].sum()
print(spend_for_beer)
spend_for_milk = df[df['name'].str.contains('молоко', case=False)]['line_total'].sum()
print(spend_for_milk)

25062.14
12536.48


In [54]:
import sqlite3

conn = sqlite3.connect('data.db')

df.to_sql('df', conn, index=False)

6964

In [56]:
df.head()

Unnamed: 0,item_id,receipt_id,date,name,price,qty,line_total,category,n_items,discount,total,month
0,1,R000001,2025-10-16,МОЛОКО 1 Л 0.5+0.5,22.09,1,22.09,молоко,1,0.0,22.09,10
1,2,R000002,2024-03-30,корм собак 2кг,315.77,1,315.77,товари для тварин,5,0.0,831.27,3
2,3,R000002,2024-03-30,твердий сир 0.5+0.5,174.64,1,174.64,молочні,5,0.0,831.27,3
3,4,R000002,2024-03-30,Молоко 1 л брендовий,17.94,1,17.94,молоко,5,0.0,831.27,3
4,5,R000002,2024-03-30,корм котячий,236.27,1,236.27,товари для тварин,5,0.0,831.27,3


In [61]:
query = """
SELECT category, SUM(line_total) as total_spent
FROM df
GROUP BY category
ORDER BY total_spent DESC
LIMIT 5;
"""

top_categories = pd.read_sql_query(query, conn)
print(top_categories)


            category  total_spent
0  товари для тварин    322279.05
1            бакалія    144176.09
2            молочні    106380.24
3              напої     78828.30
4              м'ясо     49556.05


In [68]:
df.columns

Index(['item_id', 'receipt_id', 'date', 'name', 'price', 'qty', 'line_total',
       'category', 'n_items', 'discount', 'total', 'month'],
      dtype='object')

In [67]:
query = """
WITH counts AS (
    SELECT n_items, COUNT(*) as num_receipts
    FROM df
    GROUP BY n_items
), most_common AS (
    SELECT n_items
    FROM counts
    WHERE num_receipts = (SELECT MAX(num_receipts) FROM counts)
)
SELECT *
FROM df
WHERE n_items IN (SELECT n_items FROM most_common);
"""
top_5_bills = pd.read_sql_query(query, conn)
print(top_5_bills)

      item_id receipt_id                 date                        name  \
0          10    R000004  2025-11-30 00:00:00               БАТОН 0.5+0.5   
1          11    R000004  2025-11-30 00:00:00  олія оливкова 500мл економ   
2          12    R000004  2025-11-30 00:00:00        МОЛОКО UHT 1Л ЕКОНОМ   
3          13    R000004  2025-11-30 00:00:00                       ТАБАК   
4          14    R000004  2025-11-30 00:00:00                      йогурт   
...       ...        ...                  ...                         ...   
1975     6955    R001999  2024-10-04 00:00:00                  шинка 200г   
1976     6956    R001999  2024-10-04 00:00:00               Молоко UHT 1л   
1977     6957    R001999  2024-10-04 00:00:00                 DogFood 2kg   
1978     6958    R001999  2024-10-04 00:00:00                       табак   
1979     6959    R001999  2024-10-04 00:00:00                    ПИВО 0.5   

       price  qty  line_total           category  n_items  discount    tota