In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import sqlite3
# Lift every pandas display limit so wide frames, long frames and long cell
# values are rendered in full instead of being truncated.
for option_name in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [3]:
# Load the raw sales table; the path is relative to the current working directory.
sales = pd.read_csv('data/sales.csv')

In [4]:
# Inspect the column names of the freshly loaded frame.
sales.columns

Index(['Unnamed: 0', 'id', 'quantity', 'name', 'price', 'fs_receipt_id',
       'fs_receipt_issue_date', 'org_id', 'org_ico', 'org_dic',
       'org_building_number', 'org_country', 'org_ic_dph', 'org_municipality',
       'org_postal_code', 'org_name', 'org_street_name', 'unit_id',
       'unit_building_number', 'unit_country', 'unit_municipality',
       'unit_postal_code', 'unit_property_registration_number',
       'unit_street_name', 'unit_name', 'ai_name_without_brand_and_quantity',
       'ai_name_in_english_without_brand_and_quantity', 'ai_brand',
       'ai_category', 'ai_quantity_value', 'ai_quantity_unit', 'unit_latitude',
       'unit_longitude'],
      dtype='object')

In [5]:
# Count missing values per column to see which fields need imputing.
sales.isna().sum()

Unnamed: 0                                          0
id                                                  0
quantity                                            0
name                                                0
price                                               0
fs_receipt_id                                       0
fs_receipt_issue_date                               0
org_id                                              0
org_ico                                             0
org_dic                                             0
org_building_number                              6857
org_country                                         0
org_ic_dph                                         71
org_municipality                                    0
org_postal_code                                  8434
org_name                                            0
org_street_name                                     0
unit_id                                             0
unit_building_number        

In [6]:
def clean_org_name(name: str) -> str:
    """Normalise an organisation name by stripping legal-form and country noise.

    Steps, in order:
      1. Remove legal-form markers ("spol. s r.o.", "s.r.o.", "a.s.", "v.o.s.")
         in any spacing/dot variation, case-insensitively.
      2. Remove country markers (Slovenská republika, Slovensko, SR, Slovakia, SK).
      3. Replace '.', ',' and '/' with spaces, collapse runs of whitespace and
         trim leading/trailing spaces, hyphens and underscores.
      4. Collapse every name that contains "COOP" to the single label
         "COOP Jednota".

    Args:
        name: Raw organisation name. Missing values (as judged by ``pd.isna``)
            are returned unchanged.

    Returns:
        The cleaned name.
    """
    if pd.isna(name):
        return name
    # The "spol. s r.o." alternative tolerates an optional dot/space after
    # every letter (e.g. "spol. s.r.o.", "spol. s r. o."). Without that, only
    # the trailing "s.r.o" part matched and a stray "spol" was left behind.
    name = re.sub(
        r'\b(spol\s*\.?\s*s\s*\.?\s*r\s*\.?\s*o\.?|s\s*\.?\s*r\s*\.?\s*o\.?|a\s*\.?\s*s\.?|v\s*\.?\s*o\s*\.?\s*s\.?)\b',
        '',
        name,
        flags=re.IGNORECASE
    )
    name = re.sub(r'\b(Slovenská republika|Slovensko|SR|Slovakia|SK)\b', '', name, flags=re.IGNORECASE)
    # Punctuation left over from the removed markers becomes whitespace, which
    # is then collapsed and trimmed.
    name = re.sub(r'[.,/]', ' ', name)
    name = re.sub(r'\s{2,}', ' ', name)
    name = name.strip(" -_")
    # Any name mentioning COOP is folded into one canonical label.
    if 'COOP' in name.upper():
        name = 'COOP Jednota'
    return name

In [7]:
# Price above which a row is flagged as luxury.
LUXURY_PRICE_THRESHOLD = 50

# Drop the leftover index column (presumably written by an earlier CSV export).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe
# to re-run: the in-place drop raised KeyError on a second execution because
# the column was already gone.
sales = sales.drop(columns='Unnamed: 0', errors='ignore')

# Shorter, analysis-friendly column names.
sales = sales.rename(columns={'fs_receipt_id':'receipt_id', 'fs_receipt_issue_date': 'issue_date', 'org_country': 'country',
                              'unit_municipality': 'city', 'unit_street_name': 'street', 'ai_name_without_brand_and_quantity': 'product_name',
                              'ai_name_in_english_without_brand_and_quantity': 'product_name_en', 'ai_brand': 'brand', 'ai_category': 'category',
                              'ai_quantity_value': 'quantity_value', 'ai_quantity_unit': 'quantity_unit', 'unit_latitude': 'latitude', 'unit_longitude': 'longitude'
                              })

# Fall back to the organisation's address where the unit's city/street is
# missing. Assignment aligns on the index, so only the masked rows are filled.
sales.loc[sales['city'].isna(), 'city'] = sales['org_municipality'].str.split('-').str[0].str.strip()
sales.loc[sales['street'].isna(), 'street'] = sales['org_street_name'].str.split('-').str[0].str.strip()

# Parse timestamps (unparseable values become NaT) and drop timezone info.
sales["issue_date"] = pd.to_datetime(sales["issue_date"],errors="coerce").dt.tz_localize(None)

# Calendar features derived from the issue date.
sales['month'] = sales['issue_date'].dt.month
sales['year'] = sales['issue_date'].dt.year
sales['month_year'] = sales['issue_date'].dt.to_period('M').astype(str)
sales['weekday'] = sales['issue_date'].dt.day_name()

# Per-row total (price x quantity); despite the name it is computed per row,
# not aggregated per receipt.
sales['receipt_price'] = sales['price'] * sales['quantity']

# Simple 0/1 flags. The 'bio'/'eco' checks are case-insensitive substring
# matches on the raw item name, so e.g. "decor" also sets is_eco.
sales['is_luxury'] = np.where(sales['price'] > LUXURY_PRICE_THRESHOLD, 1, 0)
sales['is_bio'] = np.where(sales['name'].str.contains('bio', na=False, case=False), 1, 0)
sales['is_eco'] = np.where(sales['name'].str.contains('eco', na=False, case=False), 1, 0)

sales['org_name'] = sales['org_name'].apply(clean_org_name)

# City normalisation: fold every Bratislava district into "Bratislava", keep
# only the part before a hyphen for the rest, then restore the one hyphenated
# town name that the split truncates.
sales.loc[sales['city'].str.contains('Bratislava', na=False, case=False), 'city'] = 'Bratislava'
sales['city'] = sales['city'].str.split('-').str[0].str.strip()
sales.loc[sales['city'].str.contains('Šaštín', na=False, case=False), 'city'] = 'Šaštín-Stráže'

In [8]:
# Impute missing values by dtype: text columns get the "Unknown" sentinel,
# numeric columns get 0.
# NOTE(review): every numeric column is zero-filled, which includes identifier
# and coordinate-like columns — confirm a 0 there is acceptable downstream.
text_columns = sales.select_dtypes(include=['object']).columns
sales[text_columns] = sales[text_columns].fillna("Unknown")

numeric_columns = sales.select_dtypes(include=['number']).columns
sales[numeric_columns] = sales[numeric_columns].fillna(0)

In [9]:
# Keyword lists used to fold the free-text `category` into five main groups.
#
# Matching is done by substring against text that has been lowercased and had
# '/', '-', '_' and ',' replaced with spaces, so every keyword here must be
# written in that same normalised form (lowercase, spaces instead of hyphens).
# Dict order matters: the first group with a matching keyword wins, which is
# why a keyword repeated in a later group (e.g. "bag") never takes effect there.
MAIN_CATS = {
    "Food": [
        "food", "fresh", "bakery", "bread", "pastry", "dessert", "snack", "sweets", "confection",
        "meat", "fish", "seafood", "sausage", "pork", "deli", "pasta", "noodle",
        "rice", "grain", "cereal", "breakfast", "sauce", "condiment", "spice", "season",
        "oil", "vinegar", "butter", "cheese", "yogurt", "dairy", "egg",
        "fruit", "veget", "produce", "mushroom", "frozen", "instant", "spread",
        "jam", "honey", "breadcrumb", "pizza", "meal", "prepared"
    ],
    "Beverages": [
        # "non alcoholic" is spelled with a space because hyphens are turned
        # into spaces before matching; the hyphenated form could never match.
        "drink", "beverage", "water", "mineral", "juice", "coffee", "tea",
        "beer", "wine", "alcohol", "alcoholic", "non alcoholic", "soft", "syrup"
    ],
    "Home & Household": [
        "home", "household", "clean", "laundry", "detergent", "soap", "tissue", "paper",
        "kitchen", "tableware", "container", "packaging", "bag", "waste",
        "garden", "plants", "decor", "decoration", "bedding", "furniture",
        "maintenance", "repair", "construction", "building", "hardware", "tool", "lighting",
        "appliance", "office supplies", "stationery"
    ],
    "Health & Personal Care": [
        "health", "pharma", "medicine", "medic", "medical", "vaccine", "test",
        "vitamin", "supplement", "ointment", "diet", "nutrit",
        "hygiene", "personal care", "cosmetic", "beauty", "fragrance", "perfume",
        "oral care", "dental", "shampoo", "sunscreen", "cream"
    ],
    "Services & Other": [
        "fashion", "apparel", "clothes", "clothing", "footwear", "shoes", "underwear", "hosiery", "accessor",
        "bag", "bags", "eyewear",
        "book", "magazine", "media", "music", "toy", "game", "trading card", "collectible",
        "electronic",
        "pet", "zvier", "krmivo", "granule",
        "auto", "fuel",
        # Lowercase so the "Unknown" fill value matches after lowercasing.
        "service", "ticket", "admission", "accommodation", "logistic", "shipping", "lottery", "tax", "recycling", "unknown"
    ],
}
def norm(s: str) -> str:
    """Normalise a category label for keyword matching.

    Lowercases the text, spells out '&' as ' and ', and replaces each run of
    '/', '-', '_' or ',' with a single space.

    Args:
        s: Raw label. ``None``, NaN and any other non-string value are treated
            as an empty label; NaN is truthy, so a plain ``s or ""`` guard
            would not catch it and ``.lower()`` would fail.

    Returns:
        The normalised text, or "" for missing/non-string input.
    """
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = s.replace("&", " and ")
    s = re.sub(r"[/\-_,]+", " ", s)
    return s
def map_main_category(raw: str) -> str:
    """Map a raw category label to one of the MAIN_CATS group names.

    The label is normalised with ``norm`` and checked against each group's
    keywords by substring, in MAIN_CATS order; the first group with any
    matching keyword wins. Labels that match nothing fall back to
    "Services & Other".
    """
    text = norm(raw)
    matching_groups = (
        group_name
        for group_name, keywords in MAIN_CATS.items()
        if any(keyword in text for keyword in keywords)
    )
    return next(matching_groups, "Services & Other")

# Derive the coarse main category for every row from its raw category label.
sales['main_category'] = sales['category'].apply(map_main_category)

In [10]:
# Sum of receipt_price per main category, largest first.
group = (
    sales
    .groupby('main_category')['receipt_price']
    .sum()
    .sort_values(ascending=False)
)
group

main_category
Food                      16295.35
Services & Other          10852.80
Home & Household           5733.55
Beverages                  4020.30
Health & Personal Care     2477.78
Name: receipt_price, dtype: float64

In [11]:
# Persist the cleaned frame to a local SQLite file, replacing any existing
# `sales` table. sqlite3 is already imported in the notebook's import cell,
# so it is not re-imported here.
conn = sqlite3.connect('sales.db')
try:
    sales.to_sql('sales', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails, so the database file is
    # not left open for the lifetime of the kernel.
    conn.close()

In [12]:
# Confirm the renamed and derived columns are all present after cleaning.
sales.columns

Index(['id', 'quantity', 'name', 'price', 'receipt_id', 'issue_date', 'org_id',
       'org_ico', 'org_dic', 'org_building_number', 'country', 'org_ic_dph',
       'org_municipality', 'org_postal_code', 'org_name', 'org_street_name',
       'unit_id', 'unit_building_number', 'unit_country', 'city',
       'unit_postal_code', 'unit_property_registration_number', 'street',
       'unit_name', 'product_name', 'product_name_en', 'brand', 'category',
       'quantity_value', 'quantity_unit', 'latitude', 'longitude', 'month',
       'year', 'month_year', 'weekday', 'receipt_price', 'is_luxury', 'is_bio',
       'is_eco', 'main_category'],
      dtype='object')

In [13]:
# Ten most frequent cities by number of rows.
sales['city'].value_counts().head(10)

city
Bratislava          6793
Šaštín-Stráže        519
Štefanov             381
Senica               249
Kúty                 182
Stupava               42
Košice                38
Slovenský Grob        31
Banská Štiavnica      27
Jelka                 23
Name: count, dtype: int64

In [14]:
# NOTE(review): repeats the earlier sales.columns check with identical output — candidate for removal.
sales.columns

Index(['id', 'quantity', 'name', 'price', 'receipt_id', 'issue_date', 'org_id',
       'org_ico', 'org_dic', 'org_building_number', 'country', 'org_ic_dph',
       'org_municipality', 'org_postal_code', 'org_name', 'org_street_name',
       'unit_id', 'unit_building_number', 'unit_country', 'city',
       'unit_postal_code', 'unit_property_registration_number', 'street',
       'unit_name', 'product_name', 'product_name_en', 'brand', 'category',
       'quantity_value', 'quantity_unit', 'latitude', 'longitude', 'month',
       'year', 'month_year', 'weekday', 'receipt_price', 'is_luxury', 'is_bio',
       'is_eco', 'main_category'],
      dtype='object')

In [15]:
# Spot-check a single item: rows whose raw name matches this exact text.
sales[sales['name'] == '5904862542003 OBUV CP40-SC2116']

Unnamed: 0,id,quantity,name,price,receipt_id,issue_date,org_id,org_ico,org_dic,org_building_number,country,org_ic_dph,org_municipality,org_postal_code,org_name,org_street_name,unit_id,unit_building_number,unit_country,city,unit_postal_code,unit_property_registration_number,street,unit_name,product_name,product_name_en,brand,category,quantity_value,quantity_unit,latitude,longitude,month,year,month_year,weekday,receipt_price,is_luxury,is_bio,is_eco,main_category
66,2153,1.0,5904862542003 OBUV CP40-SC2116,22.99,193,2023-06-27 17:37:23,24,46509500,2023414492,Unknown,Slovensko,SK2023414492,Bratislava - mestská časť Ružinov,0.0,CCC,Ivanská cesta,28,Unknown,Slovensko,Bratislava,82108.0,6,Metodova,Unknown,Unknown,Unknown,Unknown,Auto Care,0.0,Unknown,48.15702,17.129524,6,2023,2023-06,Tuesday,22.99,0,0,0,Services & Other


In [16]:
# Preview the first five rows of the cleaned frame.
sales.head(5)

Unnamed: 0,id,quantity,name,price,receipt_id,issue_date,org_id,org_ico,org_dic,org_building_number,country,org_ic_dph,org_municipality,org_postal_code,org_name,org_street_name,unit_id,unit_building_number,unit_country,city,unit_postal_code,unit_property_registration_number,street,unit_name,product_name,product_name_en,brand,category,quantity_value,quantity_unit,latitude,longitude,month,year,month_year,weekday,receipt_price,is_luxury,is_bio,is_eco,main_category
0,1212,1.0,4-71-12-0641-8 30.0,29.95,71,2022-01-18 16:12:53,28,35832932,2020215912,Unknown,Slovensko,SK2020215912,Bratislava - mestská časť Petržalka,0.0,Leder & Schuh,Einsteinova,33,Unknown,Slovensko,Bratislava,82109.0,16,Mlynské Nivy,Unknown,Unknown,Unknown,Unknown,Vitamins & Ointments,0.0,Unknown,48.145888,17.127227,1,2022,2022-01,Tuesday,29.95,0,0,0,Health & Personal Care
1,1211,1.0,4-71-12-0695-8 33.0,37.95,71,2022-01-18 16:12:53,28,35832932,2020215912,Unknown,Slovensko,SK2020215912,Bratislava - mestská časť Petržalka,0.0,Leder & Schuh,Einsteinova,33,Unknown,Slovensko,Bratislava,82109.0,16,Mlynské Nivy,Unknown,Unknown,Unknown,Unknown,Baking,0.0,Unknown,48.145888,17.127227,1,2022,2022-01,Tuesday,37.95,0,0,0,Services & Other
2,1213,1.0,5904248240332 OBUV CI12-JOY-03(III)CH,39.99,72,2022-03-24 16:06:02,24,46509500,2023414492,Unknown,Slovensko,SK2023414492,Bratislava - mestská časť Ružinov,0.0,CCC,Ivanská cesta,28,Unknown,Slovensko,Bratislava,82108.0,6,Metodova,Unknown,Obuv CI12-JOY-03(III)CH,Footwear CI12-JOY-03(III)CH,Unknown,Footwear,0.0,Unknown,48.15702,17.129524,3,2022,2022-03,Thursday,39.99,0,0,0,Services & Other
3,1214,1.0,5900949525822 PRÍSLUŠENSTVO,2.95,72,2022-03-24 16:06:02,24,46509500,2023414492,Unknown,Slovensko,SK2023414492,Bratislava - mestská časť Ružinov,0.0,CCC,Ivanská cesta,28,Unknown,Slovensko,Bratislava,82108.0,6,Metodova,Unknown,Accessory,Unknown,Unknown,Unknown,0.0,Unknown,48.15702,17.129524,3,2022,2022-03,Thursday,2.95,0,0,0,Services & Other
4,1148,1.0,5904248405014 OBUV CI12-HARRY-01,39.99,59,2022-05-06 18:21:47,24,46509500,2023414492,Unknown,Slovensko,SK2023414492,Bratislava - mestská časť Ružinov,0.0,CCC,Ivanská cesta,28,Unknown,Slovensko,Bratislava,82108.0,6,Metodova,Unknown,Obuv,Footwear,Unknown,Accessories,0.0,Unknown,48.15702,17.129524,5,2022,2022-05,Friday,39.99,0,0,0,Services & Other


In [None]:
# Five rows with the highest receipt_price.
# NOTE(review): the saved output below lists 10 rows although head(5) is requested — it looks stale; re-run the cell.
sales.sort_values(by='receipt_price', ascending=False).head(5)

Unnamed: 0,id,quantity,name,price,receipt_id,issue_date,org_id,org_ico,org_dic,org_building_number,country,org_ic_dph,org_municipality,org_postal_code,org_name,org_street_name,unit_id,unit_building_number,unit_country,city,unit_postal_code,unit_property_registration_number,street,unit_name,product_name,product_name_en,brand,category,quantity_value,quantity_unit,latitude,longitude,month,year,month_year,weekday,receipt_price,is_luxury,is_bio,is_eco,main_category
7966,9976,4.0,"Bet. zmes C16/20 XC1, Dmax4 poter - m3",108.24,1245,2025-09-22 07:54:13,277,35779403,2020207420,Unknown,Slovensko,SK2020207420,Malacky,0.0,BESTAV,Továrenská,367,Unknown,Slovensko,Šaštín-Stráže,90841.0,1004,Zápotočná,Unknown,Betónová Zmes,Concrete Mix,Unknown,Construction Material,1.0,m3,48.628408,17.157706,9,2025,2025-09,Monday,432.96,1,0,0,Home & Household
5388,9864,2.0,Vila Jánošík objekt/noc,200.0,1225,2025-02-27 09:06:12,188,165549,2021095670,Unknown,Slovensko,SK2021095670,Banská Bystrica,0.0,Fakultná nemocnica s poliklinikou F D Roosevelta Banská Bystrica štátna príspevková organizácia,Nám. L. Svobodu,240,Unknown,Slovensko,Partizánska Ľupča,3215.0,1,Železnô,Unknown,Ubytovanie za noc,Accommodation per night,Unknown,Accommodation,0.0,Unknown,48.95365,19.393494,2,2025,2025-02,Thursday,400.0,1,0,0,Services & Other
7213,9110,3.5,"Bet. zmes C16/20 XC1, Dmax4 poter - m3",108.24,1090,2025-07-28 11:20:54,277,35779403,2020207420,Unknown,Slovensko,SK2020207420,Malacky,0.0,BESTAV,Továrenská,367,Unknown,Slovensko,Šaštín-Stráže,90841.0,1004,Zápotočná,Unknown,Betónová zmes,Concrete Mix,Unknown,Construction Materials,1.0,m3,48.628408,17.157706,7,2025,2025-07,Monday,378.84,1,0,0,Home & Household
7579,8710,3.0,"Bet. zmes C16/20 XC1, Dmax4 poter - m3",108.24,1029,2025-08-26 07:55:18,277,35779403,2020207420,Unknown,Slovensko,SK2020207420,Malacky,0.0,BESTAV,Továrenská,367,Unknown,Slovensko,Šaštín-Stráže,90841.0,1004,Zápotočná,Unknown,"Betónová zmes C16/20 XC1, Dmax4 poter","Concrete Mix C16/20 XC1, Dmax4 screed",Unknown,Building Materials,1.0,m3,48.628408,17.157706,8,2025,2025-08,Tuesday,324.72,1,0,0,Home & Household
4369,6873,8.0,Team up dospeli 90 min,23.9,763,2024-07-10 17:43:05,223,50243411,2120248878,Unknown,Slovensko,SK2120248878,Bratislava - mestská časť Staré Mesto,0.0,Team Up,Na kopci,283,Unknown,Slovensko,Bratislava,82104.0,4B,Studená,Unknown,Dospelí Aktivita,Adults Activity,Team Up,Activity,90.0,min,48.183316,17.179002,7,2024,2024-07,Wednesday,191.2,0,0,0,Services & Other
4888,7032,2.0,Tasting Menu,95.0,783,2024-09-14 22:03:46,231,53883349,2121527496,4A,Slovensko,SK2121527496,Bratislava - mestská časť Staré Mesto,0.0,Irin,Podjavorinskej,294,Unknown,Slovensko,Bratislava,81101.0,2,Rudnayovo námestie,Unknown,Degustačné Menu,Tasting Menu,Unknown,home/maintenance,0.0,Unknown,48.142221,17.10576,9,2024,2024-09,Saturday,190.0,1,0,0,Home & Household
2453,5379,1.0,ŠATY FAIT TULLE,189.0,583,2024-02-07 16:33:04,173,45587850,2023065429,Unknown,Slovensko,SK2023065429,Šaľa,0.0,LA MODA,Diakovská cesta,215,Unknown,Slovensko,Bratislava,82109.0,16,Mlynské Luhy,Unknown,Šaty Tulle,Tulle Dress,Unknown,Clothing,0.0,Unknown,48.154254,17.179568,2,2024,2024-02,Wednesday,189.0,1,0,0,Services & Other
5185,7677,1.0,03 P0WYAX5 D-ŠATY,159.99,891,2025-02-06 18:59:01,6,35821272,2021596159,Unknown,Slovensko,SK2021596159,Bratislava - mestská časť Nové Mesto,0.0,Peek & Cloppenburg,Jarošova,8,Unknown,Slovensko,Bratislava,81109.0,8,PRIBINOVA,Unknown,Šaty,Dress,Unknown,Clothing,0.0,Unknown,48.140201,17.12305,2,2025,2025-02,Thursday,159.99,1,0,0,Services & Other
7623,7543,16.0,"Dzintonik 0,2l",8.5,878,2025-08-27 22:05:48,59,35849592,2020239463,Unknown,Slovensko,SK2020239463,Bratislava - mestská časť Ružinov,0.0,Medusa Restaurants,Súťažná,325,Unknown,Slovensko,Bratislava,82108.0,64,Krížna,Unknown,Tonic,Tonic Water,Unknown,Beverage,0.2,l,48.15789,17.128291,8,2025,2025-08,Wednesday,136.0,0,0,0,Beverages
823,6091,8.0,Ubytovanie turistické (deti od 6 r. a dospelí)/noc,17.0,666,2023-09-16 10:29:56,188,165549,2021095670,Unknown,Slovensko,SK2021095670,Banská Bystrica,0.0,Fakultná nemocnica s poliklinikou F D Roosevelta Banská Bystrica štátna príspevková organizácia,Nám. L. Svobodu,240,Unknown,Slovensko,Partizánska Ľupča,3215.0,1,Železnô,Unknown,Ubytovanie turistické,Tourist Accommodation,Unknown,Accommodation,0.0,Unknown,48.95365,19.393494,9,2023,2023-09,Saturday,136.0,0,0,0,Services & Other


In [None]:
# FIXME: unfinished cell — '' is not among the columns of `sales`, so this raises KeyError and breaks "Restart & Run All". Fill in the intended column name or remove the cell.
sales['']