In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import sqlite3
# Lift pandas' display limits (column count, row count, line width, cell width)
# so frames and Series are rendered in full rather than truncated.
for _display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [2]:
# Load the raw sales data; the path is resolved relative to the kernel's
# current working directory.
sales_csv_path = 'data/sales.csv'
sales = pd.read_csv(sales_csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data/sales.csv'

In [None]:
# Show the column labels of the freshly loaded frame (before any renaming).
sales.columns

Index(['Unnamed: 0', 'id', 'quantity', 'name', 'price', 'fs_receipt_id',
       'fs_receipt_issue_date', 'org_id', 'org_ico', 'org_dic',
       'org_building_number', 'org_country', 'org_ic_dph', 'org_municipality',
       'org_postal_code', 'org_name', 'org_street_name', 'unit_id',
       'unit_building_number', 'unit_country', 'unit_municipality',
       'unit_postal_code', 'unit_property_registration_number',
       'unit_street_name', 'unit_name', 'ai_name_without_brand_and_quantity',
       'ai_name_in_english_without_brand_and_quantity', 'ai_brand',
       'ai_category', 'ai_quantity_value', 'ai_quantity_unit', 'unit_latitude',
       'unit_longitude'],
      dtype='object')

In [None]:
# Count missing values per column to see which fields need backfilling.
sales.isna().sum()

Unnamed: 0                                          0
id                                                  0
quantity                                            0
name                                                0
price                                               0
fs_receipt_id                                       0
fs_receipt_issue_date                               0
org_id                                              0
org_ico                                             0
org_dic                                             0
org_building_number                              6857
org_country                                         0
org_ic_dph                                         71
org_municipality                                    0
org_postal_code                                  8434
org_name                                            0
org_street_name                                     0
unit_id                                             0
unit_building_number        

In [None]:
# Company legal-form abbreviations as standalone tokens, tolerating optional
# dots and spaces between the letters: "s.r.o.", "a.s.", "v.o.s.", "spol. s r.o.".
_LEGAL_FORM_RE = re.compile(
    r'\b(s\s*\.?\s*r\s*\.?\s*o\.?|a\s*\.?\s*s\.?|v\s*\.?\s*o\s*\.?\s*s\.?|spol\s*\.?\s*s\s*r\s*\.?\s*o\.?)\b',
    flags=re.IGNORECASE,
)

# Country designations as standalone tokens. "Slovenská" is matched both in its
# correct spelling (\u00e1 = "á") and in the mojibake form produced when its
# UTF-8 bytes are decoded as Latin-1 (\u00c3\u00a1), so names are cleaned
# regardless of how the source text was decoded.
_COUNTRY_RE = re.compile(
    r'\b(Slovensk(?:\u00e1|\u00c3\u00a1) republika|Slovensko|SR|Slovakia|SK)\b',
    flags=re.IGNORECASE,
)


def clean_org_name(name: str) -> str:
    """Normalise an organisation name for grouping.

    Steps, in order:
      1. Missing values (``None`` / ``NaN``) are returned unchanged.
      2. Legal-form abbreviations and country designations are removed.
      3. ``.``, ``,`` and ``/`` become spaces; runs of whitespace collapse to
         one space; leading/trailing spaces, ``-`` and ``_`` are stripped.
      4. Any name that still contains "COOP" (case-insensitive) is mapped to
         the single label ``'COOP Jednota'``.

    Args:
        name: Raw organisation name, or a missing value.

    Returns:
        The cleaned name, or the original missing value.
    """
    if pd.isna(name):
        return name

    name = _LEGAL_FORM_RE.sub('', name)
    name = _COUNTRY_RE.sub('', name)

    # Punctuation left behind by the removals above (e.g. the trailing dot of
    # "s.r.o.") is turned into whitespace and then squeezed out.
    name = re.sub(r'[.,/]', ' ', name)
    name = re.sub(r'\s{2,}', ' ', name)
    name = name.strip(" -_")

    # Collapse every COOP variant into one label.
    if 'COOP' in name.upper():
        name = 'COOP Jednota'
    return name

In [None]:
# Unit price above which a line item is flagged as "luxury"
# (expressed in the same units as the `price` column).
LUXURY_PRICE_THRESHOLD = 50

# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
sales = sales.drop(columns='Unnamed: 0', errors='ignore')

# Shorten the source column names. Keys that are no longer present (e.g. on a
# re-run) are silently skipped by `rename`.
sales = sales.rename(columns={
    'fs_receipt_id': 'receipt_id',
    'fs_receipt_issue_date': 'issue_date',
    'org_country': 'country',
    'unit_municipality': 'city',
    'unit_street_name': 'street',
    'ai_name_without_brand_and_quantity': 'product_name',
    'ai_name_in_english_without_brand_and_quantity': 'product_name_en',
    'ai_brand': 'brand',
    'ai_category': 'category',
    'ai_quantity_value': 'quantity_value',
    'ai_quantity_unit': 'quantity_unit',
    'unit_latitude': 'latitude',
    'unit_longitude': 'longitude',
})

# Backfill missing unit-level city/street from the organisation-level fields,
# keeping only the text before the first '-'.
sales['city'] = sales['city'].fillna(
    sales['org_municipality'].str.split('-').str[0].str.strip()
)
sales['street'] = sales['street'].fillna(
    sales['org_street_name'].str.split('-').str[0].str.strip()
)

# Parse timestamps (unparseable values become NaT) and drop timezone info so
# the column is tz-naive.
sales['issue_date'] = pd.to_datetime(sales['issue_date'], errors='coerce').dt.tz_localize(None)

# Calendar features derived from the issue date.
sales['month'] = sales['issue_date'].dt.month
sales['year'] = sales['issue_date'].dt.year
sales['month_year'] = sales['issue_date'].dt.to_period('M').astype(str)
sales['weekday'] = sales['issue_date'].dt.day_name()

# Line total: unit price times quantity.
sales['receipt_price'] = sales['price'] * sales['quantity']

# 0/1 indicator columns.
# NOTE(review): 'bio' / 'eco' are case-insensitive substring matches on the raw
# item name, so they also fire inside longer words - confirm this is intended.
sales['is_luxury'] = np.where(sales['price'] > LUXURY_PRICE_THRESHOLD, 1, 0)
sales['is_bio'] = np.where(sales['name'].str.contains('bio', na=False, case=False), 1, 0)
sales['is_eco'] = np.where(sales['name'].str.contains('eco', na=False, case=False), 1, 0)

# Normalise organisation names (legal-form / country suffixes, COOP variants).
sales['org_name'] = sales['org_name'].apply(clean_org_name)


In [None]:
# Replace remaining missing values: text (object) columns get the placeholder
# "Unknown", numeric columns get 0. Object columns are processed first, then
# the numeric selection is computed on the updated frame.
for dtype_selector, placeholder in ((['object'], "Unknown"), (['number'], 0)):
    for column_name in sales.select_dtypes(include=dtype_selector).columns:
        sales[column_name] = sales[column_name].fillna(placeholder)

In [None]:
# Persist the prepared frame to a local SQLite database, replacing any existing
# 'sales' table. `sqlite3` is already imported in the notebook's import cell.
# try/finally guarantees the connection is closed even if the write fails
# (a sqlite3 connection used as a context manager only manages the
# transaction; it does not close the connection).
conn = sqlite3.connect('sales.db')
try:
    sales.to_sql('sales', conn, if_exists='replace', index=False)
finally:
    conn.close()