In [1]:
import pandas as pd
import psycopg2
import os, io
from dotenv import load_dotenv    

In [2]:
query_products = """
SELECT id AS variant_id,
       catalog_category_id AS category_id,
       catalog_brand_id AS brand_id,
       status,
       is_new,
       reviews_count,
       reviews_average_score_price,
       reviews_average_score_quality,
       reviews_average_score_properties,
       reviews_average_score_overall,
       in_sets_count,
       is_in_stock,
       is_returnable,
       purchase_price,
       eshop_stock_count,
       country_of_origin_code,
       name
       
       FROM catalog_products
"""

In [3]:
load_dotenv()

True

In [4]:
def create_db_conn():
    try:
        conn = psycopg2.connect(host=os.getenv('DB_HOST'), database=os.getenv('DB_NAME'),
                                             user=os.getenv('DB_USER'), password=os.getenv('DB_PASSWORD'),
                                         port=int(os.getenv('DB_PORT')))
        return conn
    except psycopg2.DatabaseError as e:
        print(f'database connection {e}')
        return None
    except Exception as e:
        print(f'unknown error {e}')
        return None


def read_sql_iostream(query: str, block_mergejoin=False, block_hashjoin=False, block_seqscan=False) -> pd.DataFrame:
    """
    More effective way of loading content of database table to dataframe using io stream - StringIO.
    :param str query: Query select for accessing data in table.
    :param con: Connection to concrete database.
    :return pd.Dataframe: Output dataframe loaded from database.
    """
    try:
        con = create_db_conn()
        cur = con.cursor()
        copy_sql = f"COPY ({query.strip().rstrip(';')}) TO STDOUT WITH CSV HEADER"
        store = io.StringIO()
        cur.copy_expert(copy_sql, store)
        store.seek(0)
        df = pd.read_csv(store, na_values=['NULL', 'NaN', 'nan', 'null', ''], keep_default_na=False)
    except Exception as e:
        raise e
    finally:
        try:
            cur.close()
            con.close()
        except Exception as e:
            print(f'error- {e}')
            pass
    return df

In [5]:
# Check whether the data folder exists or not
if not os.path.exists('./data'):
    os.makedirs('./data')

In [6]:
def offset_query(limit, value_offset):
    return  f"""
    SELECT
        so.price_without_vat AS order_price_without_vat,
        so.price_with_vat AS order_price_with_vat,
        so.bill_country,
        so.setting_currency_id,
        so.created_at,
        so.shop_basket_id,
        so.doc_date,
        so.exchange_currency_rate,
        so.source_type AS source,
        so.canceled_date,

        sc.code AS currency_code,
        sc.currency_symbol,
        sc.price_round_system,


        sb.total_price_before_discount_with_vat AS basket_total_price_before_discount_with_vat,
        sb.total_price_with_vat AS basket_total_price_with_vat,
        sb.count_basket_items,
        sb.count_products AS basket_count_products,
        sb.basket_type,

        sbi.quantity AS item_quantity,
        sbi.item_type,
        sbi.unit_price_with_vat AS item_unit_price_with_vat,
        sbi.unit_price_without_vat AS item_unit_price_without_vat,
        sbi.total_discount_with_vat AS item_total_discount_with_vat,


        cp.id as product_id,
        cp.code AS product_code,
        cp.catalog_category_id,
        cp.catalog_brand_id,
        cp.name AS product_name,
        cp.status AS product_status,
        cp.reviews_count,
        cp.reviews_average_score_price,
        cp.reviews_average_score_quality,
        cp.reviews_average_score_properties,
        cp.reviews_average_score_overall,
        cp.reviews_average_score,
        cp.is_in_stock,
        cp.is_ended,
        cp.is_new,
        cp.is_boosted,
        cp.purchase_price AS product_purchase_price,
        cp.eshop_stock_count,
        cp.is_fifo,
        cp.name_parameterize AS product_name_parameterize,

        cc.name AS category,
        cc.tree_path,
        cc.name_parameterize AS category_name_parameterized,
        cc.status AS category_status,
        cc.catalog_segment_id,
        cc.ancestor_ids AS categories_ancestor_ids,
        cc.descendant_ids AS categories_descendant_ids,
        cc.full_name_path AS category_full_name_path,
        cc.default_warranty_period,

        cb.name AS brand_name,
        cb.name_parameterize AS brand_parameterized,

        cs.name AS segment_name,
        cs.name_parameterize AS segment_parameterezied,
        cs.status AS segment_status


    FROM shop_orders so
    LEFT JOIN setting_currencies sc ON so.setting_currency_id = sc.id
    INNER JOIN shop_baskets sb ON sb.id = so.shop_basket_id
    LEFT JOIN shop_basket_items sbi ON  sb.id = sbi.shop_basket_id
    INNER JOIN catalog_products cp ON cp.id = sbi.catalog_product_id
    LEFT JOIN catalog_categories cc ON cp.catalog_category_id = cc.id
    LEFT JOIN catalog_brands cb ON cp.catalog_brand_id = cb.id
    LEFT JOIN catalog_segments cs ON cs.id = cp.catalog_segment_id
    LIMIT {limit}
    OFFSET {value_offset}
    """

In [7]:
## Approximately 3,650 mil rows, if done differently it crashes pandas
data_0 = (read_sql_iostream(offset_query(500000, 500000*0)))

In [8]:
data_1 = (read_sql_iostream(offset_query(500000, 500000*1)))

In [9]:
data_2 = (read_sql_iostream(offset_query(500000, 500000*2)))

In [10]:
data_3 = (read_sql_iostream(offset_query(500000, 500000*3)))

In [11]:
data_4 = (read_sql_iostream(offset_query(500000, 500000*4)))

In [12]:
data_5 = (read_sql_iostream(offset_query(500000, 500000*5)))

In [13]:
data_6 = (read_sql_iostream(offset_query(500000, 500000*6)))

In [14]:
data_7 = (read_sql_iostream(offset_query(500000, 500000*7)))

In [15]:
products = read_sql_iostream(query_products)
products

Unnamed: 0,variant_id,category_id,brand_id,status,is_new,reviews_count,reviews_average_score_price,reviews_average_score_quality,reviews_average_score_properties,reviews_average_score_overall,in_sets_count,is_in_stock,is_returnable,purchase_price,eshop_stock_count,country_of_origin_code,name
0,353734,3495.0,7808.0,draft,f,0,0.0,0.0,0.0,0.0,0,t,f,65.33333,0.0,GB,Stereoizer
1,241263,1203.0,1390.0,active,f,0,0.0,0.0,0.0,0.0,0,f,t,293.97000,0.0,UA,Sport Pro Leather Jacket Black/White 48
2,25632,332.0,32.0,ended,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,JP,WJ336300
3,25633,196.0,107.0,ended,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,,Vintage Gold
4,311334,,2.0,draft,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,,(B-Stock) #947558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352486,75502,1131.0,872.0,ended,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,VN,Boys Therma Top Hz University Red/White S
352487,78948,1131.0,921.0,ended,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,CN,Dwayne Tour Insula Mens Sweater Kings Blue 3XL
352488,81765,1131.0,866.0,ended,f,0,0.0,0.0,0.0,0.0,0,f,t,,0.0,CN,1/4 Zip Blocked Mens Sweater Caviar XL
352489,162843,60.0,616.0,archived,f,0,0.0,0.0,0.0,0.0,0,f,t,17.08000,0.0,CN,Soprano Ukulele Set (B-Stock) #923506


In [16]:
products.to_csv('data/products.csv', index=False)

In [17]:
full_orders = pd.concat([data_0, data_1])
full_orders = pd.concat([full_orders, data_2])
full_orders = pd.concat([full_orders, data_3])
full_orders = pd.concat([full_orders, data_4])
full_orders = pd.concat([full_orders, data_5])
full_orders = pd.concat([full_orders, data_6])
full_orders = pd.concat([full_orders, data_7])

In [18]:
full_orders.to_csv('data/data.csv', index=False)

In [19]:
full_orders

Unnamed: 0,order_price_without_vat,order_price_with_vat,bill_country,setting_currency_id,created_at,shop_basket_id,doc_date,exchange_currency_rate,source,canceled_date,...,catalog_segment_id,categories_ancestor_ids,categories_descendant_ids,category_full_name_path,default_warranty_period,brand_name,brand_parameterized,segment_name,segment_parameterezied,segment_status
0,562.29,674.75,BG,1,2020-04-26 19:25:20.842503,1136409,2020-04-26,1.9558,eshop,,...,1.0,"{3,4,178}",{},"{Music,Guitars,Capos,""Capo for acoustic guitar""}",24.0,Musedo,musedo,Music,music,active
1,562.29,674.75,BG,1,2020-04-26 19:25:20.842503,1136409,2020-04-26,1.9558,eshop,,...,1.0,"{3,607,760,761}",{},"{Music,""Studio / PA"",Cables,""Complete Cables"",...",24.0,Lewitz,lewitz,Music,music,active
2,562.29,674.75,BG,1,2020-04-26 19:25:20.842503,1136409,2020-04-26,1.9558,eshop,,...,1.0,"{3,4,151}",{},"{Music,Guitars,""Guitar Picks"",""Medium Picks""}",24.0,Fender,fender,Music,music,active
3,562.29,674.75,BG,1,2020-04-26 19:25:20.842503,1136409,2020-04-26,1.9558,eshop,,...,1.0,"{3,4}",{},"{Music,Guitars,""Guitar Foot Rest""}",24.0,GEWA,gewa,Music,music,active
4,562.29,674.75,BG,1,2020-04-26 19:25:20.842503,1136409,2020-04-26,1.9558,eshop,,...,1.0,"{3,4,151}",{},"{Music,Guitars,""Guitar Picks"",""Light Picks""}",24.0,Boss,boss,Music,music,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161842,12958.68,15680.00,CZ,4,2019-11-25 11:43:38.696209,680206,2019-11-25,26.4680,eshop,,...,1.0,"{3,283}",{},"{Music,Keys,""Digital Pianos""}",24.0,Kurzweil,kurzweil,Music,music,active
161843,12958.68,15680.00,CZ,4,2019-11-25 11:43:38.696209,680206,2019-11-25,26.4680,eshop,,...,13.0,"{1,2014}",{},"{""Muziker Merch"",""Merch - Bags""}",24.0,Muziker,muziker,General,general,inactive
161844,227.73,271.00,DE,6,2019-11-25 12:11:09.965309,681074,2019-11-25,1.0000,eshop,,...,3.0,"{1182,1262}",{},"{Bikes,Skateboarding,Hoverboards}",24.0,Xiaomi,xiaomi,Bike,bike,active
161845,288.43,349.00,BE,6,2019-11-25 12:11:37.940471,681081,2019-11-25,1.0000,eshop,,...,3.0,"{1182,1189}",{},"{Bikes,Scooters,""Electric Scooters""}",24.0,MegaWheels,megawheels,Bike,bike,active
