In [18]:
pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10
Note: you may need to restart the kernel to use updated packages.


In [22]:
import os
import random

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

def create_mock_data(num_rows=1000, start_date='2025-02-14', end_date='2025-02-19', seed=42):
    """
    Generate a reproducible DataFrame of mock data within a date range.

    Parameters
    ----------
    num_rows : int
        Number of rows to generate.
    start_date, end_date : str or datetime-like
        Bounds for ``created_on``, parsed with ``pd.Timestamp`` and truncated
        to whole seconds. Both bounds are inclusive.
    seed : int or None
        Seed for the function's private random generators. The default (42)
        yields the same data as earlier versions that seeded the global
        generators with 42. ``None`` gives non-reproducible data.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``created_on`` with a fresh 0..n-1 index and columns:
          - id: Sequential identifier (1..num_rows, assigned before sorting).
          - created_on: Random timestamps within the specified range.
          - some_numeric: A float column (standard-normal draws).
          - name: A simple text column ("Name_<id>").
          - col_mixed: Mostly integers; about 20% of rows contain the string "error".
          - col_list: With 50% probability, a string representation of a list;
            otherwise, the plain string "single_value".

    Raises
    ------
    ValueError
        If ``num_rows`` is negative or ``end_date`` is earlier than ``start_date``.
    """
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")

    # Work at one-second resolution: Timestamp.value is nanoseconds since the epoch.
    first_second = pd.Timestamp(start_date).value // 10**9
    last_second = pd.Timestamp(end_date).value // 10**9
    if last_second < first_second:
        raise ValueError(
            f"end_date ({end_date!r}) must not be earlier than start_date ({start_date!r})"
        )

    # Private generators instead of random.seed()/np.random.seed(): the caller's
    # global RNG state is left untouched, and for a given seed the streams are
    # the same ones the global generators would produce.
    py_rng = random.Random(seed)
    np_rng = np.random.RandomState(seed)

    # Upper bound of randint is exclusive, hence the +1 to make end_date inclusive.
    random_seconds = np_rng.randint(first_second, last_second + 1, size=num_rows)
    created_on = pd.to_datetime(random_seconds, unit='s')
    ids = np.arange(1, num_rows + 1)

    def random_mixed_value():
        # The probability draw happens first, then (80% of the time) the integer draw.
        return py_rng.randint(1, 100) if py_rng.random() < 0.8 else "error"
    col_mixed = [random_mixed_value() for _ in range(num_rows)]

    def random_list_value():
        # Lists are deliberately stored as their string representation.
        return str([py_rng.randint(1, 10), py_rng.randint(1, 10)]) if py_rng.random() < 0.5 else "single_value"
    col_list = [random_list_value() for _ in range(num_rows)]

    df = pd.DataFrame({
        "id": ids,
        "created_on": created_on,
        "some_numeric": np_rng.randn(num_rows),
        "name": [f"Name_{i}" for i in ids],
        "col_mixed": col_mixed,
        "col_list": col_list
    })
    # Chronological order with a clean index; ids therefore appear shuffled.
    return df.sort_values("created_on").reset_index(drop=True)
    

In [23]:
# Build a small ten-row sample covering 25-27 Feb 2025.
df = create_mock_data(
    num_rows=10,
    start_date='2025-02-25',
    end_date='2025-02-27',
)

# Print the schema summary, then render the rows as the cell's output.
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            10 non-null     int64         
 1   created_on    10 non-null     datetime64[ns]
 2   some_numeric  10 non-null     float64       
 3   name          10 non-null     object        
 4   col_mixed     10 non-null     object        
 5   col_list      10 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 612.0+ bytes


Unnamed: 0,id,created_on,some_numeric,name,col_mixed,col_list
0,7,2025-02-25 15:14:46,-2.612549,Name_7,12,"[2, 7]"
1,10,2025-02-26 00:18:18,-1.523876,Name_10,84,single_value
2,4,2025-02-26 04:48:14,-0.52517,Name_4,95,single_value
3,6,2025-02-26 06:37:48,-0.924083,Name_6,55,single_value
4,5,2025-02-26 09:17:59,-0.57138,Name_5,error,"[3, 4]"
5,1,2025-02-26 09:52:38,0.279041,Name_1,4,single_value
6,3,2025-02-26 12:38:52,-0.580878,Name_3,95,"[1, 3]"
7,8,2025-02-26 14:08:57,0.95037,Name_8,65,"[6, 10]"
8,2,2025-02-26 16:47:47,1.010515,Name_2,32,"[8, 10]"
9,9,2025-02-26 22:44:26,0.816445,Name_9,72,"[1, 8]"


In [27]:
# Step 2: Write mock data to PostgreSQL.
# The connection URL can be supplied through the MOCKDATA_DB_URL environment
# variable so that real credentials never have to be committed to the notebook.
# When it is unset, the original local test database URL is used.
connection_str = os.environ.get(
    "MOCKDATA_DB_URL",
    "postgresql+psycopg2://testuser:testpassword@postgres:5432/testdb",
)
engine = create_engine(connection_str)
table_name = 'mockdata'
# if_exists="replace" drops and recreates the table on every run; the
# DataFrame index is not written as a column.
df.to_sql(table_name, engine, if_exists="replace", index=False)

10

In [32]:
# Read the table back from PostgreSQL and persist it as a Parquet file.
# The table name comes from `table_name` (set in the write cell) so the read
# always targets the table that was just written. It is double-quoted so it is
# treated as an identifier exactly as given.
query = f'SELECT * FROM "{table_name}"'

df2 = pd.read_sql_query(query, engine)
df2.to_parquet('test.parquet')

In [33]:
# Reload the Parquet file written by the previous cell.
df3 = pd.read_parquet('test.parquet')

# Schema summary first, then the rows, for comparison with the original frame.
df3.info()
df3.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            10 non-null     int64         
 1   created_on    10 non-null     datetime64[ns]
 2   some_numeric  10 non-null     float64       
 3   name          10 non-null     object        
 4   col_mixed     10 non-null     object        
 5   col_list      10 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 612.0+ bytes


Unnamed: 0,id,created_on,some_numeric,name,col_mixed,col_list
0,7,2025-02-25 15:14:46,-2.612549,Name_7,12,"[2, 7]"
1,10,2025-02-26 00:18:18,-1.523876,Name_10,84,single_value
2,4,2025-02-26 04:48:14,-0.52517,Name_4,95,single_value
3,6,2025-02-26 06:37:48,-0.924083,Name_6,55,single_value
4,5,2025-02-26 09:17:59,-0.57138,Name_5,error,"[3, 4]"
5,1,2025-02-26 09:52:38,0.279041,Name_1,4,single_value
6,3,2025-02-26 12:38:52,-0.580878,Name_3,95,"[1, 3]"
7,8,2025-02-26 14:08:57,0.95037,Name_8,65,"[6, 10]"
8,2,2025-02-26 16:47:47,1.010515,Name_2,32,"[8, 10]"
9,9,2025-02-26 22:44:26,0.816445,Name_9,72,"[1, 8]"
