In [3]:
import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy
import random
from datetime import datetime, timedelta
import os

Create a synthetic sales dataset

In [4]:
# Catalogue of products and regions that each synthetic sale is drawn from
products = [
    'Product A',
    'Product B',
    'Product C',
    'Product D',
    'Product E',
]
regions = [
    'Region 1',
    'Region 2',
    'Region 3',
    'Region 4',
    'Region 5',
]

# Seed the stdlib RNG so every run produces the same synthetic rows
random.seed(123)

# Sale dates fall anywhere in calendar year 2022, both endpoints included
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)
max_day_offset = (end_date - start_date).days

# Build 1000 [product, region, sale date, sales amount] rows.
# List elements are evaluated left to right, so the RNG is consumed in the
# order product -> region -> day offset -> amount for each row.
sales_data = [
    [
        random.choice(products),
        random.choice(regions),
        start_date + timedelta(days=random.randint(0, max_day_offset)),
        np.round(random.uniform(1000, 10000), 2),
    ]
    for _ in range(1000)
]

# Wrap the rows in a DataFrame with human-readable column names
columns = ['Product', 'Region', 'Sale Date', 'Sales']
df = pd.DataFrame(sales_data, columns=columns)

# Persist a CSV copy (without the index) in the working directory
df.to_csv('sales_data.csv', index=False)


In [5]:
# Target SQLite database file and the table that receives the sales rows
database = 'data/sales.db'
table = 'sales_data'

# Make sure the directory holding the database file exists. It is derived
# from `database` so the two cannot drift apart, and exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
db_dir = os.path.dirname(database)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)

# Connect to the database
conn = sqlite3.connect(database)
try:
    # if_exists='append' creates the table when it is missing and appends to
    # it otherwise, so no explicit sqlite_master lookup (and no string-built
    # SQL) is needed.
    # NOTE: every execution of this cell appends another copy of `df`, so
    # re-running the notebook against an existing database duplicates rows.
    df.to_sql(table, conn, index=False, if_exists='append')
finally:
    # Always release the connection, even if the write fails
    conn.close()


In [6]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [7]:
%sql sqlite:///data/sales.db

'Connected: @data/sales.db'

In [8]:
%%sql 

select * from sales_data limit 10

 * sqlite:///data/sales.db
Done.


Product,Region,Sale Date,Sales
Product A,Region 3,2022-02-14 00:00:00,7920.61
Product C,Region 1,2022-01-20 00:00:00,4412.12
Product E,Region 3,2022-06-24 00:00:00,8668.78
Product B,Region 2,2022-06-22 00:00:00,6048.66
Product B,Region 2,2022-01-01 00:00:00,9153.15
Product A,Region 5,2022-07-13 00:00:00,1628.89
Product C,Region 4,2022-02-22 00:00:00,9148.59
Product A,Region 2,2022-03-06 00:00:00,8110.65
Product A,Region 3,2022-08-09 00:00:00,6160.67
Product C,Region 4,2022-01-19 00:00:00,7919.15


In [10]:
%%sql
select product, sum(sales)
from sales_data
where region = 'Region 1'
group by product

 * sqlite:///data/sales.db
Done.


Product,sum(sales)
Product A,220938.53
Product B,230473.58
Product C,260562.74000000005
Product D,217392.97
Product E,231932.2200000001
