In [9]:
import sqlite3
import random
import json
import pandas as pd
import numpy as np

### Simulate Source 1: SQLite Database with Customers (company_data.db)

In [10]:
# Candidate first and last names for each country, one (country, firsts, lasts) row apiece.
_NAME_POOLS = [
    ("USA",
     ["James", "Emily", "Michael", "Sarah", "David", "Ashley", "Robert", "Jessica", "William", "Olivia"],
     ["Smith", "Johnson", "Brown", "Williams", "Jones", "Miller", "Davis", "Garcia", "Taylor", "Anderson"]),
    ("England",
     ["Oliver", "Amelia", "Harry", "Isla", "George", "Sophia", "Jack", "Charlotte", "Charlie", "Ella"],
     ["Wilson", "Thompson", "Evans", "Roberts", "Walker", "White", "Lewis", "Hall", "Allen", "Young"]),
    ("Sri Lanka",
     ["Kasun", "Nadeesha", "Sanduni", "Isuru", "Chathura", "Hansika", "Tharindu", "Kavindi", "Supun", "Rashmi"],
     ["Perera", "Fernando", "Silva", "Wijesinghe", "Jayasinghe", "Ekanayake", "Rathnayake", "Gunasekara", "Wickramasinghe", "Bandara"]),
    ("Australia",
     ["Liam", "Chloe", "Noah", "Mia", "Ethan", "Zoe", "Lucas", "Grace", "Jack", "Sophie"],
     ["Smith", "Jones", "Williams", "Taylor", "Brown", "Wilson", "Johnson", "Martin", "Lee", "Walker"]),
    ("South Africa",
     ["Thabo", "Naledi", "Sipho", "Lerato", "Kagiso", "Palesa", "Sibusiso", "Ayanda", "Karabo", "Boitumelo"],
     ["Nkosi", "Naidoo", "Petersen", "Botha", "Mthembu", "Mokoena", "Van Wyk", "Jansen", "De Villiers", "Khumalo"]),
    ("Germany",
     ["Lukas", "Mia", "Leon", "Hannah", "Jonas", "Lea", "Paul", "Emma", "Felix", "Sophie"],
     ["Müller", "Schmidt", "Schneider", "Fischer", "Weber", "Meyer", "Wagner", "Becker", "Hoffmann", "Koch"]),
    ("France",
     ["Lucas", "Emma", "Hugo", "Chloé", "Louis", "Camille", "Gabriel", "Manon", "Nathan", "Léa"],
     ["Martin", "Bernard", "Dubois", "Thomas", "Robert", "Richard", "Petit", "Durand", "Leroy", "Moreau"]),
    ("Italy",
     ["Luca", "Giulia", "Marco", "Francesca", "Matteo", "Chiara", "Alessandro", "Sara", "Davide", "Martina"],
     ["Rossi", "Russo", "Ferrari", "Esposito", "Bianchi", "Romano", "Colombo", "Ricci", "Marino", "Greco"]),
]

# Key the rows by country so callers can read data[country]["first"] and
# data[country]["last"]; country order follows the table above.
data = {
    country: {"first": firsts, "last": lasts}
    for country, firsts, lasts in _NAME_POOLS
}

In [11]:
# Build 120 synthetic customers with sequential ids 45000-45119.
# A dedicated, seeded generator keeps the customer table identical on every
# Restart & Run All instead of changing with each execution, and leaves the
# global `random` state untouched for other cells.
customer_rng = random.Random(42)

customer_data = []
countries = list(data.keys())
for i in range(120):
    # Pick the country first so the names are drawn from that country's pool.
    country = customer_rng.choice(countries)
    customer_data.append({
        "CustId": 45000 + i,
        "FirstName": customer_rng.choice(data[country]["first"]),
        "LastName": customer_rng.choice(data[country]["last"]),
        "Age": customer_rng.randint(18, 40),  # randint is inclusive on both ends
        "Country": country,
    })

df = pd.DataFrame(customer_data)
df

Unnamed: 0,CustId,FirstName,LastName,Age,Country
0,45000,Chathura,Rathnayake,24,Sri Lanka
1,45001,Zoe,Lee,37,Australia
2,45002,Léa,Martin,22,France
3,45003,David,Anderson,36,USA
4,45004,Jessica,Miller,26,USA
...,...,...,...,...,...
115,45115,Naledi,Van Wyk,32,South Africa
116,45116,Mia,Brown,26,Australia
117,45117,Emily,Jones,28,USA
118,45118,Chloé,Petit,28,France


In [12]:
# DDL for the "customers" table; IF NOT EXISTS makes re-running this cell harmless.
# NOTE(review): SQLite table names are case-insensitive and the following cell
# writes the DataFrame to "Customers" with if_exists="replace", so this schema
# is presumably discarded there - confirm it is still needed.
create_customers_sql = '''
CREATE TABLE IF NOT EXISTS customers (
    customer_id INTEGER PRIMARY KEY,
    first_name TEXT,
    last_name TEXT,
    age INTEGER,
    country TEXT
)
'''

# Open the SQLite file (created on first connect) and keep a cursor for later cells.
conn = sqlite3.connect('company_data.db')
cursor = conn.cursor()

# Apply the DDL and persist it.
cursor.execute(create_customers_sql)
conn.commit()

In [13]:
# Save the customer DataFrame as the "Customers" table, replacing any earlier copy.
# SQLite treats table names case-insensitively, so the lowercase `customers`
# table created in the previous cell occupies the same name. Drop it explicitly
# first so the table is always rebuilt from the DataFrame's own columns
# (CustId, FirstName, ...) and the write does not depend on how pandas detects
# an existing table whose name differs only in case.
conn.execute("DROP TABLE IF EXISTS Customers")
df.to_sql("Customers", conn, if_exists="replace", index=False)
conn.commit()

In [14]:
# Close the SQLite connection opened on company_data.db. The preceding cells
# commit after each write, so nothing is pending when the connection is released.
conn.close()

### Simulate Source 2: CSV File with Transactions

In [15]:
import pandas as pd
import numpy as np

# Parameters
customer_ids = range(45000, 45120)
total_transactions = 300
PRODUCTS = ['Laptop', 'Phone', 'Headphones', 'Camera', 'Tablet']

# Fixed seed so the generated CSV is identical on every Restart & Run All.
transactions_rng = np.random.default_rng(42)


def make_transactions(ids, first_transaction_id, start_date, rng):
    """Build one synthetic transaction per entry of ``ids``.

    Parameters
    ----------
    ids : sequence of int
        Customer id for each row, in row order.
    first_transaction_id : int
        Transaction id of the first row; later rows count up by one.
    start_date : str
        Date of the first row; each later row is one day after the previous.
    rng : numpy.random.Generator or int
        Random source for product and amount. An int is used as a seed.

    Returns
    -------
    pandas.DataFrame
        Columns customer_id, transaction_id, product, amount, date.
    """
    rng = np.random.default_rng(rng)  # an existing Generator passes through unchanged
    ids = np.asarray(ids)
    count = len(ids)
    return pd.DataFrame({
        'customer_id': ids,
        'transaction_id': range(first_transaction_id, first_transaction_id + count),
        'product': rng.choice(PRODUCTS, count),
        'amount': rng.integers(50, 1200, count),  # upper bound is exclusive: 50..1199
        'date': pd.date_range(start_date, periods=count, freq='D'),
    })


# Step 1: Start with one guaranteed transaction per customer
base_transactions = make_transactions(customer_ids, 1, '2025-01-01', transactions_rng)

# Step 2: Add extra transactions for randomly chosen customers (to reach the total)
extra_count = total_transactions - len(customer_ids)
if extra_count < 0:
    raise ValueError("total_transactions must be at least the number of customers")
extra_transactions = make_transactions(
    transactions_rng.choice(np.asarray(customer_ids), extra_count, replace=True),
    len(customer_ids) + 1,  # continue numbering after the base transactions
    '2025-05-01',
    transactions_rng,
)

df_transactions = pd.concat([base_transactions, extra_transactions], ignore_index=True)
df_transactions.to_csv('transactions.csv', index=False)

print("CSV Transactions Sample:")
df_transactions

CSV Transactions Sample:


Unnamed: 0,customer_id,transaction_id,product,amount,date
0,45000,1,Headphones,788,2025-01-01
1,45001,2,Headphones,76,2025-01-02
2,45002,3,Tablet,209,2025-01-03
3,45003,4,Headphones,277,2025-01-04
4,45004,5,Laptop,996,2025-01-05
...,...,...,...,...,...
295,45058,296,Laptop,490,2025-10-23
296,45004,297,Camera,224,2025-10-24
297,45097,298,Tablet,816,2025-10-25
298,45059,299,Phone,651,2025-10-26


### Simulate Source 3: JSON File with Feedback

In [17]:
# One feedback record per customer id 45000-45119, the same ids used for the
# customer table and the transactions CSV. The upper bound of range() is
# exclusive, so 45120 stops at the last real customer (45121 would add a
# record for an id that no customer has).
# A dedicated, seeded generator keeps the file identical across re-runs.
feedback_rng = random.Random(42)

ratings = [1, 2, 3, 4, 5]
comments = [
    "Excellent service", "Good experience", "Average",
    "Delivery was late", "Product quality issue"
]

feedback_data = [
    {
        "custID": cid,
        "rating": feedback_rng.choice(ratings),
        "feedback": feedback_rng.choice(comments),
    }
    for cid in range(45000, 45120)
]

# Explicit encoding so the file is written the same way on every platform.
with open('feedback.json', 'w', encoding='utf-8') as f:
    json.dump(feedback_data, f, indent=4)