In [168]:
# Importing libraries

import sqlite3
import pandas as pd
import numpy as np
import json
import random


## Step 1: Extract

### 1 - Customer Profiles from SQLite database: company_data.db

In [169]:
# Create SQLite database
conn = sqlite3.connect('company_data.db')
cursor = conn.cursor()

In [170]:
# List all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

tables = cursor.fetchall()
print("Tables in database:", tables)

Tables in database: [('Customers',), ('unified_data',)]


In [171]:
# Save customers table into data frame
df_customers = pd.read_sql_query("SELECT * FROM customers", conn)
df_customers

Unnamed: 0,CustId,FirstName,LastName,Age,Country
0,45000,Sipho,Naidoo,20,South Africa
1,45001,Michael,Smith,37,USA
2,45002,Palesa,Petersen,40,South Africa
3,45003,David,Smith,20,USA
4,45004,Noah,Taylor,27,Australia
...,...,...,...,...,...
115,45115,Sophie,Jones,32,Australia
116,45116,Sipho,Naidoo,39,South Africa
117,45117,Amelia,Wilson,30,England
118,45118,Charlotte,Evans,32,England


Query on the company_data.db (Optional - but recommended to get more pracice)

In [172]:
pd.read_sql_query("""
                  SELECT FirstName, Age FROM customers
                  WHERE Age>39""", conn)

Unnamed: 0,FirstName,Age
0,Palesa,40
1,Ella,40
2,Camille,40
3,Naledi,40
4,Sipho,40
5,Chloé,40
6,Thabo,40


### 2 - Read csv file: "transactions.csv"

In [173]:
df_transactions = pd.read_csv("transactions.csv")
df_transactions

Unnamed: 0,transaction_id,customer_id,product,amount,date
0,1,45049,Tablet,672,2025-01-01
1,2,45091,Laptop,300,2025-01-02
2,3,45095,Headphones,429,2025-01-03
3,4,45062,Phone,930,2025-01-04
4,5,45014,Laptop,486,2025-01-05
...,...,...,...,...,...
295,296,45086,Tablet,541,2025-10-23
296,297,45058,Headphones,270,2025-10-24
297,298,45040,Tablet,220,2025-10-25
298,299,45053,Tablet,689,2025-10-26


### 3 - Read JSON file: "feedback.json"

In [174]:
# Read JSON file into DataFrame
with open('feedback.json') as f:
    data = json.load(f)
df_feedback = pd.DataFrame(data)

print("JSON Feedback Sample:")
print(df_feedback.head(), "\n")

JSON Feedback Sample:
   custID  rating               feedback
0   45000       2      Excellent service
1   45001       1                Average
2   45002       1                Average
3   45003       3                Average
4   45004       1  Product quality issue 



## Step 2: Transform

In [175]:
# Clean and align column names
df_customers.rename(columns={'CustId': 'customer_id'}, inplace=True)
df_customers

df_feedback.rename(columns={'custID': 'customer_id'}, inplace=True)
df_feedback

Unnamed: 0,customer_id,rating,feedback
0,45000,2,Excellent service
1,45001,1,Average
2,45002,1,Average
3,45003,3,Average
4,45004,1,Product quality issue
...,...,...,...
116,45116,4,Excellent service
117,45117,2,Average
118,45118,5,Delivery was late
119,45119,3,Excellent service


In [176]:
df_customers

Unnamed: 0,customer_id,FirstName,LastName,Age,Country
0,45000,Sipho,Naidoo,20,South Africa
1,45001,Michael,Smith,37,USA
2,45002,Palesa,Petersen,40,South Africa
3,45003,David,Smith,20,USA
4,45004,Noah,Taylor,27,Australia
...,...,...,...,...,...
115,45115,Sophie,Jones,32,Australia
116,45116,Sipho,Naidoo,39,South Africa
117,45117,Amelia,Wilson,30,England
118,45118,Charlotte,Evans,32,England


In [177]:
# Merge all sources
df_merged = (
    df_customers
    .merge(df_transactions, on='customer_id', how='left')
    .merge(df_feedback, on='customer_id', how='left')
)

In [178]:
# Clean and handle missing values
df_merged['feedback'] = df_merged['feedback'].fillna('No feedback')
df_merged['rating'] = df_merged['rating'].fillna(0).astype(int)

## Step 3: Load

In [179]:
#  Load Unified Data into SQLite
df_merged.to_sql('unified_data', conn, if_exists='replace', index=False)

df_merged

Unnamed: 0,customer_id,FirstName,LastName,Age,Country,transaction_id,product,amount,date,rating,feedback
0,45000,Sipho,Naidoo,20,South Africa,41.0,Laptop,786.0,2025-02-10,2,Excellent service
1,45000,Sipho,Naidoo,20,South Africa,72.0,Tablet,872.0,2025-03-13,2,Excellent service
2,45000,Sipho,Naidoo,20,South Africa,89.0,Phone,1027.0,2025-03-30,2,Excellent service
3,45000,Sipho,Naidoo,20,South Africa,91.0,Laptop,201.0,2025-04-01,2,Excellent service
4,45001,Michael,Smith,37,USA,,,,,1,Average
...,...,...,...,...,...,...,...,...,...,...,...
301,45117,Amelia,Wilson,30,England,142.0,Phone,863.0,2025-05-22,2,Average
302,45118,Charlotte,Evans,32,England,102.0,Camera,554.0,2025-04-12,5,Delivery was late
303,45118,Charlotte,Evans,32,England,213.0,Phone,971.0,2025-08-01,5,Delivery was late
304,45119,Leon,Hoffmann,34,Germany,140.0,Tablet,1176.0,2025-05-20,3,Excellent service


Query on the unified dataset

In [180]:
query1 = """
SELECT country, COUNT(DISTINCT customer_id) as num_customers,
       ROUND(AVG(amount),2) as avg_spent
FROM unified_data
GROUP BY country
ORDER BY avg_spent DESC
"""
print("Average Spending by Country:")
print(pd.read_sql_query(query1, conn))

Average Spending by Country:
        Country  num_customers  avg_spent
0     Sri Lanka              8     776.72
1         Italy             11     732.03
2        France             17     686.86
3       Germany             14     666.45
4  South Africa             27     654.99
5           USA             13     642.52
6       England             16     571.47
7     Australia             14     549.51


In [181]:
query2 = """
SELECT rating, COUNT(*) as num_feedbacks,
       ROUND(AVG(amount),2) as avg_amount
FROM unified_data
GROUP BY rating
ORDER BY rating DESC
"""
print("Customer Ratings vs Average Transaction Amount:")
print(pd.read_sql_query(query2, conn), "\n")

Customer Ratings vs Average Transaction Amount:
   rating  num_feedbacks  avg_amount
0       5             97      654.24
1       4             41      667.88
2       3             47      698.49
3       2             56      638.58
4       1             65      610.05 



### Exporting integrated (unified) dataset

In [182]:
# Export Integrated Dataset
df_merged.to_csv('unified_dataset.csv', index=False)
print("Integrated dataset exported to unified_dataset.csv")

Integrated dataset exported to unified_dataset.csv


In [183]:
# Close the connection
conn.close()