# Performance Bottlenecks

## Naive Distinct Value Tracking

In [4]:
import time
import random

def process_ids(ids):
    unique_ids = []
    duplicates = 0
    for id_val in ids:
        if id_val in unique_ids:
            duplicates += 1
        else:
            unique_ids.append(id_val)
    return unique_ids, duplicates

def main():
    # Simulate 100,000 IDs with some duplicates
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    start_time = time.time()
    unique, dups = process_ids(ids)
    end_time = time.time()

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()

Found 63054 unique IDs and 36946 duplicates in 6.66 seconds


Note when I got an AI-generated recommendation ho

In [7]:
import time
import random

def process_ids(ids):
    seen_ids = set()
    unique_ids = []
    duplicates = 0
    for id_val in ids:
        if id_val in seen_ids:  # Efficient O(1) lookup on set
            duplicates += 1
        else:
            seen_ids.add(id_val)
            unique_ids.append(id_val)  # Maintain order if needed
    return unique_ids, duplicates

def main():
    # Simulate 100,000 IDs with some duplicates
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    start_time = time.time()
    unique, dups = process_ids(ids)
    end_time = time.time()

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()

Found 63054 unique IDs and 36946 duplicates in 0.00 seconds


The `seen_ids` list was completely redundant to the `seen_ids`.

In [6]:
import time
import random

def process_ids(ids):
    unique_ids = set()
    duplicates = 0
    for id_val in ids:
        if id_val in unique_ids:  # Efficient O(1) lookup on set
            duplicates += 1
        else:
            unique_ids.add(id_val)
    return unique_ids, duplicates

def main():
    # Simulate 100,000 IDs with some duplicates
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    start_time = time.time()
    unique, dups = process_ids(ids)
    end_time = time.time()

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()

Found 63054 unique IDs and 36946 duplicates in 0.00 seconds


In [8]:
def process_users(user_data):
    # Simulate a large dataset of users
    filtered_users = []


    for user in user_data:
        # Check if user is active and has a valid email
        email = user['email']
        is_active = user['active']

        #  string search for email validation
        if is_active and '@' in email:
            # checking duplicates
            for existing_user in filtered_users:
                if existing_user['email'] == email:
                    break
            else:

                processed_user = {
                    'email': email.lower() + '_processed',
                    'name': user['name'].upper(),
                    'id': str(user['id'])
                }
                filtered_users.append(processed_user)

    return filtered_users

def main():
    # Simulate a large dataset
    users = [
        {'id': i, 'email': f'user{i}@example.com', 'name': f'User {i}', 'active': i % 2 == 0}
        for i in range(100000)
    ]

    # Process the users
    result = process_users(users)
    print(f"Processed {len(result)} users")

if __name__ == "__main__":
    main()

Processed 50000 users


In [10]:
import re

def process_users(user_data):
    filtered_users = []
    seen_emails = set()  # Track duplicates

    for user in user_data:
        email = user['email']
        if user['active'] and '@' in email and email not in seen_emails:
            processed_user = {
                'email': email.lower() + '_processed',
                'name': user['name'].upper(),
                'id': str(user['id'])
            }
            filtered_users.append(processed_user)
            seen_emails.add(email)  # Mark as seen

    return filtered_users


def main():
    # Simulate a large dataset
    users = [
        {'id': i, 'email': f'user{i}@example.com', 'name': f'User {i}', 'active': i % 2 == 0}
        for i in range(100000)
    ]

    # Process the users
    result = process_users(users)
    print(f"Processed {len(result)} users")

if __name__ == "__main__":
    main()

Processed 50000 users


In [5]:
import pandas as pd
import sqlite3

conn = sqlite3.connect("company_operations.db")

sql = """
SELECT
CUSTOMER_ID,
ORDER_DATE,
QUANTITY,
(SELECT AVG(QUANTITY)
 FROM CUSTOMER_ORDER co3
 WHERE co3.CUSTOMER_ID = co1.CUSTOMER_ID) as avg_customer_quantity
FROM CUSTOMER_ORDER co1
ORDER BY ORDER_DATE
"""

pd.read_sql(sql, conn)

Unnamed: 0,CUSTOMER_ID,ORDER_DATE,QUANTITY,avg_customer_quantity
0,9,2021-01-01,20,110.970149
1,5,2021-01-01,110,104.700855
2,3,2021-01-01,120,103.451327
3,6,2021-01-01,200,100.977444
4,2,2021-01-01,60,103.423423
...,...,...,...,...
1185,9,2021-03-31,70,110.970149
1186,5,2021-03-31,140,104.700855
1187,10,2021-03-31,80,100.080000
1188,9,2021-03-31,20,110.970149


In [6]:
import pandas as pd
import sqlite3

conn = sqlite3.connect("company_operations.db")

sql = """
SELECT
CUSTOMER_ID,
ORDER_DATE,
QUANTITY,
AVG(co1.QUANTITY * 1.0) OVER (PARTITION BY co1.CUSTOMER_ID)
FROM CUSTOMER_ORDER co1
ORDER BY ORDER_DATE
"""

pd.read_sql(sql, conn)

Unnamed: 0,CUSTOMER_ID,ORDER_DATE,QUANTITY,AVG(co1.QUANTITY * 1.0) OVER (PARTITION BY co1.CUSTOMER_ID)
0,2,2021-01-01,60,103.423423
1,2,2021-01-01,10,103.423423
2,2,2021-01-01,60,103.423423
3,3,2021-01-01,120,103.451327
4,3,2021-01-01,10,103.451327
...,...,...,...,...
1185,10,2021-03-31,30,100.080000
1186,10,2021-03-31,50,100.080000
1187,10,2021-03-31,90,100.080000
1188,10,2021-03-31,120,100.080000
