# Performance Bottlenecks

## Naive Distinct Value Tracking

In [None]:
import time
import random

def process_ids(ids):
    """Split ``ids`` into first occurrences and a count of repeats.

    This is the naive baseline for this section: every membership test
    scans the result list, so each new ID costs a pass over everything
    collected so far.

    Args:
        ids: Iterable of ID values.

    Returns:
        A ``(unique_ids, duplicates)`` tuple: the distinct IDs as a list in
        first-seen order, and the number of elements that repeated an
        earlier ID.
    """
    first_seen = []
    repeat_count = 0
    for candidate in ids:
        if candidate not in first_seen:  # linear scan of a list
            first_seen.append(candidate)
            continue
        repeat_count += 1
    return first_seen, repeat_count

def main():
    """Time ``process_ids`` on 100,000 seeded random IDs and print a summary."""
    # Simulate 100,000 IDs with some duplicates; the fixed seed makes every
    # run process the same values.
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    unique, dups = process_ids(ids)
    elapsed = time.perf_counter() - start_time

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {elapsed:.2f} seconds")


if __name__ == "__main__":
    main()

Note that when I asked for an AI-generated recommendation, it produced some unnecessary code.

In [None]:
import time
import random

def process_ids(ids):
    """Split ``ids`` into first occurrences and a count of repeats.

    A set answers the "seen before?" question with an O(1) lookup, while a
    separate list records the distinct IDs in the order they first appeared.

    Args:
        ids: Iterable of hashable ID values.

    Returns:
        A ``(unique_ids, duplicates)`` tuple: the distinct IDs as a list in
        first-seen order, and the number of elements that repeated an
        earlier ID.
    """
    already_seen = set()
    ordered_unique = []
    repeat_count = 0
    for candidate in ids:
        if candidate not in already_seen:  # set membership, not a list scan
            already_seen.add(candidate)
            ordered_unique.append(candidate)  # keeps first-seen order
        else:
            repeat_count += 1
    return ordered_unique, repeat_count

def main():
    """Time ``process_ids`` on 100,000 seeded random IDs and print a summary."""
    # Simulate 100,000 IDs with some duplicates; the fixed seed makes every
    # run process the same values.
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    unique, dups = process_ids(ids)
    elapsed = time.perf_counter() - start_time

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {elapsed:.2f} seconds")


if __name__ == "__main__":
    main()

The `unique_ids` list was completely redundant with the `seen_ids` set (unless you need to preserve the order in which the IDs first appeared).

In [None]:
import time
import random

def process_ids(ids):
    """Collect the distinct IDs in a set and count the repeated ones.

    Args:
        ids: Iterable of hashable ID values.

    Returns:
        A ``(unique_ids, duplicates)`` tuple: the distinct IDs as a set, and
        the number of elements that repeated an earlier ID.
    """
    distinct = set()
    repeat_count = 0
    for candidate in ids:
        if candidate not in distinct:  # O(1) set membership test
            distinct.add(candidate)
        else:
            repeat_count += 1
    return distinct, repeat_count

def main():
    """Time ``process_ids`` on 100,000 seeded random IDs and print a summary."""
    # Simulate 100,000 IDs with some duplicates; the fixed seed makes every
    # run process the same values.
    random.seed(42)
    ids = [random.randint(1, 100000) for _ in range(100000)]

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    unique, dups = process_ids(ids)
    elapsed = time.perf_counter() - start_time

    print(f"Found {len(unique)} unique IDs and {dups} duplicates in {elapsed:.2f} seconds")


if __name__ == "__main__":
    main()

## Nested Loop Bottlenecks

In [None]:
def process_users(user_data):
    """Keep active users whose email contains '@', one record per email.

    This is the slow baseline for this section: the duplicate check is a
    nested loop over every email accepted so far.

    Args:
        user_data: Iterable of dicts with 'id', 'email', 'name' and 'active'
            keys.

    Returns:
        A list of new dicts, in input order, with keys 'email' (lower-cased,
        with a '_processed' suffix), 'name' (upper-cased) and 'id' (as a
        string). Only the first user seen for each original email is kept.
    """
    filtered_users = []
    # Original emails of the accepted users, parallel to filtered_users. The
    # stored records hold the transformed email ('..._processed'), which never
    # equals an incoming raw email, so duplicates are checked against these.
    accepted_emails = []

    for user in user_data:
        email = user['email']
        is_active = user['active']

        # Simple validity check: the user is active and the email has an '@'.
        if is_active and '@' in email:
            # Nested scan for duplicates (the bottleneck this example shows).
            for accepted_email in accepted_emails:
                if accepted_email == email:
                    break
            else:
                # for/else: runs only when the scan found no matching email.
                processed_user = {
                    'email': email.lower() + '_processed',
                    'name': user['name'].upper(),
                    'id': str(user['id'])
                }
                filtered_users.append(processed_user)
                accepted_emails.append(email)

    return filtered_users

def main():
    """Build 100,000 synthetic users (even ids are active) and report how many are kept."""
    user_count = 100000
    users = [
        {
            'id': n,
            'email': f'user{n}@example.com',
            'name': f'User {n}',
            'active': n % 2 == 0,
        }
        for n in range(user_count)
    ]

    processed = process_users(users)
    print(f"Processed {len(processed)} users")


if __name__ == "__main__":
    main()

In [None]:
import re

def process_users(user_data):
    """Keep active users whose email contains '@', one record per email.

    Emails that were already accepted are tracked in a set, so the duplicate
    check is a single membership test instead of a scan over earlier results.

    Args:
        user_data: Iterable of dicts with 'id', 'email', 'name' and 'active'
            keys.

    Returns:
        A list of new dicts, in input order, with keys 'email' (lower-cased,
        with a '_processed' suffix), 'name' (upper-cased) and 'id' (as a
        string). Only the first user seen for each original email is kept.
    """
    kept = []
    emails_taken = set()

    for record in user_data:
        address = record['email']
        if not record['active']:
            continue
        if '@' not in address or address in emails_taken:
            continue
        kept.append({
            'email': address.lower() + '_processed',
            'name': record['name'].upper(),
            'id': str(record['id']),
        })
        emails_taken.add(address)  # keyed on the original, untransformed email

    return kept


def main():
    """Build 100,000 synthetic users (even ids are active) and report how many are kept."""
    user_count = 100000
    users = [
        {
            'id': n,
            'email': f'user{n}@example.com',
            'name': f'User {n}',
            'active': n % 2 == 0,
        }
        for n in range(user_count)
    ]

    processed = process_users(users)
    print(f"Processed {len(processed)} users")


if __name__ == "__main__":
    main()

## SQL Optimization

Let's start with a common mistake that is easy to make when running SQL from Python. Let's say I want to insert the next 10,000 dates into a `CALENDAR` table that has a single column of dates. Why is this so slow?

In [42]:
import sqlite3

# Insert 10,000 rows, one per iteration. Every pass opens a new connection,
# runs a single INSERT, commits, and then closes the cursor and the connection.
for _ in range(10_000):
    conn = sqlite3.connect("company_operations.db")
    cursor = conn.cursor()
    # The inserted value is the day after the current MAX(CALENDAR_DATE).
    cursor.execute("""INSERT INTO CALENDAR (CALENDAR_DATE) VALUES
                   ((SELECT DATE(MAX(CALENDAR_DATE),'+1 day') FROM CALENDAR))
                   """)
    conn.commit()  # one commit per inserted row
    cursor.close()
    conn.close()

In [38]:
import pandas as pd
import sqlite3
import time

# Open a connection to the SQLite database file; `conn` is what the query below passes to pd.read_sql.
conn = sqlite3.connect("company_operations.db")

Let's pretend for a moment that the SQL query below took a very long time. Let's see what the AI recommends.

In [39]:
# Time one filtered query against WEATHER_MONITOR.
sql = """
SELECT * FROM WEATHER_MONITOR
WHERE REPORT_DATE = '2021-05-05'
"""

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock and can be too coarse
# for a query that finishes in a few milliseconds.
start = time.perf_counter()
pd.read_sql(sql, conn)
end = time.perf_counter()
print(f"Query took {end - start:.4f} seconds")

Query took 0.0037 seconds


But keep in mind that if the AI recommends an index or other solutions, there are gotchas, such as write speed going down substantially. These are nuances that you cannot pick up from vibe coding, which is why you should know the subject matter you are prompting about.

Here is another example. Let's try to optimize this query.

In [None]:
import pandas as pd
import sqlite3

conn = sqlite3.connect("company_operations.db")

# Each order row is returned with its customer's average quantity. The average
# comes from a subquery correlated with the outer row through CUSTOMER_ID.
sql = """
SELECT
CUSTOMER_ID,
ORDER_DATE,
QUANTITY,
(SELECT AVG(QUANTITY)
 FROM CUSTOMER_ORDER co3
 WHERE co3.CUSTOMER_ID = co1.CUSTOMER_ID) as avg_customer_quantity
FROM CUSTOMER_ORDER co1
ORDER BY ORDER_DATE
"""

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start = time.perf_counter()
pd.read_sql(sql, conn)
end = time.perf_counter()
print(f"Query took {end - start:.4f} seconds")

Ideally, the fix the AI recommends would use a window function, but it may settle for a common table expression or a derived table instead. This again shows that the AI may propose something that works but is suboptimal.

## Vectorization

Let's see how we can have AI convert our code to vectorized numerical operations. This will make our code much faster and help us learn the NumPy equivalents of plain Python loops.

Here is a simple summation of the 300 million integers from 0 through 299,999,999. It takes nearly 10 seconds to do. Let's see how AI suggests we fix it.

In [None]:
# Pure-Python baseline: add every integer produced by range(300_000_000)
# (0 through 299,999,999) to a running total, one loop iteration per value.
total = 0
for i in range(300_000_000):
    total += i

print(total)

Here is what ChatGPT suggested I do. It takes less than a second.

In [None]:
import numpy as np

# Vectorized sum of 0 through 299,999,999: build the array, then reduce it with np.sum.
# NOTE(review): the result (about 4.5e16) needs a 64-bit integer. Presumably the
# default integer dtype is 64-bit here; confirm on Windows with NumPy 1.x, or
# pass dtype=np.int64 to np.arange.
total = np.sum(np.arange(300_000_000))
print(total)

Calculating this average took nearly 7 seconds.

In [None]:
# Pure-Python baseline: materialize 300 million integers in a list.
x = [i for i in range(300_000_000)]

# Arithmetic mean from the built-in sum() and len().
mean = sum(x) / len(x)

print("MEAN: ", mean)

Below, the AI-generated response took less than half a second.

In [None]:
import numpy as np

# Vectorized mean: hold 0 through 299,999,999 in a NumPy array and average it with np.mean.
x = np.arange(300_000_000)
mean = np.mean(x)

print("MEAN: ", mean)

Now here is something painful. This operation takes nearly 30 seconds.

In [None]:
# Pure-Python baseline: element-wise addition of two 300-million-item lists.
x = list(range(300_000_000))
y = list(range(300_000_000))
z = []
# Index-based loop: two list lookups and one append per element.
for i in range(len(x)):
    z.append(x[i] + y[i])

But the AI-generated optimization takes less than 10 seconds.

In [None]:
import numpy as np
# Vectorized element-wise addition of two 300-million-element arrays.
x = np.arange(300_000_000)
y = np.arange(300_000_000)
z = x + y  # adds matching elements in a single array expression, no Python loop

In [40]:
import math
# Pure-Python baseline: z-score normalization of one million values.
data = [i * 0.1 for i in range(1_000_000)]  # Simulated dataset
mean = sum(data) / len(data)
# Population variance: squared deviations are divided by len(data).
variance = sum((x - mean) ** 2 for x in data) / len(data)
std_dev = math.sqrt(variance)
normalized = []
# Rescale each value to (value - mean) / standard deviation, one at a time.
for val in data:
    normalized.append((val - mean) / std_dev)

In [41]:
import numpy as np
# Vectorized z-score normalization: the subtraction and division apply to the whole array at once.
data = np.arange(1_000_000) * 0.1  # Simulated dataset
normalized = (data - np.mean(data)) / np.std(data)

## Computing Pairwise Euclidean Distances for Clustering

In [None]:
import math
# Pure-Python baseline: full pairwise Euclidean distance matrix as a list of lists.
points = [[i, i+1] for i in range(10_000)]  # List of [x, y] points
distances = []
# Two nested loops over all points: 10,000 x 10,000 distance computations.
for i in range(len(points)):
    row = []
    for j in range(len(points)):
        dx = points[i][0] - points[j][0]
        dy = points[i][1] - points[j][1]
        row.append(math.sqrt(dx**2 + dy**2))
    distances.append(row)  # row i holds the distances from point i to every point

In [None]:
import numpy as np
# Vectorized pairwise Euclidean distances via broadcasting.
# The (10000, 1) column of indices times [1, 1] gives rows [i, i]; adding [0, 1] gives rows [i, i+1].
points = np.arange(10_000)[:, np.newaxis] * np.array([1, 1]) + np.array([0, 1])  # Array of shape (10000, 2)
# (10000, 1, 2) minus (1, 10000, 2) broadcasts to (10000, 10000, 2): one coordinate difference per pair.
diffs = points[:, np.newaxis, :] - points[np.newaxis, :, :]
# Sum the squared differences over the last (coordinate) axis, then take the square root.
distances = np.sqrt(np.sum(diffs**2, axis=-1))
# Alternatively, for even faster: use scipy.spatial.distance.cdist(points, points)

## Filtering and Aggregating Time-Series Data

In [None]:
# Pure-Python baseline: average the prices whose matching volume exceeds 500.
prices = [i * 0.01 for i in range(10_000_000)]  # Time-series prices
volumes = [i % 1000 for i in range(10_000_000)]  # Volumes
high_volume_prices = []
# Index-based loop that pairs each price with the volume at the same position.
for i in range(len(prices)):
    if volumes[i] > 500:
        high_volume_prices.append(prices[i])
# Fall back to 0 when nothing passed the filter, avoiding a division by zero.
avg_price = sum(high_volume_prices) / len(high_volume_prices) if high_volume_prices else 0

In [None]:
import numpy as np
# Vectorized version: average the prices whose matching volume exceeds 500.
prices = np.arange(10_000_000) * 0.01
volumes = np.arange(10_000_000) % 1000
mask = volumes > 500  # boolean array, True where the volume exceeds 500
high_volume_prices = prices[mask]  # boolean indexing keeps only the masked prices
# Fall back to 0 when nothing passed the filter.
avg_price = np.mean(high_volume_prices) if len(high_volume_prices) > 0 else 0