# Building Fast Queries on a CSV

We will imagine that we own an online laptop store and want to build a way to answer a few different business questions about our inventory.

We will use the `laptops.csv` file as our inventory. This CSV file was adapted from the Laptop Prices dataset on [Kaggle](https://www.kaggle.com/ionaskel/laptop-prices).

## The dataset

In [11]:
import csv
with open("laptops.csv", "r") as ifile:
    reader = csv.reader(ifile)
    laptops = list(reader)
    header, rows = laptops[0], laptops[1:]

In [12]:
print(header)

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']


In [13]:
print(*rows[:5], sep="\n")

['6571244', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 2.3GHz', '8GB', '128GB SSD', 'Intel Iris Plus Graphics 640', 'macOS', '1.37kg', '1339']
['7287764', 'Apple', 'Macbook Air', 'Ultrabook', '13.3', '1440x900', 'Intel Core i5 1.8GHz', '8GB', '128GB Flash Storage', 'Intel HD Graphics 6000', 'macOS', '1.34kg', '898']
['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', '575']
['9722156', 'Apple', 'MacBook Pro', 'Ultrabook', '15.4', 'IPS Panel Retina Display 2880x1800', 'Intel Core i7 2.7GHz', '16GB', '512GB SSD', 'AMD Radeon Pro 455', 'macOS', '1.83kg', '2537']
['8550527', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 3.1GHz', '8GB', '256GB SSD', 'Intel Iris Plus Graphics 650', 'macOS', '1.37kg', '1803']


## Inventory Class

In [14]:
class Inventory:
    def __init__(self, csv_file):
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]

In [15]:
inventory = Inventory("laptops.csv")
print(inventory.header)
print(len(inventory.rows))

['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']
1303


## Finding a Laptop from the Id

In [16]:
class Inventory:
    def __init__(self, csv_file):
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        for row in self.rows:
            # Convert prices to int for every row
            row[-1] = int(row[-1])
        
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
                

In [18]:
inventory = Inventory("laptops.csv")
id1 = inventory.get_laptop_from_id("3362737")
id2 = inventory.get_laptop_from_id("3362736")
print(id1)
print(id2)

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


## Improving Id Lookups

In [19]:
class Inventory:
    def __init__(self, csv_file):
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        
        for row in self.rows:
            # Convert prices to int for every row
            row[-1] = int(row[-1])
            
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row
        
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    def get_laptop_from_id_fast(self, laptop_id):
        try:
            row = self.id_to_row[laptop_id]
        except KeyError:
            return None
        return row

In [21]:
inventory = Inventory("laptops.csv")
id1 = inventory.get_laptop_from_id_fast("3362737")
id2 = inventory.get_laptop_from_id_fast("3362736")
print(id1)
print(id2)

['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575]
None


## Comparing the Performance

In [23]:
import time
import random

inventory = Inventory("laptops.csv")
total_time_no_dict = 0
total_time_dict = 0

ids = [str(random.randint(1000000, 9999999)) for _ in range(10000)]
for id_ in ids:
    start = time.time()
    inventory.get_laptop_from_id(id_)
    end = time.time()
    total_time_no_dict += (end - start)

for id_ in ids:
    start = time.time()
    inventory.get_laptop_from_id_fast(id_)
    end = time.time()
    total_time_dict += (end - start)
    
print(total_time_no_dict)
print(total_time_dict)

0.7140219211578369
0.00689387321472168


## Two Laptop Promotion

In [33]:
class Inventory:
    def __init__(self, csv_file):
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        
        for row in self.rows:
            # Convert prices to int for every row
            row[-1] = int(row[-1])
            
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row
        
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    def get_laptop_from_id_fast(self, laptop_id):
        try:
            row = self.id_to_row[laptop_id]
        except KeyError:
            return None
        return row
    
    def check_promotion_dollars(self, dollars):
        for row in self.rows:
            if row == dollars:
                return True
        
        for i in range(len(self.rows)):
            for j in range(i, len(self.rows)):
                if (self.rows[i][-1] + self.rows[j][-1] == dollars):
                    return True
        return False

In [32]:
inventory = Inventory("laptops.csv")
c1 = inventory.check_promotion_dollars(1000)
c2 = inventory.check_promotion_dollars(442)
print(c1)
print(c2)

True
False


## Optimizing Laptop Promotion

In [38]:
class Inventory:
    def __init__(self, csv_file):
        self.prices = set()
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        
        for row in self.rows:
            # Convert prices to int for every row
            row[-1] = int(row[-1])
            self.prices.add(row[-1])
            
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row
        
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    def get_laptop_from_id_fast(self, laptop_id):
        try:
            row = self.id_to_row[laptop_id]
        except KeyError:
            return None
        return row
    
    def check_promotion_dollars(self, dollars):
        for row in self.rows:
            if row == dollars:
                return True
        
        for i in range(len(self.rows)):
            for j in range(i, len(self.rows)):
                if (self.rows[i][-1] + self.rows[j][-1] == dollars):
                    return True
        return False
    
    def check_promotion_dollars_fast(self, dollars):
        if dollars in self.prices:
            return True
        for price in self.prices:
            if dollars - price in self.prices:
                return True
        return False

In [39]:
inventory = Inventory("laptops.csv")
c1 = inventory.check_promotion_dollars_fast(1000)
c2 = inventory.check_promotion_dollars_fast(442)
print(c1)
print(c2)

True
False


## Comparing Promotion Functions

In [44]:
prices = [random.randint(100, 5000) for _ in range(100)]
total_time_no_set = 0
total_time_set = 0
inventory = Inventory("laptops.csv")

for price in prices:
    start = time.time()
    inventory.check_promotion_dollars(price)
    end = time.time()
    total_time_no_set += (end - start)
    
for price in prices:
    start = time.time()
    inventory.check_promotion_dollars_fast(price)
    end = time.time()
    total_time_set += (end - start)
    
print(total_time_no_set)
print(total_time_set)
print("factor = ", total_time_no_set / total_time_set)

1.3113911151885986
0.0005054473876953125
factor =  2594.515566037736


## Finding Laptops within a Budget

In [75]:
class Inventory:
    def __init__(self, csv_file):
        self.prices = set()
        with open(csv_file) as ifile:
            reader = csv.reader(ifile)
            rows = list(reader)
        self.header = rows[0]
        self.rows = rows[1:]
        
        for row in self.rows:
            # Convert prices to int for every row
            row[-1] = int(row[-1])
            self.prices.add(row[-1])
            
        self.id_to_row = {}
        for row in self.rows:
            self.id_to_row[row[0]] = row
        
    def get_laptop_from_id(self, laptop_id):
        for row in self.rows:
            if row[0] == laptop_id:
                return row
        return None
    
    def get_laptop_from_id_fast(self, laptop_id):
        try:
            row = self.id_to_row[laptop_id]
        except KeyError:
            return None
        return row
    
    def check_promotion_dollars(self, dollars):
        for row in self.rows:
            if row == dollars:
                return True
        
        for i in range(len(self.rows)):
            for j in range(i, len(self.rows)):
                if (self.rows[i][-1] + self.rows[j][-1] == dollars):
                    return True
        return False
    
    def check_promotion_dollars_fast(self, dollars):
        if dollars in self.prices:
            return True
        for price in self.prices:
            if dollars - price in self.prices:
                return True
        return False
    
    # Returns max index of laptop record in the sorted laptops list such that
    # records till that index falls within budget
    def find_index_budget(self, budget):
        sorted_rows = sorted(self.rows, key=lambda row: row[-1])
        if (sorted_rows[-1][-1] <= budget):
            return len(sorted_rows) - 1
        lb = 0
        ub = len(sorted_rows) - 1
        while lb <= ub:
            mid = (lb + ub) // 2
            mid_val = sorted_rows[mid][-1]
            mid_prev_val = sorted_rows[mid - 1][-1]
            if (mid_val >= budget and mid - 1 >= 0 and mid_prev_val < budget):
                return mid
            elif (sorted_rows[mid][-1] < budget):
                lb = mid + 1
            else:
                ub = mid -1
        return -1

In [76]:
inventory = Inventory("laptops.csv")
i1 = inventory.find_index_budget(1000)
i2 = inventory.find_index_budget(10000)
print(i1)
print(i2)

682
1302
