<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1">Imports</a></span><ul class="toc-item"><li><span><a href="#Specify-Tesseract-install-location" data-toc-modified-id="Specify-Tesseract-install-location-1.1">Specify Tesseract install location</a></span></li></ul></li><li><span><a href="#Choose-product-category" data-toc-modified-id="Choose-product-category-2">Choose product category</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-3">Functions</a></span><ul class="toc-item"><li><span><a href="#Download-data-for-all-products" data-toc-modified-id="Download-data-for-all-products-3.1">Download data for all products</a></span></li><li><span><a href="#Download-and-clean-data-for-a-single-product" data-toc-modified-id="Download-and-clean-data-for-a-single-product-3.2">Download and clean data for a single product</a></span></li><li><span><a href="#Scrape-price-data-for-a-single-product" data-toc-modified-id="Scrape-price-data-for-a-single-product-3.3">Scrape price data for a single product</a></span></li><li><span><a href="#Find-the-closest-date-in-a-list" data-toc-modified-id="Find-the-closest-date-in-a-list-3.4">Find the closest date in a list</a></span></li><li><span><a href="#Locate-a-line-in-an-image" data-toc-modified-id="Locate-a-line-in-an-image-3.5">Locate a line in an image</a></span></li><li><span><a href="#Impute-NaN-values-in-price-data" data-toc-modified-id="Impute-NaN-values-in-price-data-3.6">Impute NaN values in price data</a></span></li></ul></li><li><span><a href="#Download-the-dataset" data-toc-modified-id="Download-the-dataset-4">Download the dataset</a></span></li></ul></div>

You don't actually need to run this notebook unless you plan to make your own dataset (e.g., for a different product category).

# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image, ImageOps
import requests
from io import BytesIO
import skimage
from skimage import io
import difflib
import pytesseract
import datetime
import re
import os.path
from bisect import bisect_left
%matplotlib notebook

## Specify Tesseract install location

You will need to install
[Tesseract OCR](https://www.pyimagesearch.com/2017/07/10/using-tesseract-ocr-python/)
before you can use this code.

In [None]:
# Here, we specify the Tesseract OCR installation location.
# Set the TESSERACT_CMD environment variable to point at your own
# tesseract executable; the path below is only the fallback default.

pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'D:\Software\Tesseract-OCR\tesseract.exe')

# Choose product category

I compiled two datasets: books and board games. 
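You can easily create your own list of items in a similar format: a CSV file in `data/links` with a `Camel link` column whose last 10 characters are the product's ASIN.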

In [None]:
# Specify the dataset to download, and the download location.
product = 'book_db.csv'
output_folder = 'paperbacks'
#product = 'game_db.csv'
#output_folder = 'boardgames'

# Create the output folder, including the parent 'data' folder if it is
# missing. exist_ok makes the cell safe to run more than once.
os.makedirs(os.path.join('data', output_folder), exist_ok=True)

# Functions

## Download data for all products

In [None]:
def download_data(product_ids, output_folder):
    """
    Download the price history of every product and save each one as
    data/<output_folder>/<product_id>.csv.

    Parameters:
    product_ids (iterable of str): identifiers passed one by one to
        collect_data and used as the CSV file names.
    output_folder (str): name of the sub-folder of 'data' that receives
        the CSV files. It must already exist.

    Returns:
    None. A message is printed for every product that could not be
    collected.
    """
    for product_id in product_ids:
        df = collect_data(product_id)
        # Only an actual dataframe counts as success. Any other result
        # (None, or a tuple of Nones) is a failure signal and must not
        # reach to_csv.
        if isinstance(df, pd.DataFrame):
            df.to_csv(os.path.join('data', output_folder, product_id + '.csv'),
                      index=False)
        else:
            print('id ' + product_id + ' failed')

## Download and clean data for a single product

In [None]:
def collect_data(asin):
    """ 
    Get the price history of an item from Amazon.
  
    Parameters: 
    asin (str): Amazon Standard Identification Number for an item.
  
    Returns: 
    history: A pandas dataframe with 'date' and 'price' columns holding
        the item's price at 12-hour time resolution, collected from
        camelcamelcamel.com. None is returned when the chart could not
        be read or holds too little usable data.
    """
    # Set the size of the image to be downloaded. Larger sizes
    # will produce more accurate results at the cost of
    # increased processing time.
    image_width = 6555
    image_height = 1013

    # get the URL to the camelcamelcamel page
    url = 'https://charts.camelcamelcamel.com/us/' + asin + \
    '/amazon.png?force=0&zero=1&w='+str(image_width)+'&h='+str(image_height)+\
    '&desired=false&legend=0&ilt=1&tp=all&fo=0&lang=en'

    # Extract price data for times when the item was sold by Amazon
    dates, prices = scrape_prices(url, image_width, image_height)

    # Reject the item if scraping failed, if there are fewer than 60
    # observations, or if more than 30% of the prices are missing.
    # A single None is returned so that the failure value has the same
    # shape as the success value (one object, not a tuple).
    if dates is None or \
       np.size(dates) < 60 or \
       np.sum(np.isnan(prices))/np.size(dates) > .3:
        return None

    # Impute nans
    prices = impute_nan(prices)
    # Convert the price history to a dataframe
    history = pd.DataFrame({'date': dates, 'price': prices})

    return history

## Scrape price data for a single product

In [None]:
def scrape_prices(url, image_width, image_height):
    """ 
    Extract dates and prices from a camelcamelcamel item URL.

    The chart image is downloaded, its axes are located by color, the
    axis labels are read with OCR, and the blue price line is converted
    pixel column by pixel column into prices, which are then resampled
    to two observations per day.
  
    Parameters: 
    url (str): camelcamelcamel URL for a product.
    image_width (int): width of the image to be used, in pixels
    image_height (int): height of the image to be used, in pixels
  
    Returns: 
    dates: numpy array of dates at 12-hour intervals
    prices: a numpy array of prices
    Both values are None when the chart cannot be downloaded or parsed.
    """

    ################
    # Collect data #
    ################

    # Define colors of elements of the plot (RGB)
    # Plotted lines (red, green, blue - in that order)
    plot_colors = np.array([[194, 68, 68], [119, 195, 107], [51, 51, 102]])
    # Gray axis lines
    gray = np.array([215, 215, 214])
    # Black axis lines
    black = np.array([75, 75, 75])

    # Download the image. Network problems, HTTP error statuses and
    # undecodable payloads are all reported as "no data" instead of
    # aborting the whole download run.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Force 3-channel RGB so the color comparisons below are valid
        # whatever mode the PNG was stored in.
        image_temp = Image.open(BytesIO(response.content)).convert('RGB')
    except (requests.RequestException, OSError):
        return None, None

    # Convert image to a numpy array (rows, columns, channels)
    im = np.array(image_temp)

    # Get masks for each plot color
    masks = [np.all(im == color, axis=-1) for color in plot_colors]

    # Check if the image is empty (camel has no data)
    if not np.any(masks[1]):
        return None, None

    ######################
    # Find x and y scale #
    ######################

    # Find the y axis upper limit
    # Crop a portion of the image containing the top of the grid
    mid_x = round(image_width * .5)
    top_line_crop = im[:, mid_x - 5:mid_x + 6, :]
    # Get the position of the line. The crops are views of `im`, so
    # copies are handed over to guarantee that line detection can never
    # alter the image itself.
    line_y_value = find_line(top_line_crop.copy(), gray)

    # If it wasn't found, quit
    # Checks of this nature are rarely needed, as long
    # as camel keeps their plotting code the same
    if line_y_value is None:
        return None, None
    # Accept either a plain integer or a one-element array
    line_y_value = int(np.ravel(line_y_value)[0])

    # Find x axis limits
    # Crop the left-most and right-most vertical lines in the grid.
    # The crops are transposed so that vertical lines become rows.
    mid_y = round(image_height * .5)
    left_line_crop = np.transpose(
        im[mid_y - 8:mid_y + 9, :round(image_width * .1), :],
        axes=[1, 0, 2])
    right_line_crop = np.transpose(
        im[mid_y - 8:mid_y + 9, round(image_width * .7):, :],
        axes=[1, 0, 2])
    lo_x_value = find_line(left_line_crop.copy(), black)
    # The right crop is reversed, so the position found is measured
    # from the right edge of the image
    hi_x_value = find_line(right_line_crop[::-1, :, :].copy(), gray)
    if lo_x_value is None or hi_x_value is None:
        return None, None
    lo_x_value = int(np.ravel(lo_x_value)[0])
    hi_x_value = int(np.ravel(hi_x_value)[0])

    # Find price corresponding to the y axis upper limit
    # First, crop the price text
    upper_price_crop = im[line_y_value - 8:line_y_value + 10,
                          0:lo_x_value - 9, :]
    upper_price_crop = Image.fromarray(upper_price_crop)
    # Resize and apply OCR
    upper_price_crop = upper_price_crop.resize(
        (upper_price_crop.width * 12, upper_price_crop.height * 12))
    upper_price_string = pytesseract.image_to_string(upper_price_crop)
    # Drop the leading currency symbol and thousands separators.
    # Unreadable OCR output means the price scale is unknown.
    try:
        upper_price = float(upper_price_string[1:].replace(',', ''))
    except ValueError:
        return None, None

    # Store y position of price limits
    # The position and price of the lower limit are constant
    limit_y_positions = np.array([line_y_value, image_height - 49])

    # Calculate dollars per pixel
    dollarspp = upper_price / (np.max(limit_y_positions) -
                               np.min(limit_y_positions))

    # Crop year text from bottom of image so that we
    # can find the date of the first timepoint
    year_crop = im[-14:, 0:round(image_width / 8), :]
    year_crop = Image.fromarray(year_crop)
    # Resize and apply OCR
    year_crop = year_crop.resize((year_crop.width * 5, year_crop.height * 5))
    year_string = pytesseract.image_to_string(year_crop, config='--psm 7')
    year_string = year_string[:4]

    # Crop month and day from bottom left corner
    date_crop = im[-49:-14, (lo_x_value - 40):(lo_x_value + 6), :]
    # Convert to image
    date_crop = Image.fromarray(date_crop)
    # Invert, so that rotation works
    date_crop = ImageOps.invert(date_crop)
    # Pad the image
    date_crop_padded = Image.new(
        'RGB', (round(date_crop.width * 1.5), round(date_crop.height * 1.5)),
        (0, 0, 0))
    date_crop_padded.paste(date_crop, box=(0, round(date_crop.height * .5)))
    # Resize
    date_crop_padded = date_crop_padded.resize(
        (date_crop_padded.width * 7, date_crop_padded.height * 7),
        resample=Image.LANCZOS)
    # Rotate and invert
    date_crop_padded = ImageOps.invert(date_crop_padded.rotate(-45))
    # Crop
    date_crop_padded = date_crop_padded.crop((1,85,297,260))
    # Apply OCR
    date_string = pytesseract.image_to_string(date_crop_padded)
    # Find closest match to a month
    start_month = difflib.get_close_matches(date_string, [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
        'Nov', 'Dec'
    ],
                                            n=1,
                                            cutoff=0.2)
    # Quit if no month was found
    if not start_month:
        return None, None

    start_month = start_month[0]

    # Get the day of the first timepoint
    # Remove all whitespace first, so that trailing whitespace in the
    # OCR output cannot hide the last real character
    date_string_stripped = "".join(date_string.split())
    if not date_string_stripped:
        return None, None
    # Try to fix mixups between 'o' and 0
    if date_string_stripped[-1] == 'o':
        date_string_stripped = date_string_stripped[:-1] + '0'
    # Take last 2 digits if the second-to-last is reasonable
    if len(date_string_stripped) >= 2 and \
       date_string_stripped[-2].isdigit() and \
       0 < int(date_string_stripped[-2]) < 4:
        start_day = date_string_stripped[-2:]
    else:
        start_day = '0' + date_string_stripped[-1]

    # Store x axis locations of time limits
    limit_x_positions = [lo_x_value, image_width - hi_x_value]

    # Check if our date is valid
    try:
        start_time = datetime.datetime.strptime(
            start_month + start_day + year_string, '%b%d%Y')
    except ValueError:
        return None, None

    # Get current time
    end_time = datetime.datetime.now()

    # Get number of observations: one per pixel column between the
    # left and right limits of the plot
    num_obs = limit_x_positions[1] - limit_x_positions[0]
    if num_obs < 1:
        return None, None

    # Preallocate prices as nan
    prices = np.full(num_obs, np.nan)

    ##################
    # Extract prices #
    ##################

    # Find y-axis value of blue pixels in each time step -
    # these are the prices we're looking for
    y = [[i for i, x in enumerate(q) if x] for q in np.transpose(
        masks[2][:, limit_x_positions[0]:limit_x_positions[1]])]

    # Adjust values if necessary, then convert to prices
    # Missing data are set to nan
    for i in range(num_obs):
        column = limit_x_positions[0] + i

        # Check if the bottom of the blue line is covered by a red or green line
        if np.size(y[i]) == 1:
            below = int(y[i][0]) + 1
            if masks[0][below, column] or masks[1][below, column]:
                y[i][0] += 1

        # Check if the blue line is covered by both red and green lines
        if np.size(y[i]) == 0:
            red_idx = [q for q, x in enumerate(masks[0][:, column]) if x]
            grn_idx = [q for q, x in enumerate(masks[1][:, column]) if x]
            if np.size(red_idx) == 1 and np.size(grn_idx) == 1 and np.abs(
                    int(red_idx[0]) - int(grn_idx[0])) == 1:
                y[i] = grn_idx
            else:
                y[i] = np.nan

        # The lowest blue pixel in the column marks the price
        prices[i] = dollarspp * (image_height - np.max(y[i]) - 50)

    # Adjust periods with no data
    # First, find nans and convert to a str for regex searching
    nans = ''.join([str(int(np.isnan(i))) for i in prices])
    # Ensure the beginnings of empty periods are correct
    matches = [m.span() for m in re.finditer('000110011', nans)]
    for match in matches:
        prices[match[0] + 3:match[0] + 5] = prices[match[0] + 5]
    # Then remove empty periods
    nans = ''.join([str(int(np.isnan(i))) for i in prices])
    matches = [m.span() for m in re.finditer('1100', nans)]
    for match in matches:
        prices[match[0] + 2:match[0] + 4] = np.nan

    ###################
    # Resample prices #
    ###################

    # Resample to 2x daily observations at 6:00 and 18:00
    # First, get the dates of our observations
    dates = pd.date_range(start_time, end_time,
                          periods=num_obs).to_pydatetime()
    # Initialize new dates and prices at the desired interval
    dates_2x_daily = pd.date_range(datetime.datetime(start_time.year,
                                                     start_time.month,
                                                     start_time.day, 6),
                                   datetime.datetime(end_time.year,
                                                     end_time.month,
                                                     end_time.day, 18),
                                   freq='12H').to_pydatetime()
    prices_2x_daily = np.ones(np.size(dates_2x_daily)) * np.nan

    # Find price at the closest date to each timepoint
    for i in range(np.size(dates_2x_daily)):
        prices_2x_daily[i] = prices[take_closest_date(dates -
                                                      dates_2x_daily[i])]

    # Make sure most recent price is correct
    prices_2x_daily[-1] = prices[-1]
    # Round prices to 2 decimal places
    prices_2x_daily = np.around(prices_2x_daily, 2)

    return dates_2x_daily, prices_2x_daily

## Find the closest date in a list

In [None]:
# This function is a modified version of the one posted by
# Lauritz V. Thaulow on stackoverflow at
# https://stackoverflow.com/a/12141511
def take_closest_date(myList):
    """
    Return the index of the element of myList that is closest to a zero
    timedelta.

    myList must be sorted in ascending order. When the neighbours on
    either side of zero are equally close, the index of the smaller
    (earlier) one is returned.
    """
    zero = datetime.timedelta(0)
    idx = bisect_left(myList, zero)
    last = len(myList) - 1

    # Everything is at or after zero: the first element is the closest
    if idx == 0:
        return 0
    # Everything is before zero: the last element is the closest
    if idx > last:
        return last

    # Otherwise pick the nearer of the two neighbours, preferring the
    # earlier one on a tie
    return idx if abs(myList[idx]) < abs(myList[idx - 1]) else idx - 1

## Locate a line in an image

In [None]:
def find_line(img, c):
    """ 
    Find the position of a line that bisects an image horizontally. 
    If there are multiple lines, only the first is returned.
  
    Parameters: 
    img (numpy array): image to search, indexed as
        [row, column, channel]. The array is not modified.
    c (numpy array): 3-element array equal to the line's color.
  
    Returns: 
    int: Location of the line, in pixels from the top of the image,
        or None if no row contains a line.
  
    """
    # Colors in the image should be within this range of the target
    color_tolerance = 15
    # This fraction of the image should contain matching colors
    # to be considered a line
    match_threshold = .75

    img = np.asarray(img)
    # Use a signed type for the bounds so that subtracting the
    # tolerance can never wrap around
    target = np.asarray(c, dtype=np.int64)
    lower = target - color_tolerance
    upper = target + color_tolerance

    # Get the width of the image, in pixels
    img_width = np.size(img, 1)
    # Find all channel values within color tolerance. This is computed
    # as a separate boolean array, so the caller's image stays intact
    # and a channel value of 0 can still match.
    in_range = (img >= lower) & (img <= upper)
    # Only take pixels where all channels are within color tolerance
    mask2d = np.all(in_range, axis=-1)
    # Count matching pixels in each row
    sums = np.sum(mask2d, axis=1)
    # Find rows with sufficient matches
    rows = np.flatnonzero(sums > match_threshold * img_width)
    if rows.size < 1:
        return None
    return int(rows[0])

## Impute NaN values in price data

In [None]:
def impute_nan(x):
    """ 
    Fill every run of nan values in a price series with the last
    non-nan value that precedes it. Leading nans, which have no
    preceding value, are left untouched.
  
    Parameters: 
    x (numpy array): A series of prices. It is modified in place.
  
    Returns: 
    The same array, with nan values imputed.
    """
    # Encode the series as a string of flags ('1' = nan, '0' = value)
    # so that runs of nans can be located with a regex
    nan_flags = ''.join('1' if np.isnan(value) else '0' for value in x)
    # Each hit is one valid value followed by the nans that trail it
    for gap in re.finditer('0[^0]+', nan_flags):
        anchor, stop = gap.span()
        # Copy the valid value over the whole run of nans
        x[anchor + 1:stop] = x[anchor]
    return x

# Download the dataset

Run this cell to actually download the data.

In [None]:
# Read the table of camelcamelcamel links for the chosen product category
links_path = os.path.join('data', 'links', product)
db = pd.read_csv(links_path)
# The ASIN is the last 10 characters of each link
product_ids = db['Camel link'].str[-10:]

# Fetch and save the price history of every product
download_data(product_ids, output_folder)