# Try scraping StockX

Scraping pipeline:
1. Get entire catalog --> figure out individual product link
2. Product link --> sku, description
3. sku --> transaction data (activity)
4. Example Transaction data plot
5. Combining transaction activity data of all shoes
6. Combining Product detail and transaction activity of all products

In [None]:
import requests
from bs4 import BeautifulSoup as bs4
import re
import json
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import phonenumbers
import pandas as pd
import csv
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Root URL of the site; the request URLs in this file are built by appending paths to it.
base_url = 'https://stockx.com'

In [None]:
# predefined functions and settings

# To move columns in a pandas table
def movecol(df, cols_to_move=None, ref_col='', place='After'):
    """Return ``df`` with ``cols_to_move`` repositioned next to ``ref_col``.

    Args:
        df: DataFrame whose columns are reordered.
        cols_to_move: Column labels to relocate, in the order they should
            appear. ``None`` (the default) means no columns are moved.
        ref_col: Existing column used as the anchor.
        place: ``'After'`` puts the moved columns directly after ``ref_col``;
            ``'Before'`` puts them directly before it.

    Returns:
        The result of selecting the reordered column list from ``df``.

    Raises:
        ValueError: If ``place`` is neither ``'After'`` nor ``'Before'``, or
            if ``ref_col`` is not one of the columns of ``df``.
    """
    if place not in ('After', 'Before'):
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("place must be 'After' or 'Before', got %r" % (place,))

    # A None default avoids sharing one mutable list between calls; copying
    # also keeps the caller's list untouched.
    cols_to_move = [] if cols_to_move is None else list(cols_to_move)

    cols = df.columns.tolist()
    anchor = cols.index(ref_col)

    if place == 'After':
        seg1 = cols[:anchor + 1]
        seg2 = cols_to_move
    else:
        seg1 = cols[:anchor]
        seg2 = cols_to_move + [ref_col]

    # Drop the moved columns from the leading segment, then append whatever
    # has not been placed yet in its original order.
    seg1 = [i for i in seg1 if i not in seg2]
    seg3 = [i for i in cols if i not in seg1 + seg2]

    return df[seg1 + seg2 + seg3]
# return substring between start and end of input string s
def find_between(s, start, end):
    """Return the text of ``s`` that follows ``start`` and precedes ``end``.

    Only the piece between the first and second occurrence of ``start`` is
    considered; within that piece everything before the first ``end`` is
    returned (the whole piece when ``end`` does not occur).

    Raises:
        IndexError: If ``start`` does not occur in ``s``.
    """
    pieces = s.split(start)
    after_start = pieces[1]  # IndexError here signals a missing ``start``
    head, _sep, _tail = after_start.partition(end)
    return head

# utility to save figure
def save_fig(fig_id, tight_layout=True):
    """Write the figure via ``plt.savefig`` to ``fig_id + ".png"`` at 300 dpi.

    Args:
        fig_id: File name without extension; it is also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            saving.
    """
    target = os.path.join(fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=300, format='png')

# display all columns: a max_columns of None removes pandas' cap on how many columns are shown
pd.set_option('display.max_columns', None)

1. Getting URLs for individual sneaker items from catalog

In [None]:
# NOTE(review): all_data is not referenced anywhere else in the visible code - confirm before removing
all_data = []
def get_urls(url=base_url, pages=range(1, 25)):
    """Collect product links from the paginated sneaker catalog.

    Every catalog page (``url + '/sneakers?page=' + page number``) is
    requested, the ``href`` of each product tile is collected, and the links
    are written one per row to ``urls.csv`` in the working directory.

    Args:
        url: Site root that the catalog path is appended to.
        pages: Page numbers to request; the default covers pages 1 to 24.

    Returns:
        List of the collected ``href`` values, in the order they were found.
    """
    end_url = '/sneakers?page='
    headers = {'User-Agent': 'Opera/9.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.01', }
    # list that accumulates the product links
    urls = []

    for page in pages:
        # Build the page address from the ``url`` argument (it used to be
        # overwritten, so a caller-supplied root was silently ignored).
        page_url = url + end_url + str(page)

        # The timeout keeps a stalled connection from hanging the run.
        rsp = requests.get(page_url, headers=headers, timeout=30)
        if rsp.status_code == 200:
            body = bs4(rsp.text, 'html.parser')
            listings = body.find_all('div', "tile Tile-c8u7wn-0 bCufAv")

            for listing in listings:
                link = listing.a
                # Skip tiles without an anchor or href rather than crashing.
                if link is not None and link.get('href'):
                    urls.append(link['href'])
        else:
            print('catalog page', page, 'returned status', rsp.status_code)

        time.sleep(1)  # seconds; pause between catalog requests

    # write list to csv, one link per row
    with open('urls.csv', 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows([link] for link in urls)

    return urls

# run script: scrape the catalog (get_urls also writes urls.csv) and report how many links were found
urls = get_urls()
print(len(urls))


2. Getting SKU and product details from URLs

In [None]:
# Reload the product links saved in urls.csv (one link per row).
with open('urls.csv', 'r', newline='') as f:
    # csv.reader is the inverse of the csv.writer that produced the file.
    # Blank rows (such as the one after the final newline) are dropped so
    # that no empty link reaches the scraper.
    urls = [row[0] for row in csv.reader(f) if row and row[0]]
urls[0]

In [None]:
# go to individual product page url and save product detail into individual txt files
def get_SKU(urls):
    """Download each product page and save its product-view markup to disk.

    For every entry of ``urls`` the page at ``base_url + entry`` is
    requested. The ``div`` elements with class ``product-view`` found in the
    response are written, as text, to a ``.txt`` file under ``shoes/`` named
    after the entry without its first character. Scraping stops at the first
    response whose status code is not 200.

    Args:
        urls: Product links to append to ``base_url``; empty entries are
            skipped.
    """
    headers = {
        "referer": "https://stockx.com/",
        # A stray trailing apostrophe was removed from the end of this string.
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
        }

    # One session for the whole run: it is closed when the loop ends instead
    # of leaking a fresh, never-closed session per product.
    with requests.Session() as sess:
        for i, link in enumerate(urls):
            if not link:
                continue

            # build url
            item_url = base_url + link

            req = sess.get(item_url, headers=headers, timeout=30)
            fname = os.path.join('shoes', link[1:] + '.txt')
            print(i,' ', req.status_code,' ',fname)
            if req.status_code != 200:
                print('scrapping failed')
                break

            # Name the parser explicitly, as the catalog scraper does, so the
            # result does not depend on which parsers are installed.
            soup = bs4(req.text, 'html.parser')
            product = soup.find_all('div', 'product-view')

            # Make sure the target folder exists before writing.
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            with open(fname, "w") as f:
                f.write(str(product))
            time.sleep(3)  # seconds; pause between product requests


# run script: download and save the product page data for every link in urls
get_SKU(urls)


In [None]:
# parse each text file that contains product detail and sort those information into dataframe
product_columns = ['name', 'description', 'image_link', 'release_date', 'model', 'sku', 'color']
# column name -> key looked up in the saved page text
product_field_keys = {
    'name': 'name',
    'description': 'description',
    'image_link': 'image',
    'release_date': 'releaseDate',
    'model': 'model',
    'sku': 'sku',
    'color': 'color',
}
shoe_dir = os.path.join(os.getcwd(),'shoes')

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the table on every call.
product_rows = []
for filename in os.listdir(shoe_dir):
    file_path = os.path.join(shoe_dir, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r') as f:
        content = f.read()
    try:
        # Text between the first and second '{' of the saved markup.
        x = content.split('{')[1]
        # Each value is the text between '"key":' and the next comma, with its
        # first and last character (the surrounding quotes) dropped.
        row = {
            column: find_between(x, '"' + key + '":', ',')[1:-1]
            for column, key in product_field_keys.items()
        }
    except IndexError:
        # Raised when the file has no '{' or one of the keys is missing.
        print('Failed File: ', filename)
        continue
    product_rows.append(row)

# NOTE: rows now get a running 0..n-1 index; repeated appends gave every row index 0.
product_table = pd.DataFrame(product_rows, columns=product_columns)
product_table.to_csv('product_table.csv')
print(product_table.shape)
product_table.head()

3. Scraping Transaction data given SKU


In [None]:
def text_to_df(str):
    """Parse concatenated activity (transaction) records into a DataFrame.

    Every ``{...}`` span of the input becomes one row. A span is split on
    commas into ``key:value`` fragments and each fragment is split on its
    FIRST colon only, so values that contain colons themselves (for example
    timestamps) are kept whole instead of being cut at the second colon.

    Keys are the text before the colon with its first and last character
    dropped, and values are stored as raw text. Consequently the first key of
    a record keeps a leading quote (its fragment starts with ``{``), quoted
    values keep their quotes, and the last value keeps the closing ``}``.

    Args:
        str: Raw text containing zero or more ``{...}`` records. The
            parameter name is kept for compatibility with existing callers.

    Returns:
        DataFrame with one row per record and one column per key; an empty
        DataFrame when no record is found.

    Raises:
        IndexError: If a comma-separated fragment contains no colon.
    """
    text = str  # alias so the shadowed builtin name is not used below
    rows = []
    for record in re.findall(r'\{.*?\}', text):
        fields = {}
        for fragment in record.split(','):
            parts = fragment.split(':', 1)
            fields[parts[0][1:-1]] = parts[1]
        rows.append(fields)
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows)

In [None]:
def get_activity(sku, p_name,idx):
    """Scrape the paginated transaction activity of one product.

    Pages of the activity endpoint for ``sku`` are requested one after the
    other. From each response the text between the first ``[`` and the next
    ``]`` is collected, and paging continues until the ``nextPage`` field is
    ``null``. Paging also stops early, keeping what was collected so far,
    when a response is not status 200 or lacks the expected ``[`` or
    ``nextPage`` field.

    Args:
        sku: Product identifier inserted into the endpoint path.
        p_name: Product name, used for the referer header and progress output.
        idx: Row number of the product, printed with the progress output.

    Returns:
        DataFrame produced by ``text_to_df`` from the collected text.
    """
    headers = {
        "referer": base_url + '/' + p_name,
        # A stray trailing apostrophe was removed from the end of this string.
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
        }
    next_page_pattern = re.compile(r'"nextPage":.*?,')

    page = 1
    collected = []
    # One session for all pages, closed when paging ends.
    with requests.Session() as sess:
        while True:
            item_activity_url = base_url +'/api/products/'+sku + '/activity?state=480&currency=USD&limit=20000&page='+str(page)+'&sort=createdAt&order=DESC&country=US'

            req = sess.get(item_activity_url, headers=headers, timeout=30)
            print(idx,' ',req.status_code,' ', p_name)
            if req.status_code != 200:
                print('activity request failed on page', page, 'for', p_name)
                break

            content = req.text
            try:
                collected.append(find_between(content, '[', ']'))
            except IndexError:
                # No '[' in the body: nothing to parse on this page.
                print('no activity list found on page', page, 'for', p_name)
                break

            match = next_page_pattern.search(content)
            if match is None:
                # Without a nextPage field there is nothing to follow.
                break
            next_page = match.group(0).split(':')[1]

            # scrape until there is no next page; strip() tolerates
            # whitespace after the colon, which would otherwise loop forever.
            if next_page.strip() == 'null,':
                break
            page += 1

    return text_to_df(''.join(collected))

In [None]:
# loop through each product and get transaction data
product_table = pd.read_csv('product_table.csv')
start_index = 0  # raise this to resume an interrupted run from a later row

# Create the output folder up front so a missing directory does not make
# every save fail after the scraping work has already been done.
os.makedirs('activity', exist_ok=True)

for index,row in product_table.iterrows():
    if index >= start_index:
        sku, p_name = row['sku'], row['name']
        product_activity = get_activity(sku,p_name, index)
        product_activity['product_name'] = p_name
        try:
            product_activity.to_csv(os.path.join('activity',p_name+'activity.csv'))
        except OSError:
            # Covers FileNotFoundError as well as the plain OSError that newer
            # pandas raises when the target folder is missing (for example
            # when the product name contains a path separator).
            print('Failed File: ', p_name)

4. Example plot of Transaction Activity of a representative shoe

In [None]:
# Load the saved activity of one example product, then convert the amount to
# float and remove the quote characters around createdAt and shoeSize.
activity_df = pd.read_csv('adidas Yeezy Boost 350 V2 Sulfuractivity.csv')
activity_df = activity_df.assign(
    amount=activity_df['amount'].astype('float'),
    createdAt=activity_df['createdAt'].str.strip('\"'),
    shoeSize=activity_df['shoeSize'].str.strip('\"').astype('float'),
)
activity_df.head()

In [None]:
# convert the createdAt strings to pandas datetimes, then print the frame summary
activity_df['createdAt']=pd.to_datetime(activity_df['createdAt'])
activity_df.info()

In [None]:
# Sale amount over time for the example product, one line per shoe size.
plt.figure(figsize=(20,10))
sns.set_palette("husl")
chart = sns.lineplot(x='createdAt', y='amount', hue='shoeSize', palette='RdBu', data=activity_df)
# The title names the product whose activity file is loaded for this example
# (it previously named a different shoe).
plt.title('adidas Yeezy Boost 350 V2 Sulfur')
save_fig('activity_test_fig')



5. Combining transaction activity data of all shoes

In [None]:
def get_specialSize(shoeSize):
    """Split a shoe-size label into its leading part and a special-size tag.

    The first tag found in ``shoeSize`` decides the result; the returned size
    is everything before the first occurrence of that tag.

    Args:
        shoeSize: Size label as text, e.g. ``'7W'`` or ``'10.5'``.

    Returns:
        Tuple ``(size_text, tag)`` where ``tag`` is ``'2E Wide'``, ``'W'``,
        ``'Y'``, ``'K'`` or ``''`` when no tag is present.
    """
    # Checked in this order; '2E Wide' must precede 'W' because it contains it.
    special_tags = (
        '2E Wide',  # wide
        'W',        # women
        'Y',        # youth
        'K',        # toddler
    )
    for tag in special_tags:
        if tag in shoeSize:
            return shoeSize.partition(tag)[0], tag
    return shoeSize, ''



In [None]:
# loop through transaction activity text file of each product to parse and sort data into all_transactions_df
try:
    from tqdm import tqdm  # progress bar for the loop below
except ImportError:
    # tqdm is optional: fall back to plain iteration when it is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable

transaction_columns = ['product_name', 'shoe_size', 'createdAt', 'amount',
                       'localAmount', 'localCurrency', 'specialSize', 'chainId']
transaction_by_shoe = {}   # product name -> cleaned activity frame
transaction_frames = []    # cleaned frames, concatenated once after the loop
shoe_dir = os.path.join(os.getcwd(),'activity')
# The per-product output folder must exist before the first to_csv call.
os.makedirs('activity_df', exist_ok=True)

for filename in tqdm(os.listdir(shoe_dir)):
    try:
        activity_df = pd.read_csv(os.path.join(shoe_dir, filename))
        # Remove the quote characters left around the raw values.
        activity_df['createdAt'] = pd.to_datetime(activity_df['createdAt'].str.strip('"'))
        activity_df['shoeSize'] = activity_df['shoeSize'].str.strip('"')
        # The raw column name carries a leading quote.
        activity_df['chainId'] = activity_df['"chainId'].str.strip('"')
        # Strip quotes and the closing brace in one pass; stripping them one
        # after the other left a trailing quote behind the currency code.
        activity_df['localCurrency'] = activity_df['localCurrency'].str.strip('"}')
        activity_df['amount'] = activity_df['amount'].astype('float32')
        activity_df['localAmount'] = activity_df['localAmount'].astype('float32')

        # Split every size label into its numeric text and special-size tag.
        parsed_sizes = [get_specialSize(size) for size in activity_df['shoeSize']]
        activity_df['shoe_size'] = [size for size, _tag in parsed_sizes]
        activity_df['specialSize'] = [tag for _size, tag in parsed_sizes]
        activity_df['shoe_size'] = activity_df['shoe_size'].astype('float32')

        activity_df = activity_df[transaction_columns]
        # filename[:-12] drops the 12-character 'activity.csv' suffix.
        transaction_by_shoe[filename[:-12]] = activity_df
        activity_df.to_csv(os.path.join('activity_df',(filename[:-4] + '_df.csv')))
        transaction_frames.append(activity_df)
    except ValueError:
        print('Special shoe size caught: ', filename)
    except IndexError:
        print('Failed File (Index Error): ', filename)
    except KeyError:
        print('Failed File (Key Error): ', filename)

# Concatenate once: DataFrame.append was removed in pandas 2.0. An empty run
# still yields a frame with the expected columns.
if transaction_frames:
    all_transaction_df = pd.concat(transaction_frames)
else:
    all_transaction_df = pd.DataFrame(columns=transaction_columns)
all_transaction_df = all_transaction_df[transaction_columns]
all_transaction_df.to_csv('all_transactions_df.csv')



6. Combining Product detail and transaction activity of all products

In [None]:
# get product details
products = pd.read_csv('product_table.csv')
# Split the color text on its first '/' into two parts. reindex guarantees
# both parts exist even when no color contains a '/'.
color_parts = products['color'].str.split('/', n = 1, expand = True).reindex(columns=[0, 1])
products['color1'] = color_parts[0]
products['color2'] = color_parts[1]
products_df = products[['name', 'model', 'color1','color2', 'release_date','description']]
# Treat '--' as a missing value (presumably the site's placeholder - verify).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
products_df = products_df.replace('--', np.nan)
products_df['release_date']=pd.to_datetime(products_df['release_date'])
products_df.to_csv('prodcuts_df.csv')

In [None]:
# parse columns
products_df = pd.read_csv('prodcuts_df.csv')
# Read the file under the name it is saved with when the transactions are
# combined ('all_transactions_df.csv'); the previous name did not exist.
all_transaction_df = pd.read_csv('all_transactions_df.csv')
products_df['release_date'] = pd.to_datetime(products_df['release_date'], utc=True)
all_transaction_df['createdAt'] = pd.to_datetime(all_transaction_df['createdAt'], utc=True)
# Drop the index column that to_csv wrote as an unnamed first column.
products_df = products_df.drop(columns= ['Unnamed: 0'])
all_transaction_df = all_transaction_df.drop(columns = ['Unnamed: 0'])
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print().
products_df.info()
print()
all_transaction_df.info()



In [None]:
# merge products and activity dataframes; d_release column created to capture how long the product had been
# released when the transaction happened
all_trans_and_shoe = products_df.merge(all_transaction_df, left_on='name', right_on = 'product_name')
# original earliest release: 1985-09-16 00:00:00+00:00
# fill release date NAs with the start of the first year a sneaker was released.
# The column is converted first and filled with a timezone-aware Timestamp so
# the fill value matches the column type (the fill used to be applied twice,
# with a string).
all_trans_and_shoe['release_date'] = pd.to_datetime(all_trans_and_shoe['release_date'], utc=True)
all_trans_and_shoe['release_date'] = all_trans_and_shoe['release_date'].fillna(pd.Timestamp('1985-01-01', tz='UTC'))
all_trans_and_shoe['d_release'] = all_trans_and_shoe['createdAt']-all_trans_and_shoe['release_date']

In [None]:
# Use createdAt as the index (keeping it as a column too), sort by that index, then save
all_trans_and_shoe = all_trans_and_shoe.set_index('createdAt', drop=False).sort_index()
all_trans_and_shoe.to_csv('all_trans_and_shoe.csv')
all_trans_and_shoe.head()