In [None]:
from IPython.core.display import HTML

# Inject a CSS snippet into the notebook page: it enlarges the font and line
# height of rendered text cells, code editor lines and outputs, and sets the
# `.container` width to 80%. The HTML object is the cell's last expression,
# so it is rendered as the cell output.
HTML("""
<style>
    div.text_cell_render, .CodeMirror pre, div.output {
        font-size: 1.2em;
        line-height: 1.2em;
    }
    .container {
        width: 80%;
    }
</style>
""")

# Poshmark Pipeline MVP

The goal of this pipeline is to scrape data from Poshmark, process it, store it, and visualize it. 

1. Scrape listings for each brand in the `brands` list (currently 13 brands)
2. Store using pickle
3. Format data and engineer features
4. Store using pickle
5. Explore and visualize using pandas

In [None]:
# Imports
import os
import pickle
import logging
from glob import glob
from requests import get
from datetime import datetime, date
from time import sleep
from functools import reduce

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
from dateutil.parser import parse
from inflection import parameterize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Configure logging
# Log records go to scraping.log, which is truncated on every run (filemode='w').
# The level must be set explicitly: the root logger defaults to WARNING, which
# would silently drop the logging.info() progress messages used while scraping.
logging.basicConfig(filename='scraping.log',
                    filemode='w',
                    level=logging.INFO,
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

In [None]:
# Shared configuration values
# HEADER: request headers carrying a desktop Chrome user-agent string.
HEADER = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
}
# TODAY: the current date rendered as MM_DD_YYYY.
TODAY = format(date.today(), "%m_%d_%Y")

In [None]:
# Functions to download web pages
def download_page(url):
    "Fetch a URL with the shared HEADER and hand back the response object"
    # NOTE(review): no timeout is passed, so a stalled server may block this call.
    return get(url, headers=HEADER)

def headless_download_page(url, headless=False, wait_seconds=5):
    """Load a URL in a Firefox WebDriver session and return the page source.

    Args:
        url: Address of the page to load.
        headless: When True, start Firefox without a visible window. Defaults
            to False, which matches the previous behaviour (the headless
            switch was commented out).
        wait_seconds: Pause after navigation before the page source is read,
            giving the page time to finish rendering.

    Returns:
        The page source as reported by the driver.
    """
    options = Options()
    if headless:
        options.add_argument('-headless')

    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        sleep(wait_seconds)
        return driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a Firefox
        # process running.
        driver.quit()

def collect_pages(brand, max_pages=10):
    """Collect the result pages for one brand's men's jeans listings.

    Pages are requested one after another until either the last pagination
    button is disabled, the page shows no pagination buttons at all, or
    ``max_pages`` pages have been downloaded.

    Args:
        brand: Brand slug inserted into the Poshmark brand URL.
        max_pages: Upper bound on the number of pages fetched (default 10).

    Returns:
        A list with the HTML source of every downloaded page, in order.
    """
    pages = []

    for page_number in range(1, max_pages + 1):
        url = f"https://poshmark.com/brand/{brand}-Men-Jeans?availability=all&sort_by=added_desc&max_id={page_number}"
        print(url)
        page = headless_download_page(url)
        pages.append(page)
        sleep(5)

        soup = BeautifulSoup(page, 'html.parser')
        btns = soup.find_all('button', class_='btn--pagination')

        # No pagination buttons means there is nothing further to page through;
        # indexing btns[-1] on an empty result would raise IndexError.
        if not btns or btns[-1].has_attr('disabled'):
            break

    return pages

In [None]:
# Functions to extract data from HTML
def create_soup(source):
    "Parse HTML source with the 'html.parser' backend and return the BeautifulSoup tree"
    return BeautifulSoup(source, features='html.parser')

def extract_tiles(soup):
    "Return every div element carrying the 'tile' class from the parsed page"
    return soup.find_all('div', class_='tile')

def extract_title(tile):
    """Extract the title string from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``a.tile__title`` element, or '' when the
        tile has no such element.
    """
    try:
        title = tile.find('a', class_='tile__title').get_text(strip=True)
    except AttributeError:
        # find() returned None: the element is missing from this tile.
        title = ''

    return title

def extract_status(tile):
    """Extract the status tag text from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``span.condition-tag`` element, or '' when
        the tile has no such element.
    """
    try:
        status = tile.find('span', class_='condition-tag').get_text(strip=True)
    except AttributeError:
        # find() returned None: the tile carries no condition tag.
        status = ''

    return status

def extract_stock(tile):
    """Extract the stock status text from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``i.sold-tag`` element, or '' when the tile
        has no such element.
    """
    try:
        stock = tile.find('i', class_='sold-tag').get_text(strip=True)
    except AttributeError:
        # find() returned None: the tile carries no sold tag.
        stock = ''

    return stock

def extract_price(tile):
    """Extract the raw price text from a tile.

    The value is returned as text; conversion to a number happens later in
    the formatting step.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``span[data-test="tile-price"]`` element, or
        '' when the tile has no such element.
    """
    try:
        price = tile.find('span', attrs={'data-test': 'tile-price'}).get_text(strip=True)
    except AttributeError:
        # find() returned None: the tile shows no price element.
        price = ''

    return price

def extract_size(tile):
    """Extract the raw size text from a tile.

    The value is returned as text; conversion to a number happens later in
    the formatting step.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``a[data-test="tile-size"]`` element, or ''
        when the tile has no such element.
    """
    try:
        size = tile.find('a', attrs={'data-test': 'tile-size'}).get_text(strip=True)
    except AttributeError:
        # find() returned None: the tile shows no size element.
        size = ''

    return size

def extract_brand(tile):
    """Extract the brand string from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the ``a[data-test="tile-brand"]`` element, or ''
        when the tile has no such element.
    """
    try:
        brand = tile.find('a', attrs={'data-test': 'tile-brand'}).get_text(strip=True)
    except AttributeError:
        # find() returned None: the tile shows no brand element.
        brand = ''

    return brand

def extract_link(tile):
    """Extract the listing link from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The ``href`` of the ``a.tile__title`` element, or '' when either the
        element or its ``href`` attribute is missing.
    """
    try:
        # .get() yields None for a missing attribute; normalise that to '' so
        # the return value is always a string.
        link = tile.find('a', class_='tile__title').get('href') or ''
    except AttributeError:
        # find() returned None: the tile has no title link.
        link = ''

    return link

def extract_image(tile):
    """Extract the image link from a tile.

    Args:
        tile: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The ``data-src`` attribute of the first ``img`` element, or '' when
        either the element or that attribute is missing.
    """
    try:
        # .get() yields None for a missing attribute; normalise that to '' so
        # the return value is always a string.
        image = tile.find('img').get('data-src') or ''
    except AttributeError:
        # find() returned None: the tile has no img element.
        image = ''

    return image
    
def extract_date(url):
    """Extract the posting date embedded in a URL.

    The previous implementation took the ten characters following the first
    occurrence of "20" anywhere in the URL, so a "20" in the host name or in
    an id was mistaken for the date. This version looks for a full date
    pattern instead.

    Args:
        url: URL string, expected to contain a date such as ``2020/05/17``.
            NOTE(review): the ``20YY<sep>MM<sep>DD`` layout is inferred from
            the original ten-character slice - confirm against real URLs.

    Returns:
        The ten-character date substring, or '' when ``url`` is not a string
        or contains no date.
    """
    import re  # local import: only this function needs it

    if not isinstance(url, str):
        return ''

    match = re.search(r'20\d{2}[/_.-]\d{2}[/_.-]\d{2}', url)
    return match.group(0) if match else ''

def combine_data(tile):
    "Build one raw record dict for a tile by calling every extract_* helper"
    # The date is derived from the image URL, so the image is pulled out first.
    image = extract_image(tile)

    record = {
        'title': extract_title(tile),
        'status': extract_status(tile),
        'stock': extract_stock(tile),
        'price': extract_price(tile),
        'size': extract_size(tile),
        'brand': extract_brand(tile),
        'link': extract_link(tile),
        'image': image,
        'date': extract_date(image),
    }
    return record

In [None]:
# Functions to format the data
def format_price(price_value):
    """Convert a scraped price string such as '$25' or '$1,200' to an int.

    Args:
        price_value: Raw price text.

    Returns:
        The price as an int, or ``np.nan`` when the value is not a string or
        does not contain a plain integer amount.
    """
    try:
        # Drop the currency sign and thousands separators before converting;
        # previously '$1,200' failed int() and was silently turned into NaN.
        price = int(price_value.replace('$', '').replace(',', ''))
    except (AttributeError, ValueError):
        # AttributeError: not a string (e.g. None); ValueError: not an integer.
        price = np.nan

    return price

def format_size(size_value):
    """Convert a scraped size string such as 'Size: 32' to an int.

    Args:
        size_value: Raw size text.

    Returns:
        The size as an int, or ``np.nan`` when the value is not a string or
        the remaining text is not a plain integer (e.g. 'Size: M').
    """
    try:
        size = int(size_value.replace('Size: ', ''))
    except (AttributeError, ValueError):
        # AttributeError: not a string (e.g. None); ValueError: non-numeric size.
        size = np.nan

    return size

def format_brand(brand_value):
    """Normalise a brand name into a universal format.

    Args:
        brand_value: Raw brand text.

    Returns:
        The result of ``parameterize(brand_value, '_')``, or '' when the value
        cannot be parameterized.
    """
    try:
        brand = parameterize(brand_value, '_')
    except Exception:
        # Best-effort fallback for unusable input. `Exception` rather than a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        brand = ''

    return brand

def format_link(link_value):
    """Prefix a relative listing link with the Poshmark domain.

    Args:
        link_value: Relative link (path) scraped from a tile.

    Returns:
        The absolute URL, or '' when the link is missing. Previously an empty
        link produced the bare domain, which looked like a valid listing URL.
    """
    if not link_value:
        return ''

    try:
        link = 'http://www.poshmark.com' + link_value
    except TypeError:
        # Non-string value that cannot be concatenated.
        link = ''

    return link

def format_date(date_value):
    """Convert a date string to a datetime.

    Args:
        date_value: Date text handed to ``dateutil.parser.parse``.

    Returns:
        The parsed datetime, or ``np.nan`` when the value cannot be parsed.
    """
    try:
        date = parse(date_value)
    except (TypeError, ValueError, OverflowError):
        # Documented parse() failures: ValueError (ParserError) for unparsable
        # text, TypeError for non-string input, OverflowError for huge numbers.
        date = np.nan

    return date

def format_record(record):
    "Apply each format_* helper to its field, updating the record in place, and return it"
    formatters = (
        ('price', format_price),
        ('size', format_size),
        ('brand', format_brand),
        ('link', format_link),
        ('date', format_date),
    )
    for field, formatter in formatters:
        record[field] = formatter(record[field])

    return record

In [None]:
# Functions to create new features
def find_diff(date, now=None):
    """Find the number of whole days between a listing date and now.

    The absolute value is taken on the timedelta itself, before reading
    ``.days``. Reading ``.days`` from a negative timedelta floors away from
    zero, which previously counted a listing posted a few hours ago as one
    day old and over-counted every past listing by one day.

    Args:
        date: Listing datetime.
        now: Reference datetime; defaults to ``datetime.now()``.

    Returns:
        The number of whole days as an int, or ``np.nan`` when ``date`` is not
        a usable datetime (for example NaN or None).
    """
    if now is None:
        now = datetime.now()

    try:
        diff = abs(date - now).days
    except (TypeError, AttributeError):
        # TypeError: value cannot be subtracted from a datetime;
        # AttributeError: the difference has no `.days`.
        diff = np.nan

    return diff

def calculate_length(title):
    """Find the length of the title.

    Args:
        title: Listing title text.

    Returns:
        ``len(title)``, or ``np.nan`` when the value has no length (for
        example None or NaN).
    """
    try:
        length = len(title)
    except TypeError:
        # len() raises TypeError for objects without a length.
        length = np.nan

    return length

def identify_condition(status):
    """Create a boolean value for the condition status.

    Args:
        status: Status text scraped from a tile ('' when the tile had none).

    Returns:
        ``bool(status)`` - True for any truthy status - or False when the
        truth value cannot be determined.
        NOTE(review): any non-empty status counts as True, whatever its text;
        confirm this is intended for the 'nwt' feature.
    """
    try:
        condition = bool(status)
    except Exception:
        # Objects whose truth value is undefined; `Exception` rather than a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        condition = False

    return condition

def check_stock(stock):
    """Create a boolean value for the stock status.

    Args:
        stock: Sold-tag text scraped from a tile ('' when the tile had none).

    Returns:
        ``bool(stock)`` - True for any truthy value - or False when the truth
        value cannot be determined.
    """
    try:
        condition = bool(stock)
    except Exception:
        # Objects whose truth value is undefined; `Exception` rather than a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        condition = False

    return condition

def create_features(record):
    "Add the derived fields diff, length, nwt and sold to the record in place and return it"
    derived = (
        ('diff', find_diff, 'date'),
        ('length', calculate_length, 'title'),
        ('nwt', identify_condition, 'status'),
        ('sold', check_stock, 'stock'),
    )
    for target, builder, source in derived:
        record[target] = builder(record[source])

    return record

In [None]:
# Extract raw data for multiple brands
# Brand slugs exactly as they are inserted into the Poshmark brand URL.
brands = [
    'J._Crew', 'Naked_&_Famous_Denim', "Levi's",
    'Diesel', 'Hugo_Boss', 'Mavi', 'Big_Star',
    'Lucky_Brand', "Joe's_Jeans", 'True_Religion',
    'Wrangler', 'Gap', 'Uniqlo'
    ]

# Make sure the output directory exists before the first dump.
os.makedirs('./data/raw', exist_ok=True)

for tag in brands:
    print('Scraping', tag)
    pages = collect_pages(tag)
    soup_objs = [create_soup(page) for page in pages]
    item_tiles = [extract_tiles(soup) for soup in soup_objs]
    # Flatten the per-page tile lists into a single list.
    combined_tiles = [tile for tiles in item_tiles for tile in tiles]
    item_objs = [combine_data(tile) for tile in combined_tiles]

    brand_name = parameterize(tag, '_')
    # The context manager guarantees the pickle is flushed and closed.
    with open(f"./data/raw/{brand_name}_{TODAY}.p", 'wb') as raw_file:
        pickle.dump(item_objs, raw_file)
    logging.info(f"Scraped {tag} page, found {len(item_objs)} items")

    # Short pause between brands.
    sleep(2)

In [None]:
# Process raw data
# Each raw pickle is loaded, every record is formatted, and the result is
# written under the same file name to ./data/intermediate/.
# NOTE: pickle.load executes code embedded in the file - only load pickles
# produced by this notebook.
files = glob("./data/raw/*.p")
os.makedirs('./data/intermediate', exist_ok=True)

for f in files:
    with open(f, 'rb') as raw_file:
        store = pickle.load(raw_file)
    f_store = [format_record(item) for item in store]
    file_name = os.path.basename(f)
    with open(f"./data/intermediate/{file_name}", 'wb') as out_file:
        pickle.dump(f_store, out_file)

In [None]:
# Add new features
# Each intermediate pickle is loaded, derived features are added to every
# record, and the result is written under the same file name to ./data/processed/.
# NOTE: pickle.load executes code embedded in the file - only load pickles
# produced by this notebook.
files = glob("./data/intermediate/*.p")
os.makedirs('./data/processed', exist_ok=True)

for f in files:
    with open(f, 'rb') as in_file:
        store = pickle.load(in_file)
    f_store = [create_features(item) for item in store]
    file_name = os.path.basename(f)
    with open(f"./data/processed/{file_name}", 'wb') as out_file:
        pickle.dump(f_store, out_file)

In [None]:
# Examine the data
# Gather the records from every processed pickle into one flat list.
# NOTE: pickle.load executes code embedded in the file - only load pickles
# produced by this notebook.
files = glob("./data/processed/*.p")
full_store = []

for f in files:
    with open(f, 'rb') as processed_file:
        store = pickle.load(processed_file)
    full_store.extend(store)

In [None]:
# Build the combined frame from the record dicts and summarise it.
df = pd.DataFrame(full_store)
# info() prints its report itself and returns None, so wrapping it in print()
# added a stray "None" line to the output.
df.info()
# Last expression of the cell: rendered with the notebook's rich table display.
df.head()

In [None]:
# Show the distinct values in the brand column.
df['brand'].unique()

In [None]:
# Keep the brand label together with the numeric columns used for the summaries.
numeric_df = df.loc[:, ['brand', 'price', 'size', 'diff', 'length']]
numeric_df.head()

In [None]:
# Check for extreme values: summary statistics (count, mean, std, min,
# quartiles, max) for the numeric columns.
numeric_df.describe()

In [None]:
# Compare medians by brand
# Columns are selected with a list: selecting several groupby columns with a
# bare tuple (['price', 'diff', 'length'] without the inner brackets) is
# rejected by pandas 2.x.
numeric_df.groupby('brand')[['price', 'diff', 'length']].median().reset_index().rename(
    columns={'brand':'Brand', 'price':'Price', 'diff':'Days Listed', 'length':'Title Length'})

In [None]:
# Write both frames to CSV files, leaving the index out
df.to_csv(path_or_buf='./data/processed/source_data.csv', index=False)
numeric_df.to_csv(path_or_buf='./data/processed/numeric_data.csv', index=False)

## Visualizing the distributions

Use `matplotlib` to plot and analyze the distributions in our data.

In [None]:
# Reload the exported source data from CSV for the plotting section.
df = pd.read_csv(filepath_or_buffer='./data/processed/source_data.csv')

In [None]:
# Print column names, non-null counts and dtypes of the reloaded frame.
df.info()

In [None]:
# Distribution of prices across all listings
df['price'].plot(kind='hist', bins=12, alpha=0.5);

In [None]:
# One price histogram per brand.
# Missing brands are dropped first: a NaN key never equals itself, so it would
# select an empty subset and produce a meaningless figure.
distinct_keys = df['brand'].dropna().unique()
for key in distinct_keys:
    plt.figure();
    df_subset = df[df.brand==key]
    df_subset['price'].plot.hist(bins=20, alpha=0.2, title=key);

In [None]:
# Histogram of the `diff` column (days listed) across all listings
df['diff'].plot(kind='hist', bins=12, alpha=0.5);

In [None]:
# One days-listed histogram per brand.
# Missing brands are dropped first: a NaN key never equals itself, so it would
# select an empty subset and produce a meaningless figure.
distinct_keys = df['brand'].dropna().unique()
for key in distinct_keys:
    plt.figure();
    df_subset = df[df.brand==key]
    df_subset['diff'].plot.hist(bins=12, alpha=0.2, title=key);

In [None]:
# Histogram of the `length` column (title length) across all listings
df['length'].plot(kind='hist', bins=12, alpha=0.5);

In [None]:
# Restrict the data to rows whose `sold` flag is True and summarise them.
sold_df = df.loc[df['sold'].eq(True)]
sold_df.info()

In [None]:
# Distribution of prices among sold listings
sold_df['price'].plot(kind='hist', bins=12, alpha=0.5);

In [None]:
# One sold-price histogram per brand.
# Missing brands are dropped first: a NaN key never equals itself, so it would
# select an empty subset and produce a meaningless figure.
distinct_keys = sold_df['brand'].dropna().unique()
for key in distinct_keys:
    plt.figure();
    df_subset = sold_df[sold_df.brand==key]
    df_subset['price'].plot.hist(bins=12, alpha=0.2, title=key);

## Modeling 

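Create a basic model to estimate the discount percentage: for each brand, compare the median price of sold items with the median price of items that are still listed.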

In [None]:
# Start the modeling section from the exported CSV and summarise its columns.
df = pd.read_csv(filepath_or_buffer='./data/processed/source_data.csv')
df.info()

In [None]:
# Narrow the frame to the brand label, the numeric features and the two flags.
df2 = df.loc[:, ['brand', 'price', 'size', 'diff', 'length', 'nwt', 'sold']]
df2.head()

In [None]:
# Median price, days listed and title length per brand for unsold listings.
listed_df = df[df['sold'] == False]
# Columns are selected with a list: a bare tuple of column names on a groupby
# is rejected by pandas 2.x.
listed_agg = listed_df.groupby('brand')[['price', 'diff', 'length']].median().reset_index().rename(
    columns={'brand':'Brand', 'price':'Listed Price', 'diff':'Days Listed', 'length':'Title Length'})
listed_agg

In [None]:
# Median price, days listed and title length per brand for sold listings.
sold_df = df[df['sold'] == True]
# Columns are selected with a list: a bare tuple of column names on a groupby
# is rejected by pandas 2.x.
sold_agg = sold_df.groupby('brand')[['price', 'diff', 'length']].median().reset_index().rename(
    columns={'brand':'Brand', 'price':'Sold Price', 'diff':'Days Listed', 'length':'Title Length'})
sold_agg

In [None]:
# Inner-join the per-brand listed and sold median prices on the Brand column.
merged_inner = listed_agg[['Brand', 'Listed Price']].merge(
    sold_agg[['Brand', 'Sold Price']],
    on='Brand',
)

In [None]:
# Percentage by which the median sold price sits below the median listed price.
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 left floating-point noise such as 28.000000000000004.
merged_inner['Extra Discount'] = (
    (1 - merged_inner['Sold Price'] / merged_inner['Listed Price']) * 100
).round()

In [None]:
# Display the per-brand listed price, sold price and extra discount table.
merged_inner