In [None]:
from IPython.core.display import HTML

HTML("""
<style>
    div.text_cell_render, .CodeMirror pre, div.output {
        font-size: 1.2em;
        line-height: 1.2em;
    }
    .container {
        width: 80%;
    }
</style>
""")

# Scraping Poshmark

This notebook walks through how to scrape listings from [Poshmark.com](https://poshmark.com/). Poshmark is a social commerce platform where people buy and sell new and used clothing, shoes and accessories.

![preview](images/poshmark-preview.png)


<br>

## Webpages 101

Websites are built using HTML and CSS. HTML provides the layout for websites. CSS provides the styling like font sizes, colors and spacing.

Scraping takes advantage of the inherent structure of webpages. We find data by using the repeating HTML elements and CSS classes on pages.

CSS classes are reusable style rules applied to components that should look alike. For example, the item cards on Poshmark all have exactly the same look and feel, so their HTML code looks very similar.

Check out: [Diesel Jeans](https://poshmark.com/brand/Diesel-Men-Jeans?sort_by=added_desc) and use the Google Chrome Inspector.
<br><br>

![Diesel Code](images/diesel-code.png)

<br>

## Scrape the denim listings

We'll use `requests` to pull down the website. Then, we'll print out the response variable which contains the HTML code.

In [None]:
from requests import get

url = "https://poshmark.com/brand/Diesel-Men-Jeans?sort_by=added_desc"
response = get(url)
print(response.text[:500])

We can use `beautifulsoup` to parse the raw HTML. This is a package specially made for accessing HTML elements.

In [None]:
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

We can use built-in methods to search for the repeating tiles in the markup. We take advantage of CSS classes here.

In [None]:
clothing_containers = html_soup.find_all('div', class_ = 'tile')
print(type(clothing_containers))
print(len(clothing_containers))

Let's take a look at the first tile.

In [None]:
first_tile = clothing_containers[0]
print(first_tile)

`BeautifulSoup` has a `prettify` method which allows us to make HTML code more readable with proper indentation.

In [None]:
print(first_tile.prettify())

## Extract the values

Now that we have the card isolated, we can extract the specific data points:
- Item Title
- Item Price
- Item Size
- Item Brand
- Item Page Link
- Item Image Link

In [None]:
# Item Title
first_title = first_tile.find('a', class_='tile__title')
print(first_title)

In [None]:
first_title = first_tile.find('a', class_='tile__title').get_text()
print(first_title)

In [None]:
first_title = first_tile.find('a', class_='tile__title').get_text(strip=True)
print(first_title)

<br>

In [None]:
# Item Price
first_price = first_tile.find('span', class_="fw--bold")
print(first_price)

In [None]:
first_price = first_tile.find('span', class_="fw--bold").get_text(strip=True)
print(first_price)

<br>

In [None]:
# Item Size
first_size = first_tile.find('a', class_="tile__details__pipe__size").get_text(strip=True)
print(first_size)

<br>

In [None]:
# Item Brand
first_brand = first_tile.find('a', class_="tile__details__pipe__brand").get_text(strip=True)
print(first_brand)

<br>

In [None]:
# Item Detail URL
first_link = first_tile.find('a', class_='tile__title').get('href')
print(first_link)

In [None]:
first_link = 'http://www.poshmark.com' + first_tile.find('a', class_='tile__title').get('href')
print(first_link)

<br>

In [None]:
# Item Image URL
first_image = first_tile.find('img')
print(first_image)

In [None]:
first_image = first_tile.find('img').get('src')
print(first_image)

<br>

## Format the data

We scraped the raw data. However, all the data are strings. We need to convert them to the appropriate types.

In [None]:
print('Title: ', first_title)
print('Price: ', first_price)
print('Size: ', first_size)
print('Brand: ', first_brand)
print('Link: ', first_link)
print('Image: ', first_image)

The title and links can stay the same. However, we will need to convert price and size to `int`.

In [None]:
fixed_price = first_price.replace('$', '')
print(type(fixed_price))
print(fixed_price)

In [None]:
# Drop the thousands separator as well as the dollar sign, so that a price
# such as '$1,200' converts cleanly instead of raising ValueError.
fixed_price = int(first_price.replace('$', '').replace(',', ''))
print(type(fixed_price))
print(fixed_price)

In [None]:
fixed_size = int(first_size.replace('Size: ', ''))
print(type(fixed_size))
print(fixed_size)

<br>

## Extracting a new feature

If you take a close look at the image URL, you can see that the path shows when the posting was created. Let's extract that.

In [None]:
import re

# Search for the whole YYYY/MM/DD pattern rather than the literal '2020', so
# this works for a listing posted in any year. As with str.find, start_idx is
# -1 when the URL contains no date.
date_match = re.search(r'20\d{2}[/-]\d{2}[/-]\d{2}', first_image)
start_idx = date_match.start() if date_match else -1
print(start_idx)

In [None]:
end_idx = start_idx + 10
raw_date = first_image[start_idx:end_idx]
print(raw_date)

In [None]:
from dateutil.parser import parse

first_date = parse(raw_date)
print(first_date)

We can find the approximate number of days the item has been listed.

In [None]:
from datetime import datetime

now = datetime.now()
# abs() goes around the timedelta itself: .days of a negative timedelta rounds
# down (-3 days 5 hours has .days == -4), so abs((first_date-now).days) would
# overcount by one day.
diff = abs(first_date - now).days
print(diff)

<div class="alert alert-info">

<b>Note:</b> In a professional workflow, you would create separate versions of the data.

<br>

<ol>
    <li>Raw scraped data</li>
    <li>Type formatted data</li>
    <li>Data with new features</li>
</ol>

</div>

<br>

# 5 minute break

<br>

## Refactor code, create functions

Here we'll refactor our code and create functions to extract all the data. A "good" function follows these guidelines:

- Is sensibly named
- Has a single responsibility
- Includes a docstring
- Returns a value
- Is not longer than 50 lines

In [None]:
from dateutil.parser import parse
from datetime import datetime

def download_page(url, timeout=10):
    """Download the HTML source for a given URL.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` waits indefinitely and a stalled connection
            would hang the notebook.

    Returns:
        The response object returned by `get`; the markup is in its `.text`
        attribute. The HTTP status code is not checked here.
    """
    response = get(url, timeout=timeout)
    return response

def create_soup(source):
    """Parse a downloaded page into a BeautifulSoup object.

    `source` is a response whose `.text` attribute holds the raw HTML; it is
    parsed with the 'html.parser' backend.
    """
    return BeautifulSoup(source.text, 'html.parser')

def extract_tiles(soup):
    """Return every `<div>` in the parsed page that carries the 'tile' class."""
    return soup.find_all('div', class_='tile')

def extract_title(tile):
    """Return a tile's listing title with surrounding whitespace removed."""
    title_anchor = tile.find('a', class_='tile__title')
    return title_anchor.get_text(strip=True)

def extract_price(tile):
    """Extract the price of a tile as an integer number of dollars.

    The price is read from the first `<span>` with the 'fw--bold' class, for
    example '$45'. The dollar sign and any thousands separators are removed
    before conversion, so '$1,200' becomes 1200 instead of raising.

    Raises:
        AttributeError: If the tile has no matching price element.
        ValueError: If the remaining text is not a whole number.
    """
    price_string = tile.find('span', class_="fw--bold").get_text(strip=True)
    # int() rejects '1,200', so strip the separator along with the dollar sign.
    price = int(price_string.replace('$', '').replace(',', ''))
    return price

def extract_size(tile):
    """Return the size shown on a tile as an integer.

    The size link reads like 'Size: 32'; the label is dropped and the rest is
    converted with `int`, so a non-numeric size raises `ValueError`.
    """
    size_anchor = tile.find('a', class_="tile__details__pipe__size")
    size_label = size_anchor.get_text(strip=True)
    return int(size_label.replace('Size: ', ''))

def extract_brand(tile):
    """Return the brand name shown on a tile, stripped of whitespace."""
    brand_anchor = tile.find('a', class_="tile__details__pipe__brand")
    return brand_anchor.get_text(strip=True)

def extract_link(tile):
    """Extract the absolute URL of a tile's listing page.

    The href on the title link is resolved against the Poshmark base URL.
    Site-relative paths such as '/listing/abc' give the same result as plain
    concatenation, while an href that is already absolute is returned as-is
    instead of being glued onto the base.

    Raises:
        AttributeError: If the tile has no title link.
        TypeError: If the title link has no href attribute.
    """
    from urllib.parse import urljoin

    partial_link = tile.find('a', class_='tile__title').get('href')
    if partial_link is None:
        # urljoin would silently return the bare base URL for a missing href.
        raise TypeError('tile title link has no href')
    link = urljoin('http://www.poshmark.com', partial_link)
    return link

def extract_image(tile):
    """Extract the image URL from a tile.

    Prefers the 'data-src' attribute and falls back to 'src' when 'data-src'
    is missing, so the function agrees with the walkthrough earlier in this
    notebook, which reads the first tile's image from 'src'.

    Returns:
        The image URL string, or None if the image carries neither attribute.

    Raises:
        AttributeError: If the tile contains no `<img>` element.
    """
    image_tag = tile.find('img')
    # NOTE(review): presumably lazy-loaded tiles keep the URL in 'data-src'
    # and eagerly loaded ones in 'src' - confirm against the live markup.
    image = image_tag.get('data-src') or image_tag.get('src')
    return image
    
def extract_date(url):
    """Extract the posting date embedded in an image URL.

    Looks for a 'YYYY/MM/DD' (or 'YYYY-MM-DD') segment with a year in the
    range 2000-2099, for example '.../posts/2020/05/04/...'. Matching the
    whole pattern, rather than the first occurrence of '20', keeps an
    unrelated '20' earlier in the URL from being mistaken for the date.

    Args:
        url: Image URL string, as returned by `extract_image`.

    Returns:
        A `datetime` at midnight of the posting date.

    Raises:
        ValueError: If `url` is empty or contains no valid date.
    """
    import re

    if not url:
        raise ValueError('no image URL to extract a date from')
    match = re.search(r'(?<!\d)(20\d{2})[/-](\d{2})[/-](\d{2})(?!\d)', url)
    if match is None:
        raise ValueError(f'no YYYY/MM/DD date found in {url!r}')
    year, month, day = (int(part) for part in match.groups())
    # datetime() itself rejects impossible dates such as month 13.
    date = datetime(year, month, day)
    return date

def find_difference(date, now=None):
    """Find the number of whole days between a posting date and now.

    Args:
        date: `datetime` the item was posted.
        now: Reference `datetime` to measure against; defaults to the
            current local time.

    Returns:
        Non-negative count of complete days separating the two moments.
    """
    if now is None:
        now = datetime.now()
    # Take abs() of the timedelta, not of its .days: a negative timedelta
    # rounds .days down (-3 days 5 hours has .days == -4), which would
    # overcount by one day.
    diff = abs(date - now).days
    return diff

In [None]:
def _extract_or_blank(extractor, tile):
    "Run one extractor on a tile, returning '' if it raises"
    try:
        return extractor(tile)
    except Exception:
        # Exception, not a bare except: KeyboardInterrupt and SystemExit must
        # still be able to stop a long scraping run.
        return ''


def combine_data(tile):
    """Run the independent extractors on a tile and collect all values.

    Each extractor runs in isolation, so one missing or malformed field does
    not discard the rest of the listing. A field that cannot be extracted is
    stored as an empty string.

    Returns:
        A dict with the keys 'title', 'price', 'size', 'brand', 'link',
        'image', 'date' and 'difference'.
    """
    title = _extract_or_blank(extract_title, tile)
    price = _extract_or_blank(extract_price, tile)
    size = _extract_or_blank(extract_size, tile)
    brand = _extract_or_blank(extract_brand, tile)
    link = _extract_or_blank(extract_link, tile)
    image = _extract_or_blank(extract_image, tile)

    # The date is derived from the image URL and the difference from the
    # date, so the two are blanked together if either step fails.
    try:
        date = extract_date(image)
        difference = find_difference(date)
    except Exception:
        date = ''
        difference = ''

    return {
        'title': title,
        'price': price,
        'size': size,
        'brand': brand,
        'link': link,
        'image': image,
        'date': date,
        'difference': difference
    }

## Extract all tiles on initial page

Now we can use the function to extract all the data from the page.

In [None]:
url = "https://poshmark.com/brand/Naked_&_Famous_Denim-Men-Jeans"

page = download_page(url)
soup_obj = create_soup(page)
item_tiles = extract_tiles(soup_obj)
item_objs = [combine_data(tile) for tile in item_tiles]

In [None]:
print(len(item_objs))

In [None]:
print(item_objs[0])
print('-------')
print(item_objs[45])

In [None]:
from pprint import pprint

pprint(item_objs[0])

<div class="alert alert-warning">

<b>Note:</b> There are hundreds of listings, but we can only scrape the first 48.

<br>

<p>Modern websites use JavaScript to load additional results to prevent long initial load times. Our initial page download only includes the first set of listings. If you want to extract all the listings, you will have to use a headless browser. It creates a Chrome/Firefox instance in the background to mimic a real page visit.</p>

<br>

<p>That is a bit more complicated and out of scope for now. I can demonstrate it in a future talk.</p>

</div>

## Extract other denim brands

Let's extract the data for other denim brands as well.

In [None]:
brands = ['J._Crew', 'Naked_&_Famous_Denim', "Levi's", 'Diesel']
store = []

for tag in brands:
    url = f"https://poshmark.com/brand/{tag}-Men-Jeans"
    page = download_page(url)
    soup_obj = create_soup(page)
    item_tiles = extract_tiles(soup_obj)
    item_objs = [combine_data(tile) for tile in item_tiles]
    store.append(item_objs)

In [None]:
print(len(store))
print(len(store[0]))

In [None]:
brands = ['J._Crew', 'Naked_&_Famous_Denim', "Levi's", 'Diesel']
store = []

for tag in brands:
    url = f"https://poshmark.com/brand/{tag}-Men-Jeans"
    page = download_page(url)
    soup_obj = create_soup(page)
    item_tiles = extract_tiles(soup_obj)
    item_objs = [combine_data(tile) for tile in item_tiles]
    store.extend(item_objs)

In [None]:
print(len(store))
print(len(store[0]))

## Examining the scraped data

You can bring the data into `pandas` for further examination.

In [None]:
import pandas as pd

df = pd.DataFrame(store)

# combine_data stores missing values as '', which leaves these columns with
# object dtype. Coerce them to numbers so that describe(), median() and hist()
# work further down; blanks become NaN.
for column in ('price', 'size', 'difference'):
    if column in df.columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

# info() prints its own report and returns None, so wrapping it in print()
# would add a stray "None" line.
df.info()
print('')
print(df.head())

In [None]:
# Number of characters in each listing title
df['length'] = df['title'].map(len)

In [None]:
df.head()

In [None]:
numeric_df = df[['brand', 'price', 'size', 'difference', 'length']]
numeric_df.head()

In [None]:
# Check for extreme values
numeric_df.describe()

In [None]:
# Compare medians by brand
# The columns are selected with a list: indexing a groupby with a bare tuple
# of column names was deprecated in pandas 1.0 and is an error in pandas 2.0.
(
    numeric_df
    .groupby('brand')[['price', 'difference', 'length']]
    .median()
    .reset_index()
    .rename(columns={'brand':'Brand', 'price':'Price', 'difference':'Days Listed', 'length':'Title Length'})
)

In [None]:
# Export to CSV
# df.to_csv('data/source_data.csv')
# numeric_df.to_csv('data/numeric_data.csv')

## Visualizing the distributions

Use `matplotlib` to plot and analyze the distributions in our data.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

In [None]:
df['price'].plot.hist(bins=12, alpha=0.5);

In [None]:
distinct_keys = df['brand'].unique()
for key in distinct_keys:
    plt.figure();
    df_subset = df[df.brand==key]
    df_subset['price'].plot.hist(bins=12, alpha=0.2, title=key);

In [None]:
# Distribution of days listed
df['difference'].plot.hist(bins=12, alpha=0.5);

In [None]:
distinct_keys = df['brand'].unique()
for key in distinct_keys:
    plt.figure();
    df_subset = df[df.brand==key]
    df_subset['difference'].plot.hist(bins=12, alpha=0.2, title=key);

In [None]:
# Distribution of title length
df['length'].plot.hist(bins=12, alpha=0.5);

## Questions, Feedback and Ideas

If you have any questions, please use the chat.

If you would be so kind, please [fill out this feedback form](https://tyshaikh.typeform.com/to/uHHKg1).

If you have any ideas or requests, please share them.

I'd like to run bi-weekly or even weekly demos like this for the next 6 months.