# Browser Automation Homework ROUND 2
Due 7-16<br>
Completed by: **Teodora Curcic**

We're going to the United States trademark database to collect all of Nike's (or any other company's) active trademarks.

We only want the live trademarks, and we want the `serial` number and the link to an image of each trademark. Feel free to collect the `wordmark` if you like.


Note: if you get asked if you're a bot, just complete the challenges manually.

In [1]:
import os
import random
import time

from playwright.async_api import async_playwright, expect, Keyboard

In [2]:
os.makedirs('data/', exist_ok=True)

In [3]:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'

In [4]:
async def open_browser(headless=False, user_agent=user_agent):
    """
    Start Playwright, launch a Chromium browser and open a new page.

    Parameters
    ----------
    headless : bool
        Whether to launch the browser without a visible window.
        Defaults to False (a visible window).
    user_agent : str
        User-agent string applied to the browser context.

    Returns
    -------
    tuple
        ``(browser, page)``: the launched browser and a new page opened in
        a context that uses ``user_agent``.
    """
    # Start playwright
    playwright = await async_playwright().start()

    # Launch Chromium, honouring the caller's `headless` choice instead of
    # always opening a visible window
    browser = await playwright.chromium.launch(headless=headless)

    # set a user agent
    context = await browser.new_context(user_agent=user_agent)

    # Create a new browser window
    page = await context.new_page()

    return browser, page

In [5]:
driver, page = await open_browser()

In [7]:
url = 'https://tmsearch.uspto.gov/search/'
await page.goto(url)

<Response url='https://tmsearch.uspto.gov/search/' request=<Request url='https://tmsearch.uspto.gov/search/' method='GET'>>

Only search by the "Owner" of each trademark

In [8]:
# XPath of the element that is clicked to open the search-options dropdown.
# NOTE(review): the "mat-select-value-3" id looks auto-generated -- verify it
# is stable across page loads.
xpath_search_options = '//*[@id="mat-select-value-3"]/span/span'
# NOTE(review): `filter` shadows the Python builtin of the same name; it is
# referenced again when the dropdown is clicked, so rename both together.
filter = page.locator(xpath_search_options)

In [9]:
await filter.click()

In [10]:
xpath_option = '//*[@id="mat-option-11"]'

In [11]:
await page.locator(xpath_option).click()

Find the search bar and input the search

In [12]:
xpath_searchbar = '//*[@id="searchbar"]'

In [13]:
search = page.locator(xpath_searchbar)

In [14]:
# Change this if you like
company = 'Nike'

In [15]:
await search.fill(company)

Make the search

In [16]:
await page.keyboard.press("Enter")

Filter to only live trademarks
<br>In other words, filter out the dead ones.

In [17]:
xpath_filter = '//*[@id="statusDead"]'

In [18]:
dead_button = page.locator(xpath_filter)

In [19]:
# click it
await dead_button.click()

# Save results

In [20]:
# how to save what the emulator sees
source = await page.content()
with open(f'data/trademarks_{company}.html', 'w') as f:
    f.write(source)

GO TO THE NEXT PAGE AND REPEAT

In [21]:
# this is the next page button
xpath_next = '//li[@class="page-item"][a[@class="page-link md-icon"][i[text()="navigate_next"]]]'

In [22]:
# XPath matching a single search-result card. The pagination loop checks that
# one of these is visible after moving to the next page.
xpath_result = '//div[@class="card m-2 result-card ng-star-inserted"]'

In [23]:
next_button = page.locator(xpath_next)

In [24]:
await expect(next_button).to_be_visible()

In [25]:
# Here we'll put it all together and iterate through.
# There are many ways to do this
i = 1
collect = True
while collect:
    # check the next button is visible
    if await page.is_visible(xpath_next):
        # click the next button
        next_button = page.locator(xpath_next)
        await next_button.click()

        # make sure the search result is visible
        await page.locator(xpath_result).first.is_visible()

        # save the contents
        source = await page.content()
        with open(f'data/trademarks_{company}_{i}.html', 'w') as f:
            f.write(source)
        i += 1
    else:
        collect = False
        break

## Parse the contents

In [26]:
import glob
import pandas as pd
from lxml import etree, html

In [27]:
# List the scraped pages so they can be parsed. The pattern matches both the
# first-page file (trademarks_<company>.html) and the numbered follow-up pages
# (trademarks_<company>_<i>.html), for every company saved under data/.
files = glob.glob('data/trademarks*.html')

In [33]:
# A trademark entry
xpath_trademark_cards = '//div[@class="card m-2 result-card ng-star-inserted"]'
# the serial number
xpath_serial = './/div[@class="row mb-2 ng-star-inserted"]//span'

In [34]:
data = []

for path in files:
    # Parse each saved results page into an lxml HTML tree
    with open(path) as handle:
        tree = html.fromstring(handle.read())

    for card in tree.xpath(xpath_trademark_cards):
        # Serial: text of the first element matched by xpath_serial,
        # or None when nothing matched or the element has no text
        serial_nodes = card.xpath(xpath_serial)
        if serial_nodes and serial_nodes[0].text:
            serial = serial_nodes[0].text.strip()
        else:
            serial = None

        # Image URL: `src` of the first <img> inside the card, if any
        images = card.xpath('.//img')
        img_url = images[0].get('src') if images else None

        data.append({'serial': serial, 'img_url': img_url})

In [35]:
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,serial,img_url
0,88872671,https://tmcms-docs.uspto.gov/cases/88872671/ma...
1,98005404,https://tmcms-docs.uspto.gov/cases/98005404/ma...
2,88831783,https://tmcms-docs.uspto.gov/cases/88831783/ma...
3,90731895,https://tmcms-docs.uspto.gov/cases/90731895/ma...
4,90492296,https://tmcms-docs.uspto.gov/cases/90492296/ma...


In [36]:
df.shape

(350, 2)

In [37]:
df.to_csv('data/trademarks.csv', index=False)