In [1]:
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

In [2]:
# Show which version of the requests library is installed in this kernel.
print('requests version:', requests.__version__)

requests version: 2.22.0


# Web scraping https://www.fastshop.com.br/web/

Main search: https://www.fastshop.com.br/web/c/22561/smartphones

Possible "marcas" (brands): Apple (107), Samsung (80), Asus (31), Motorola (25), Multilaser (18), Huawei (7), LG (3), Sony (1). (as of 2020-05-07)

This website generates its product tags with JavaScript, so plain HTML scraping is not enough.
`requests.get` and Beautiful Soup alone won't work; we also need **selenium**!

The best approach so far is:
* 1- Since products only load when we scroll down the page, we use selenium to keep scrolling until all products are shown.
* 2- We then loop over the products in the parsed soup, collect values/text/links and store them in a dataframe.
* 3- The dataframe is then stored in a CSV file.

# General Functions and main code

In [3]:
def get_products_with_selenium(url, executable_path='../../chromedriver.exe', scroll_pause=10):
    """Load a fastshop listing page in Chrome and return its product tags.

    fastshop.com.br renders its products with JavaScript and only loads more
    of them as the page is scrolled, so fetching the raw HTML source is not
    enough. This function:

    1. starts a Chrome driver and opens ``url``;
    2. scrolls to the bottom of the page, waits ``scroll_pause`` seconds and
       counts the ``app-product-item`` elements in the rendered body;
    3. repeats step 2 until the count is the same on two consecutive passes;
    4. closes the browser and returns the products found on the last pass.

    Parameters
    ----------
    url : str
        Address of the listing page to open.
    executable_path : str, optional
        Path to the chromedriver binary handed to ``webdriver.Chrome``.
        Defaults to the location this notebook has always used.
    scroll_pause : int or float, optional
        Seconds to wait after each scroll before counting products again.

    Returns
    -------
    bs4.element.ResultSet
        The ``app-product-item`` tags parsed from the last snapshot of the
        page body. They come from a parsed HTML string, so they stay usable
        after the browser has been closed.

    Notes
    -----
    Any exception raised while driving the browser propagates to the caller,
    but only after the browser has been shut down.
    """
    driver = webdriver.Chrome(executable_path=executable_path)

    try:
        driver.get(url)

        # Count seen on the previous pass; the loop stops as soon as a new
        # pass reports the same number.
        product_number = 1

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Give the page time to fetch and render the next batch.
            time.sleep(scroll_pause)

            content_element = driver.find_element_by_css_selector('body')
            content_html = content_element.get_attribute("innerHTML")
            soup = BeautifulSoup(content_html, "html.parser")
            products = soup.find_all('app-product-item')

            if product_number == len(products):
                break
            product_number = len(products)
    finally:
        # Always release the browser process, including when scraping fails;
        # otherwise every call leaves a Chrome/chromedriver instance running.
        driver.quit()

    return products

In [4]:
def tagslist_to_dataframe(products):
    """Convert scraped fastshop product tags into a dataframe.

    Parameters
    ----------
    products : iterable of bs4.element.Tag
        Product tags scraped from the fastshop.com website. Each one must
        contain at least two ``<img>`` elements and one ``<a>`` element;
        otherwise an ``IndexError`` is raised.

    Returns
    -------
    pandas.DataFrame
        One row per product, numbered 0..n-1, with the columns:

        * ``index`` - always 0; kept only so the output has the same columns
          as CSV files saved by earlier versions of this function.
        * ``title`` - ``title`` attribute of the product's second image.
        * ``prod_description`` - all the text inside the product tag.
        * ``prod_link`` - the first link's ``href`` prefixed with the site root.
        * ``img_src`` - ``src`` attribute of the product's second image.
        * ``timestamp`` - time at which the row was built.

        For an empty ``products`` the frame has these columns and no rows.
    """
    columns = ['title', 'prod_description', 'prod_link', 'img_src', 'timestamp']

    # Collect plain records and build the frame once: concatenating a new
    # one-row frame per product copies the accumulated data on every step.
    records = []
    for product in products:
        # Title and source are read from the second <img> of the product.
        image = product.find_all('img')[1]
        records.append({'title': image['title'],
                        'prod_description': product.text,
                        'prod_link': 'https://www.fastshop.com.br/' + product.find_all('a')[0]['href'],
                        'img_src': image['src'],
                        'timestamp': datetime.now()})

    results = pd.DataFrame(records, columns=columns)
    # Legacy column (see docstring): constant 0 for every row.
    results.insert(0, 'index', 0)

    return results

In [5]:
def save_to_csv(dataframe, name):
    """Write a dataframe to ``storage/<name><stamp>.csv`` with cp1252 encoding.

    The stamp is built from the current local time as
    ``<year>y-<month>m-<day>d-<hour>h`` without zero padding
    (for example ``2020y-5m-14d-15h``). Two saves with the same ``name``
    within the same hour therefore write to the same file, and the later one
    replaces the earlier one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to write; the dataframe index is not written.
    name : str
        File name prefix placed directly before the time stamp.

    The ``storage`` directory is created if it does not exist yet. A
    confirmation line with the name and stamp is printed after saving.
    """
    t = datetime.now()
    # Named `stamp` rather than `time` so the imported `time` module is not
    # shadowed inside this function.
    stamp = f'{str(t.year)}y-{str(t.month)}m-{str(t.day)}d-{str(t.hour)}h'
    # A fresh checkout has no storage folder; to_csv would fail without it.
    os.makedirs('storage', exist_ok=True)
    dataframe.to_csv('storage/' + name + stamp + '.csv', encoding='cp1252', index=False)
    print(name + ' saved at: ' + stamp)

In [6]:
# Listing page to scrape (the smartphones category).
url = 'https://www.fastshop.com.br/web/c/22561/smartphones'
# Opens a browser and scrolls the page; each scroll pass waits several seconds.
products = get_products_with_selenium(url)

In [7]:
# Turn the scraped product tags into a table and persist it as a time-stamped CSV.
results = tagslist_to_dataframe(products)
save_to_csv(results, 'fastshop_raw')

fastshop_raw saved at: 2020y-5m-14d-15h
