In [24]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import re
from datetime import datetime
import time

In [92]:
# Show which version of the requests library is installed in this kernel.
print('requests version:', requests.__version__)

requests version: 2.22.0


# Web scraping www.americanas.com.br

Main search: https://www.americanas.com.br/.../smartphone/{manufacturer}/pagina-2?ordenacao={orderby}

Possible manufacturers: samsung-galaxy, iphone, motorola, zenfone, lg, alcatel, quantum, q-touch, huawei, zte, sony-xperia, lenovo, meizu, lumia, xiaomi, yezz and multilaser.

THIS WEBSITE ONLY SHOWS 24 items PER PAGE! The strategy is to read the number of products on
the first page (the website shows the number of products in the query) and divide this number by
24, rounding up, to know how many pages to search, per brand...

The best approach so far was:
* 1- Create 3 URL-building functions, one for each operating-system group. The americanas.com website lets you chain the operating systems of the products (windows, ios, android, etc.), so the strategy was to create 3 massive links, one per function, and iterate over the pages of those links. Hopefully the majority of the smartphones will be found.
* 2- For each operating system, the main code works this way: 
> - Soup the first URL, then look up the number of products.
> - The number of products divided by 24 and rounded up gives the number of pages.
> - Iterate over the remaining pages (from 2 to max), get the soups, and store the information in dataframes (page, description of the product, crude link for the product, link for the product, and a timestamp).
> - At the end, concatenate all dataframes together.

# General Functions

In [52]:
def get_number_of_pages(soup, products_per_page=24):
    """Return the number of result pages for a search, based on its product count.

    The product count is read from the first whitespace-separated token of the
    first ``div`` with class ``form-group display-sm-inline-block``. Dots in
    that token are treated as thousands separators (e.g. ``1.987``).

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed first page of a search.
    products_per_page : int, optional
        How many products the site lists per page (24 by default).

    Returns
    -------
    int
        ``ceil(product_count / products_per_page)``, never less than 1.

    Raises
    ------
    IndexError
        If the counter element is missing or its text is empty.
    ValueError
        If the counter text does not start with a number.
    """
    counter_text = soup.find_all('div', attrs={'class': "form-group display-sm-inline-block"})[0].text
    # Strip every thousands separator, not just the first one ("1.234.567").
    nproducts = counter_text.split()[0].replace('.', '')
    print(f'number of products are: {nproducts}')
    # Integer ceiling division: an exact multiple of the page size must not
    # produce an extra (empty) page, and no float/str round-trip is involved.
    npages = -(-int(nproducts) // products_per_page)
    # The first page always exists, even when the query reports 0 products.
    return max(npages, 1)

In [94]:
def extract_products_data_from_soup(soup, page):
    """Extract one row per product card from an americanas.com search page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed search-result page.
    page : int
        Page number stored in the ``page`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``page``, ``texto_produto`` (card text), ``link_produto_crude``
        (href as found), ``link_produto`` (absolute URL without query string)
        and ``timestamp`` (time the row was built), with a 0..n-1 index.
        When no product card is found the frame is empty but still has
        these columns.

    Raises
    ------
    IndexError
        If a product card contains no ``<a>`` element.
    """
    columns = ['page', 'texto_produto', 'link_produto_crude', 'link_produto', 'timestamp']
    products = soup.find_all('div', attrs={'class': "RippleContainer-sc-1rpenp9-0 dMCfqq"})

    # Collect plain records and build the frame once; concatenating a
    # one-row frame per product copies the accumulated data on every step.
    records = []
    for product in products:
        link_produto_crude = product.find_all('a')[0]['href']
        records.append({
            'page': page,
            'texto_produto': product.text,
            'link_produto_crude': link_produto_crude,
            # Drop the tracking query string and make the link absolute.
            'link_produto': 'https://www.americanas.com.br' + link_produto_crude.split('?')[0],
            'timestamp': datetime.now(),
        })

    return pd.DataFrame(records, columns=columns)

In [29]:
def get_soup_from_url(url, timeout=30):
    """Download ``url`` and return its content parsed as a bs4 soup.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the scrape forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    # A browser-like user agent is sent with every request.
    headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [107]:
def save_to_csv(dataframe, name):
    """Write ``dataframe`` to ``storage/<name><stamp>.csv`` using cp1252 encoding.

    ``<stamp>`` is built from the current local time as
    ``{year}y-{month}m-{day}d-{hour}h`` without zero padding
    (e.g. ``2020y-5m-7d-11h``). Saving twice within the same hour
    overwrites the earlier file.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to persist.
    name : str
        File-name prefix placed directly before the stamp.
    """
    import os  # local import: only this function needs it

    now = datetime.now()
    # Named `stamp` rather than `time` so the imported `time` module is not shadowed.
    stamp = f'{now.year}y-{now.month}m-{now.day}d-{now.hour}h'
    # Create the output folder on first use instead of failing when it is missing.
    os.makedirs('storage', exist_ok=True)
    dataframe.to_csv('storage/' + name + stamp + '.csv', encoding='cp1252')
    print(name + ' saved at: ' + stamp)

# IOS system functions

In [73]:
def ios_url(page):
    """
    Build the americanas.com search URL for iOS devices.

    Every iOS filter segment is chained onto the smartphone category path;
    any page other than 1 gets a ``/pagina-<page>`` segment, and the result
    is always ordered by highest price.
    """
    base = 'https://www.americanas.com.br/categoria/celulares-e-smartphones/smartphone/f'
    os_filters = (
        '/sistema-operacional-ios',
        '/sistema-operacional-ios%2010',
        '/sistema-operacional-ios%2011',
        '/sistema-operacional-ios%207',
        '/sistema-operacional-ios%208',
        '/sistema-operacional-ios%209',
        '/sistema-operacional-iphone%20ios',
    )
    # The first page has no explicit page segment.
    page_segment = '' if page == 1 else f'/pagina-{page}'
    query = '?ordenacao=higherPrice'

    return ''.join((base, *os_filters, page_segment, query))

In [74]:
def generate_ios_results():
    """Scrape every iOS result page, save the rows to CSV and return them.

    The first page is fetched once and used both to compute the page count
    and as the first batch of products; pages 2..npages are then fetched in
    turn. Each row's ``page`` value is the page it was scraped from.

    Returns
    -------
    pandas.DataFrame
        All scraped rows with a fresh 0..n-1 index. The same frame is
        written through ``save_to_csv`` under the name ``americanas_IOS_raw``.
    """
    first_soup = get_soup_from_url(ios_url(1))
    npages = get_number_of_pages(first_soup)

    frames = [extract_products_data_from_soup(first_soup, page=1)]

    # Plain range: numpy is not among the notebook's imports, so the
    # previous np.arange call raised NameError on a fresh kernel.
    for page in tqdm(range(2, npages + 1)):
        soup = get_soup_from_url(ios_url(page))
        # Tag rows with the real page number rather than a constant 1.
        frames.append(extract_products_data_from_soup(soup, page=page))

    # Concatenate once, and reset the index before saving so the CSV does
    # not carry the repeated per-page index labels.
    ios_results = pd.concat(frames).reset_index(drop=True)
    save_to_csv(ios_results, 'americanas_IOS_raw')

    return ios_results

# Android functions

In [75]:
def android_url(page):
    """
    Build the americanas.com search URL for Android devices.

    Every Android filter segment is chained onto the smartphone category
    path; any page other than 1 gets a ``/pagina-<page>`` segment, and the
    result is always ordered by highest price.
    """
    base = 'https://www.americanas.com.br/categoria/celulares-e-smartphones/smartphone/f'
    os_filters = (
        # No trailing slash here: it used to produce an empty "//" path segment.
        '/sistema-operacional-android',
        '/sistema-operacional-android%20(miui).%20sistema%20operacional%20devidamente%20personalizado%20e%20otimizado%20pela%20xiaomi%20com%20funcionalidades%20exclusivas.',
        '/sistema-operacional-android%204',
        '/sistema-operacional-android%205',
        '/sistema-operacional-android%205.0%20lollipop',
        '/sistema-operacional-android%205.0.1%20lollipop',
        '/sistema-operacional-android%205.0.2%20lollipop',
        '/sistema-operacional-android%205.1%20asuszenuilollipop',
        '/sistema-operacional-android%205.1%20lollipop',
        '/sistema-operacional-android%205.1.1%20lollipop',
        '/sistema-operacional-android%206',
        '/sistema-operacional-android%207',
        '/sistema-operacional-android%207.1.1%20nougat',
        '/sistema-operacional-android%208.0',
        '/sistema-operacional-android%208.0%20oreo',
        '/sistema-operacional-android%208.1',
        '/sistema-operacional-android%208.1%20(versao%20go)',
        '/sistema-operacional-android%208.1%20oreo',
        '/sistema-operacional-android%20go',
    )

    # The page segment must not carry its own '?': the query string below
    # already starts with one, and "??ordenacao" would corrupt the
    # parameter name.
    page_segment = '' if page == 1 else f'/pagina-{page}'
    query = '?ordenacao=higherPrice'

    return base + ''.join(os_filters) + page_segment + query

In [76]:
def generate_android_results():
    """Scrape every Android result page, save the rows to CSV and return them.

    The first page is fetched once and used both to compute the page count
    and as the first batch of products; pages 2..npages are then fetched in
    turn. Each row's ``page`` value is the page it was scraped from.

    Returns
    -------
    pandas.DataFrame
        All scraped rows with a fresh 0..n-1 index. The same frame is
        written through ``save_to_csv`` under the name
        ``americanas_Android_raw``.
    """
    first_soup = get_soup_from_url(android_url(1))
    npages = get_number_of_pages(first_soup)

    frames = [extract_products_data_from_soup(first_soup, page=1)]

    # Plain range: numpy is not among the notebook's imports, so the
    # previous np.arange call raised NameError on a fresh kernel.
    for page in tqdm(range(2, npages + 1)):
        soup = get_soup_from_url(android_url(page))
        # Tag rows with the real page number rather than a constant 1.
        frames.append(extract_products_data_from_soup(soup, page=page))

    # Concatenate once, and reset the index before saving so the CSV does
    # not carry the repeated per-page index labels.
    android_results = pd.concat(frames).reset_index(drop=True)
    save_to_csv(android_results, 'americanas_Android_raw')
    return android_results

# Windows and Others functions

In [77]:
def windows_url(page):
    """
    Build the americanas.com search URL for Windows and other-OS devices.

    The "outros", "proprietary os", "windows" and "windows phone" filter
    segments are chained onto the smartphone category path; any page other
    than 1 gets a ``/pagina-<page>`` segment, and the result is always
    ordered by highest price.
    """
    base = 'https://www.americanas.com.br/categoria/celulares-e-smartphones/smartphone/f'
    os_filters = (
        '/sistema-operacional-outros',
        '/sistema-operacional-proprietary%20os',
        '/sistema-operacional-windows',
        '/sistema-operacional-windows%20phone',
    )

    # The page segment must not carry its own '?': the query string below
    # already starts with one, and "??ordenacao" would corrupt the
    # parameter name.
    page_segment = '' if page == 1 else f'/pagina-{page}'
    query = '?ordenacao=higherPrice'

    return base + ''.join(os_filters) + page_segment + query

In [78]:
def generate_windows_results():
    """Scrape every Windows/other-OS result page, save the rows and return them.

    The first page is fetched once and used both to compute the page count
    and as the first batch of products; pages 2..npages are then fetched in
    turn. Each row's ``page`` value is the page it was scraped from.

    Returns
    -------
    pandas.DataFrame
        All scraped rows with a fresh 0..n-1 index. The same frame is
        written through ``save_to_csv`` under the name
        ``americanas_windows_raw``.
    """
    first_soup = get_soup_from_url(windows_url(1))
    npages = get_number_of_pages(first_soup)

    frames = [extract_products_data_from_soup(first_soup, page=1)]

    # Plain range: numpy is not among the notebook's imports, so the
    # previous np.arange call raised NameError on a fresh kernel.
    for page in tqdm(range(2, npages + 1)):
        soup = get_soup_from_url(windows_url(page))
        # Keep every page: the loop used to assign its concat result to
        # `android_results`, so only the first page was saved and returned.
        # Rows are also tagged with the real page number instead of 1.
        frames.append(extract_products_data_from_soup(soup, page=page))

    # Concatenate once, and reset the index before saving so the CSV does
    # not carry the repeated per-page index labels.
    windows_results = pd.concat(frames).reset_index(drop=True)
    save_to_csv(windows_results, 'americanas_windows_raw')
    return windows_results

# Final Testing

In [95]:
# Scrape the three operating-system groups; each call also saves its own CSV.
ios = generate_ios_results()
android = generate_android_results()
windows = generate_windows_results()

# Stack the three result sets into one frame with a fresh 0..n-1 index.
americanas = pd.concat([ios, android, windows], ignore_index=True)
print(f'{len(americanas)} produtos foram cadastrados!')

# Persist the combined table.
save_to_csv(americanas, 'americanas_raw')

number of products are: 406


100%|██████████| 16/16 [00:25<00:00,  1.62s/it]


americanas_IOS_raw saved at: 2020y-5m-7d-11h
number of products are: 1987


100%|██████████| 82/82 [04:06<00:00,  3.01s/it]


americanas_Android_raw saved at: 2020y-5m-7d-11h


  0%|          | 0/3 [00:00<?, ?it/s]

number of products are: 77


100%|██████████| 3/3 [00:03<00:00,  1.19s/it]

americanas_windows_raw saved at: 2020y-5m-7d-11h
americanas_raw saved at: 2020y-5m-7d-11h



