In [38]:
import math
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Web scraping https://www.maiscelular.com.br/

Main search: https://www.maiscelular.com.br/fichas-tecnicas/?aparelho=1&z=1

This website shows technical information about many cellphones. Its search page shows only 20 devices at a time and (as of 2020-05-17) listed
about 4923 devices in total.

The best approach so far has been:
* 1 - access the main link and get the number of pages
* 2 - for each page, get all the available information about the cellphones and their hyperlinks
* 3 - access each of the roughly 4 thousand links and get the information!
* 4 - store it in a dataframe
* 5 - Finally, for each link obtained, extract all the information about the smartphone/cellphone using pd.read_html()

# General Functions and main code

## Extraction 1: which smartphones are available

### subfunctions

In [2]:
def get_url(page, results_per_page=1000):
    """Build the search-page address for the given page number.

    `page` is interpolated into the `z` query parameter. `results_per_page`
    is accepted but not used when building the address.
    """
    base = 'https://www.maiscelular.com.br/fichas-tecnicas/?aparelho=1'
    return base + f'&z={page}'

In [3]:
def get_soup_from_url(url, timeout=30):
    """Download `url` and return its body parsed as a bs4 soup.

    Args:
        url: address to fetch.
        timeout: seconds handed to `requests.get`; without a timeout a stalled
            server would block the scraping loop indefinitely.

    Returns:
        A BeautifulSoup object built from the response content.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    # A browser-like user agent is sent with every request.
    headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here, with the status code, instead of parsing an error page and
    # failing later on a missing tag.
    response.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin it (e.g. 'lxml' or 'html.parser') once the intended parser is confirmed.
    soup = BeautifulSoup(response.content)
    return soup

In [4]:
def get_array_of_pages(soup, devices_per_page=20):
    """Return the array of page numbers (1..last) of the search listing.

    The total number of devices is read from the first run of digits in the
    first `<span class="d-none d-sm-block">` of `soup`, then divided by the
    page size and rounded up.

    Args:
        soup: bs4 soup of a search page from maiscelular.com.br.
        devices_per_page: how many devices each search page lists.

    Returns:
        A numpy array `[1, 2, ..., number_of_pages]`; it always contains at
        least page 1.

    Raises:
        ValueError: if the counter span is missing or holds no digits.
    """
    counter_spans = soup.find_all('span', attrs={'class': "d-none d-sm-block"})
    if not counter_spans:
        raise ValueError('could not find the span holding the number of devices')

    digit_runs = re.findall(r'\d+', counter_spans[0].text)
    if not digit_runs:
        raise ValueError(f'no number found in {counter_spans[0].text!r}')
    number_of_smartphones = int(digit_runs[0])

    # Round up so a partially filled last page is included, without adding a
    # phantom page when the total is an exact multiple of the page size.
    number_of_pages = max(1, math.ceil(number_of_smartphones / devices_per_page))
    pages = np.arange(1, number_of_pages + 1)

    return pages

In [5]:
def get_smartphone_info_from_page(soup):
    """Collect the summary of every smartphone listed on one search page.

    Each `<div class="bl1">` (link, image, name) is paired by position with a
    `<div class="col-md-6 col-lg-6 resumo">` (the four summary lines).

    Args:
        soup: bs4 soup of a search page from maiscelular.com.br.

    Returns:
        A dataframe with one row per smartphone, indexed 0..n-1, with the
        columns name, image_link, link, screen, camera, processor_RAM and
        batery. A page without devices gives an empty frame with the same
        columns.
    """
    columns = ['name', 'image_link', 'link', 'screen', 'camera', 'processor_RAM', 'batery']

    img_name_infos = soup.find_all('div', attrs={'class': "bl1"})
    mini_fichas_tecnicas = soup.find_all('div', attrs={'class': "col-md-6 col-lg-6 resumo"})

    # Gather plain records and build the frame once, instead of growing a
    # dataframe row by row with pd.concat.
    records = []
    for info, ficha in zip(img_name_infos, mini_fichas_tecnicas):
        summary_divs = ficha.find_all('div')
        records.append({
            'name': info.find_all('strong')[0].text,
            'image_link': 'https:' + info.find_all('img')[0]['src'],
            # The link is used later as the key against the backup file, so
            # its exact format is kept unchanged.
            'link': 'https://www.maiscelular.com.br/' + info.find_all('a')[0]['href'],
            'screen': summary_divs[0].text,
            'camera': summary_divs[1].text,
            'processor_RAM': summary_divs[2].text,
            'batery': summary_divs[3].text,
        })

    return pd.DataFrame(records, columns=columns)

In [6]:
def get_smartphone_info_from_all_pages(pages):
    """Scrape every search page in `pages` and stack the results.

    For each page number the page address is built, downloaded and parsed,
    and the per-page dataframes are concatenated in page order.

    Args:
        pages: iterable of page numbers to visit.

    Returns:
        A dataframe with the rows of all pages; row labels are whatever the
        per-page frames carry (they are not renumbered here). An empty
        `pages` gives an empty dataframe.
    """
    # Collect the per-page frames and concatenate once; concatenating inside
    # the loop re-copies all previous rows on every page.
    page_frames = [
        get_smartphone_info_from_page(get_soup_from_url(get_url(page)))
        for page in tqdm(pages)
    ]

    # pd.concat raises on an empty list, so the no-pages case is handled here.
    if not page_frames:
        return pd.DataFrame()

    return pd.concat(page_frames)

### mainfunction

In [7]:
def exctract_from_maiscelular():
    """Run the whole listing extraction and return it as a dataframe.

    The first search page is fetched to find out how many pages exist, then
    every page is scraped. Each row is one smartphone; its link points to the
    page with the detailed data sheet. The returned frame has a fresh index.
    """
    first_page_soup = get_soup_from_url(get_url(1))
    pages = get_array_of_pages(first_page_soup)
    print(f'number of pages: {pages[-1]}')

    # Short pause before the page-by-page scraping starts.
    time.sleep(1)

    listing = get_smartphone_info_from_all_pages(pages)
    return listing.reset_index(drop=True)

In [8]:
# The live scrape is disabled; uncomment the next line to run it again.
# results = exctract_from_maiscelular()
# Load the listing from a CSV file under storage/ instead (read as cp1252).
results = pd.read_csv('storage/'+'maiscelular_backup_main_page2020y-5m-16d-22h.csv', encoding='cp1252')

In [9]:
# Disabled: would add a `price_link` column derived from `link`, keeping the
# first three '/'-separated pieces that follow 'fichas-tecnicas'.
# results['price_link'] = results.loc[:,'link'].apply(lambda x: 'https://www.maiscelular.com.br//fichas-tecnicas' + '/'.join(x.split('fichas-tecnicas')[1].split('/')[0:3]) + '/')

## Extraction 2: technical features

### subfunctions

In [10]:
def remove_accentuation_from_cols(dataframe):
    """Rename the columns of `dataframe` to their unidecode transliteration.

    The dataframe is modified in place and also returned.
    """
    from unidecode import unidecode

    # Map every current column name to its transliterated form.
    ascii_names = {name: unidecode(name) for name in dataframe.columns.tolist()}
    dataframe.rename(columns=ascii_names, inplace=True)

    return dataframe

In [11]:
def add_missing_columns(dataframe1, dataframe2, fill_value='sem registro'):
    """Make sure `dataframe2` has every column that `dataframe1` has.

    Each column of `dataframe1` that is absent from `dataframe2` is created
    in `dataframe2`, filled with `fill_value`, and appended after the existing
    columns. The column order is not otherwise changed.

    Args:
        dataframe1: dataframe whose columns are the reference.
        dataframe2: dataframe to complete; it is modified in place.
        fill_value: value written in every row of a newly created column.

    Returns:
        `dataframe2`, the same object that was passed in.
    """
    for col in dataframe1.columns:
        if col not in dataframe2.columns:
            dataframe2[col] = fill_value

    return dataframe2

In [12]:
def reorder_columns(dataframe1, dataframe2):
    """Put the columns of `dataframe2` in the same order as `dataframe1`.

    Args:
        dataframe1: dataframe whose column order is the reference.
        dataframe2: dataframe to reorder; it is modified in place.

    Returns:
        `dataframe2`, the same object that was passed in.

    Raises:
        ValueError: if the two dataframes do not hold exactly the same set of
            uniquely named columns, since no reordering can then make them
            match (the previous move-to-the-end loop never terminated in that
            situation).
    """
    target_order = dataframe1.columns.tolist()
    current_order = dataframe2.columns.tolist()

    if target_order == current_order:
        return dataframe2

    same_columns = (
        len(target_order) == len(current_order)
        and len(set(target_order)) == len(target_order)
        and set(target_order) == set(current_order)
    )
    if not same_columns:
        raise ValueError(
            'dataframe1 and dataframe2 must have the same uniquely named columns to be reordered'
        )

    print('looks like dataframe1 and dataframe2 have columns ordered in a different way...')
    print('reorganizing...')
    # Moving each column to the end, in the reference order, leaves the frame
    # in that order after a single pass and keeps the change in place.
    for col in target_order:
        dataframe2[col] = dataframe2.pop(col)

    return dataframe2

In [13]:
def rename_duplicate_columns(dataframe):
    """Give every repeated column name a numeric suffix so all names are unique.

    The first occurrence keeps its name; later occurrences become `name.1`,
    `name.2`, and so on, skipping any suffix that would collide with a name
    already in use.

    Args:
        dataframe: dataframe to fix; its columns are renamed in place.

    Returns:
        `dataframe`, the same object that was passed in.
    """
    used_names = set()
    next_suffix = {}
    unique_names = []

    for name in dataframe.columns.tolist():
        candidate = name
        if candidate in used_names:
            suffix = next_suffix.get(name, 1)
            candidate = f'{name}.{suffix}'
            # Keep counting until the suffixed name is free; reusing '.1' for
            # every repeat would leave duplicates from the third occurrence on.
            while candidate in used_names:
                suffix += 1
                candidate = f'{name}.{suffix}'
            next_suffix[name] = suffix + 1
        used_names.add(candidate)
        unique_names.append(candidate)

    dataframe.columns = unique_names

    return dataframe

### mainfunction

In [14]:
def extract_data_from_each_smartphone_link(first_results, backup_path='smartphones_backup.csv'):
    """Scrape the data sheet of every smartphone that is not in the backup yet.

    The backup CSV at `backup_path` holds the data sheets scraped so far. Links
    of `first_results` that already appear there are skipped; each remaining
    link is read with `pd.read_html`, turned into a one-row dataframe, appended
    to the results, and the backup file is rewritten after every link so an
    interrupted run can be resumed. When the backup file does not exist, the
    extraction starts from the beginning.

    Args:
        first_results: dataframe with a `link` column, as produced by the
            extraction of the search pages.
        backup_path: CSV file used both to resume and to save progress.

    Returns:
        The backup dataframe when there is nothing new to scrape, otherwise
        the backup rows followed by the newly scraped ones.
    """
    # load backup; a missing file means nothing has been scraped yet
    try:
        backup = pd.read_csv(backup_path, encoding='utf-8')
    except FileNotFoundError:
        backup = pd.DataFrame(columns=['link'])
    backup = remove_accentuation_from_cols(backup)

    new_results = backup.copy()

    # check which links will be searched
    links_already_searched = backup['link'].unique()
    mask = ~first_results['link'].isin(links_already_searched)
    links_not_searched = first_results.loc[mask, 'link']

    if len(links_not_searched) == 0:
        print('There are no new smartphones to extract!')
        return backup

    print(f'Extracting information from {len(links_not_searched)} new smartphones...')
    time.sleep(1)
    for link in tqdm(links_not_searched):
        # only the first table of the page is used; its columns 2, 3 and 4 are dropped
        df_html = pd.read_html(link, encoding='utf-8')[0].drop(columns=[2, 3, 4])

        # removing the rows that are dividers of the extracted table: they
        # repeat the value found in the first row (the smartphone name)
        smartphone_name = df_html.loc[0, 1]
        indexes_to_remove = df_html.loc[df_html[1] == smartphone_name].index.drop(0)
        df_html.drop(indexes_to_remove, inplace=True)

        # now we transpose the dataframe and rename the first column, which is the smartphone name
        df_html = df_html.set_index(0).T.rename(columns={'Características': 'Smartphone'})

        # adding link
        df_html['link'] = link

        # treat the new result: remove accentuation, make column names unique
        # and add the columns the accumulated results already have
        df_html = remove_accentuation_from_cols(df_html)
        df_html = rename_duplicate_columns(df_html)
        df_html = add_missing_columns(new_results, df_html)

        # add to the final result and rewrite the backup after every link
        new_results = pd.concat([new_results, df_html], ignore_index=True)
        new_results.to_csv(backup_path, encoding='utf-8', index=False)

    return new_results

**Extract results and prices**

In [15]:
# Scrape the data sheets of the links in `results` that are not in the backup
# file yet (the default backup path of the function is used).
new_results= extract_data_from_each_smartphone_link(results)

There are no new smartphones to extract!


## Extraction 3: prices

Note: it is better to keep the two extractions (technical data and price) separate, because the former uses pd.read_html() and the latter uses BeautifulSoup().

In [16]:
# Resume the price extraction: rows of the backup whose name is still missing
# have not been visited yet.
df = pd.read_csv('backupprices.csv', encoding='utf-8')
tosearch = df.loc[df['name'].isna()]
indexes = tosearch.index
print(len(indexes), ' links left')

for index in tqdm(indexes):
    link = df.loc[index, 'price_link']
    soup = get_soup_from_url(link)
    df.loc[index, 'name'] = soup.find_all('h1', attrs={'itemprop': "headline"})[0].text

    # A page without a "best" offer link is recorded as having no price. The
    # absence is checked explicitly so that real failures (and a manual
    # interrupt) are not silently turned into 'sem_preco'.
    best_offers = soup.find_all('a', attrs={'class': "best"})
    df.loc[index, 'best_price'] = best_offers[0].text if best_offers else 'sem_preco'

    # The file is rewritten after every link so the run can be resumed.
    df.to_csv('backupprices.csv', encoding='utf-8', index=False)

  0%|          | 0/1441 [00:00<?, ?it/s]

1441  links left


100%|██████████| 1441/1441 [36:25<00:00,  1.52s/it]


# Transform: Merge everything

**Merge**

In [83]:
prices = pd.read_csv('backupprices.csv', encoding='utf-8')
# Remove the phrase 'Ficha Técnica' from the start or the end of the name.
# str.strip(' Ficha Técnica') treats its argument as a set of characters, so it
# also ate legitimate leading/trailing letters of the device name.
prices['name'] = (
    prices['name']
    .str.replace(r'^\s*Ficha Técnica\s+|\s+Ficha Técnica\s*$', '', regex=True)
    .str.strip()
)

In [84]:
# Load the data sheets from the file that is also the default backup path of
# extract_data_from_each_smartphone_link.
smartphones = pd.read_csv('smartphones_backup.csv', encoding='utf-8')

In [85]:
# Put the price columns (without `name`) side by side with the data sheets.
# NOTE(review): with axis=1 the rows are paired by index label, not by link, so
# this assumes both CSV files list the phones in the same order — confirm.
finalresults = pd.concat([smartphones, prices.drop(columns='name')], axis=1)

**Format best_price column**

In [86]:
def _best_price_to_float(raw):
    """Convert one scraped price such as 'R$ 1.299' to a float.

    'sem_preco' becomes NaN. Values that are not strings (NaN, or numbers left
    by a previous run of this cell) are passed through `float`, which makes
    the cell safe to run again.
    """
    if not isinstance(raw, str):
        return float(raw)
    if raw == 'sem_preco':
        return float('nan')

    # remove 'R$': keep what follows the currency symbol
    parts = raw.split('R$')
    amount = parts[1] if len(parts) > 1 else raw

    # remove the dots (thousands separators) and read a decimal comma, if
    # there is one, as the decimal point
    amount = amount.replace('.', '').replace(',', '.')

    # turn the value from string to float
    return float(amount)


# Assign the column directly: `finalresults.loc[:, 'best_price'] = ...` may
# write into the existing object column and leave the dtype as object.
finalresults['best_price'] = finalresults['best_price'].map(_best_price_to_float).astype(float)

**Export**

In [95]:
# Write the merged table to smartphones.csv (UTF-8, without the index column).
finalresults.to_csv('smartphones.csv', encoding='utf-8', index=False)