# Web scraping jacotei.com.br

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd
import re
import datetime
import os
from sqlalchemy import create_engine

## Function that builds the search URL and scrapes the listing

In [2]:
# Search endpoint of the site, apparently kept as an example; no code visible
# in this notebook references this name.
exemplo = 'https://www.jacotei.com.br/busca/'

def get_all_jacotei_cellphones(page=1,number=10000):
    '''
    Scrape the cellphone listing of https://www.jacotei.com.br into a DataFrame.

    page: which result page to request (sent as the `p` query parameter).
    number: how many cellphones to request per page (sent as the `n` query parameter).

    Returns a DataFrame with one row per product found on the page and the
    columns 'nome', 'faixa_preco', 'menor_preco', 'piece_link', 'since' and
    'img', all holding the raw text/attributes taken from the HTML. When the
    page lists no products the frame is empty but still has these columns.

    Raises requests.HTTPError when the site answers with an error status,
    IndexError when an expected element is missing from the page and KeyError
    when an expected HTML attribute is missing.
    '''
    columns = ['nome', 'faixa_preco', 'menor_preco', 'piece_link', 'since', 'img']

    url_base = 'https://www.jacotei.com.br/busca/?cids=57&bids=&fids=&o=2'
    url = f'{url_base}&p={page}&n={number}'

    print('loading response...')
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and an error status is reported as such instead of surfacing later as a
    # confusing parsing failure.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    print('loading soup...')
    # NOTE(review): no parser is named here, so the parse tree may depend on
    # which HTML parsers are installed - consider pinning one explicitly.
    soup = BeautifulSoup(response.content)
    minisoup = soup.find_all('div', attrs={'id':'produtos'})[0]

    print('loading products...')
    #<article class="produtosS col-lg-4 col-md-4 col-sm-6 col-xs-12 produtos_vertical">
    html_chunk_products = minisoup.find_all('article', attrs={'class':'produtosS col-lg-4 col-md-4 col-sm-6 col-xs-12 produtos_vertical'})

    # Rows are collected in a plain list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per product is quadratic in the number
    # of products.
    records = []

    for product_html in tqdm(html_chunk_products):

        # <div class="carousel-inner" role="listbox">      LINK
        link = product_html.find_all('a',attrs={'rel':"nofollow"})[0]['href']

        # <h3 class="text-center hidden-sm hidden-lg hidden-md"> <a rel="nofollow" ...> NAME
        name = product_html.find_all('h3', attrs={'class':"text-center hidden-sm hidden-lg hidden-md"})[0].find_all('a',attrs={'rel':"nofollow"})[0].text

        # getting prices: a product shows either one or several highlighted prices
        price_html_chunk = product_html.find_all('span', attrs={'class':"menorPrecoDestaque"})
        faixa_preco = price_html_chunk[0].text.strip('\n')

        # text telling since when the product is listed on Jacotei
        since = product_html.find_all('p',attrs={'class':'text-center desde'})[0].text.strip()

        # image info. One way to learn which website the link leads to is by
        # looking at the image!
        img = product_html.find_all('img')[0]['data-original']

        # With a single price on display, that price is used for both columns.
        if len(price_html_chunk) == 1:
            menorPrecoDestaque = faixa_preco
        else:
            menorPrecoDestaque = price_html_chunk[1].text

        records.append({'nome': name,
                        'faixa_preco': faixa_preco,
                        'menor_preco': menorPrecoDestaque,
                        'piece_link': link,
                        'since': since,
                        'img': img})

    return pd.DataFrame(records, columns=columns)

## Function for cleaning columns

In [3]:
def treat_columns(dataframe):

    '''Clean and rename the columns of a dataframe scraped from jacotei.com.br.

    dataframe: a DataFrame with the text columns 'nome', 'faixa_preco',
    'menor_preco', 'piece_link', 'since' and 'img'. It is not modified.

    Returns a new DataFrame with the columns 'modelo_celular', 'maior_preco',
    'menor_preco', 'data_anuncio', 'destino_do_link' and 'link_aunicio',
    in that order, where:

    - 'modelo_celular' is the input 'nome';
    - 'menor_preco' is the input 'faixa_preco' and 'maior_preco' is the input
      'menor_preco';
    - 'data_anuncio' is what follows the word 'desde' in the input 'since'
      (the text is kept unchanged when the word is absent);
    - 'link_aunicio' is the input 'piece_link', with site-relative links made
      absolute;
    - 'destino_do_link' is derived from the input 'img': 'jacotei.com.br' for
      images served from '//img.ijacotei...', the host name for images whose
      address starts with 'http', and the unchanged value otherwise.

    Raises KeyError when one of the expected input columns is missing.
    '''

    def _after_desde(text):
        # Keep what follows the first 'desde'; leave the text alone when the
        # marker is missing instead of failing on the whole dataframe.
        parts = text.split('desde')
        return parts[1] if len(parts) > 1 else text

    def _absolute_link(link):
        # Links starting with '/' are relative to the Jacotei site itself.
        return 'https://www.jacotei.com.br' + link if link.startswith('/') else link

    def _link_destination(img):
        # The image address reveals which shop the product link leads to.
        if img.startswith('//img.ijacotei'):
            return 'jacotei.com.br'
        if img.startswith('http'):
            # 'https://host/path'.split('/') -> ['https:', '', 'host', 'path']
            return img.split('/')[2]
        return img

    # Work on a copy so the caller's dataframe keeps its values and column
    # names and this function can be run again on the same input.
    treated = dataframe.copy()

    treated['since'] = treated['since'].apply(_after_desde)
    treated['piece_link'] = treated['piece_link'].apply(_absolute_link)
    treated['img'] = treated['img'].apply(_link_destination)

    # Rename by name rather than by position, so the result does not depend on
    # the order of the input columns. 'link_aunicio' is spelled this way on
    # purpose: it is the column name the stored data already uses.
    treated = treated.rename(columns={'nome': 'modelo_celular',
                                      'faixa_preco': 'menor_preco',
                                      'menor_preco': 'maior_preco',
                                      'piece_link': 'link_aunicio',
                                      'since': 'data_anuncio',
                                      'img': 'destino_do_link'})

    # reorder columns:
    treated = treated.loc[:,['modelo_celular', 'maior_preco','menor_preco', 'data_anuncio', 'destino_do_link', 'link_aunicio']]

    print('Dataframe treated successfully!')

    return treated

## Function for storing data in csv

In [4]:
def save_dataframe(dataframe, prefix=r'jct_data_', directory='../data_storage/'):
    '''
    Save a dataframe as a CSV file named after a prefix and the current date.

    The file is written to `directory` as `<prefix><year>_<month>_<day>.csv`,
    using the local date without zero padding (e.g. `jct_data_2020_3_7.csv`).

    dataframe: the DataFrame to store; its index is written as the first column
        and missing values are written as 'NaN'.
    prefix: leading part of the file name.
    directory: folder that receives the file; it is created when it does not
        exist yet.
    '''

    today = datetime.datetime.now()
    date_tag = f'{today.year}_{today.month}_{today.day}'

    # Create the target folder up front so a fresh checkout does not fail on
    # the very last step of the run.
    if directory:
        os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, prefix + date_tag + '.csv')

    # now saving the obtained data as csv
    # NOTE(review): cp1252 cannot represent every Unicode character, so text
    # outside that code page would make this call fail - confirm the encoding
    # is really required by whatever reads these files.
    dataframe.to_csv(path,sep=',',index=True, na_rep='NaN', encoding='cp1252')

    print('DataFrame Saved!')

In [5]:
def timestamp():
    '''Return the current local date as 'year-month-day' text, without zero padding.'''
    today = datetime.datetime.now()
    date_parts = (today.year, today.month, today.day)
    return '-'.join(str(part) for part in date_parts)

# Running & Saving

In [6]:
# obtaining results
results = get_all_jacotei_cellphones(number=10000)

# treating results
results_treated = treat_columns(results)

# create a timestamp
results_treated['timestamp'] = timestamp()

# saving the treated results
save_dataframe(results_treated)

# now save to a PostgreSQL DB
# The connection URL carries the database password, so it is read from the
# environment instead of being written into the notebook.
db_url = os.environ.get('JACOTEI_DB_URL')
if not db_url:
    raise RuntimeError(
        'Set the JACOTEI_DB_URL environment variable to the database URL, e.g. '
        'postgresql+psycopg2://USER:PASSWORD@localhost/Smartphones-DB'
    )
engine = create_engine(db_url)

# engine.begin() commits when the block succeeds, rolls back when it fails and
# always hands the connection back, so nothing is left open in the kernel.
with engine.begin() as conn:
    results_treated.to_sql('jacotei', conn, index=False, if_exists='append')
print('Successfully saved on database!')

loading response...
loading soup...
loading products...


HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))


Dataframe treated successfully!
DataFrame Saved!
successfully saved on database!
