# Web scraping jacotei.com.br

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import datetime
import os
from sqlalchemy import create_engine
from tqdm.auto import tqdm

## Function that scrapes the product listing URL

In [2]:
# Base search URL kept as a reference example; no other cell visible in this
# notebook reads `exemplo`.
exemplo = 'https://www.jacotei.com.br/busca/'

def _parse_jacotei_product(product_html):
    '''
    Extract the scraped fields from one product <article> element.

    product_html: a BeautifulSoup tag for a single product card.

    Returns a dict with the keys 'nome', 'faixa_preco', 'menor_preco',
    'piece_link', 'since' and 'img'. Raises IndexError/KeyError when the card
    does not contain one of the expected elements.
    '''
    # <div class="carousel-inner" role="listbox">      LINK
    link = product_html.find_all('a', attrs={'rel': "nofollow"})[0]['href']

    # <h3 class="text-center hidden-sm hidden-lg hidden-md"> <a rel="nofollow" ...> NAME
    title = product_html.find_all('h3', attrs={'class': "text-center hidden-sm hidden-lg hidden-md"})[0]
    name = title.find_all('a', attrs={'rel': "nofollow"})[0].text

    # A card carries either one price span or two of them.
    price_html_chunk = product_html.find_all('span', attrs={'class': "menorPrecoDestaque"})
    faixa_preco = price_html_chunk[0].text.strip('\n')
    if len(price_html_chunk) == 1:
        # Single price: reuse it for both price fields.
        menor_preco_destaque = faixa_preco
    else:
        menor_preco_destaque = price_html_chunk[1].text

    # Text telling since when the product has been listed on Jacotei.
    since = product_html.find_all('p', attrs={'class': 'text-center desde'})[0].text.strip()

    # The image URL is kept because its host reveals which store the product
    # link leads to.
    img = product_html.find_all('img')[0]['data-original']

    return {'nome': name,
            'faixa_preco': faixa_preco,
            'menor_preco': menor_preco_destaque,
            'piece_link': link,
            'since': since,
            'img': img}


def get_all_jacotei_cellphones(page=1,number=10000):
    '''
    This function gets all cellphones listed on https://www.jacotei.com.br.

    page: which result page to request.
    number: how many cellphones to request per page.

    Returns a DataFrame with one row per product and the columns
    'nome', 'faixa_preco', 'menor_preco', 'piece_link', 'since' and 'img',
    indexed 0..n-1.

    Raises requests.HTTPError when the site answers with an error status, and
    IndexError when the expected product container is missing from the page.
    '''
    columns = ['nome', 'faixa_preco', 'menor_preco', 'piece_link', 'since', 'img']

    url_base = 'https://www.jacotei.com.br/busca/?cids=57&bids=&fids=&o=2'
    url = url_base + f'&p={page}' + f'&n={number}'

    print('loading response...')
    # Without a timeout a stalled connection would hang the run forever; the
    # read timeout is generous because a full listing page is large.
    response = requests.get(url, timeout=(10, 300))
    # Fail on HTTP errors here instead of on a confusing IndexError below.
    response.raise_for_status()

    print('loading soup...')
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pin one once the parser used so far is confirmed.
    soup = BeautifulSoup(response.content)
    minisoup = soup.find_all('div', attrs={'id':'produtos'})[0]

    print('loading products...')
    #<article class="produtosS col-lg-4 col-md-4 col-sm-6 col-xs-12 produtos_vertical">
    html_chunk_products = minisoup.find_all('article', attrs={'class':'produtosS col-lg-4 col-md-4 col-sm-6 col-xs-12 produtos_vertical'})

    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame for every product.
    rows = [_parse_jacotei_product(product_html)
            for product_html in tqdm(html_chunk_products)]

    return pd.DataFrame(rows, columns=columns)

## Function for cleaning columns

In [3]:
def _listing_date(since_text):
    '''Return the text that follows 'desde', or the input unchanged if absent.'''
    parts = since_text.split('desde')
    return parts[1] if len(parts) > 1 else since_text


def _absolute_link(link):
    '''Prefix site-relative links (the ones starting with '/') with the Jacotei host.'''
    return 'https://www.jacotei.com.br' + link if link.startswith('/') else link


def _link_destination(img):
    '''Reduce an image URL to the store host it points to.'''
    # Images served by Jacotei itself ('//img.ijacotei...') mean the offer is
    # hosted on jacotei.com.br.
    if img.startswith('//img.ijacotei'):
        return 'jacotei.com.br'
    # 'https://host/path' -> 'host'
    if img.startswith('http'):
        return img.split('/')[2]
    return img


def treat_columns(dataframe):

    '''This function is used for treating columns from the obtained dataframe from
    jacotei.com.br, and also renaming them.

    It receives a DF with the columns 'nome', 'faixa_preco', 'menor_preco',
    'piece_link', 'since' and 'img', and returns a new, treated DF with the
    columns 'modelo_celular', 'maior_preco', 'menor_preco', 'data_anuncio',
    'destino_do_link' and 'link_aunicio' (in that order).

    The received DF is left untouched, so the function can be called again on
    the same input.
    '''

    # Work on a copy: the caller's frame must not be edited or renamed.
    treated = dataframe.copy()

    # keep only what comes after the word 'desde' ("since") in the listing text
    treated['since'] = treated['since'].apply(_listing_date)

    # treating links:
    # > add 'https://www.jacotei.com.br' to jacotei links, which are the only
    #   ones starting with '/'
    treated['piece_link'] = treated['piece_link'].apply(_absolute_link)

    # treating 'img', which becomes the 'destino_do_link' column:
    # > '//img.ijacotei...' becomes 'jacotei.com.br'
    # > full URLs are reduced to their host
    treated['img'] = treated['img'].apply(_link_destination)

    # rename columns by name rather than by position, so an unexpected column
    # order cannot mislabel the data. 'faixa_preco' (first price span) becomes
    # 'menor_preco' and the scraped 'menor_preco' (second price span) becomes
    # 'maior_preco'. 'link_aunicio' is misspelled but kept as is because that
    # name is what gets written to the CSV/database.
    treated = treated.rename(columns={'nome': 'modelo_celular',
                                      'faixa_preco': 'menor_preco',
                                      'menor_preco': 'maior_preco',
                                      'piece_link': 'link_aunicio',
                                      'since': 'data_anuncio',
                                      'img': 'destino_do_link'})

    # reorder columns:
    treated = treated.loc[:, ['modelo_celular', 'maior_preco', 'menor_preco',
                              'data_anuncio', 'destino_do_link', 'link_aunicio']]

    print('Dataframe treated successfully!')

    return treated

## Function for storing data in csv

In [27]:
# Scratch cell: called without a path, DataFrame.to_csv() returns the CSV text
# as a string instead of writing a file.
df = pd.DataFrame()
df.to_csv()

In [4]:
def save_dataframe(dataframe, prefix=r'jct_data_', directory='../data_storage/'):
    '''
    This function saves a dataframe as a CSV file named after a prefix and the
    current date: <directory>/<prefix>YYYY_MM_DD.csv

    dataframe: the DataFrame to store.
    prefix: leading part of the file name.
    directory: folder that receives the file; it must already exist.

    The month and day are zero-padded so that file names sort chronologically.
    The file is written with the cp1252 encoding, ',' as separator, the index
    included and missing values spelled 'NaN'.
    '''

    # Imported locally on purpose: another cell of this notebook runs
    # `from datetime import datetime`, which rebinds the global name.
    import datetime
    import os

    stamp = datetime.datetime.now().strftime('%Y_%m_%d')
    path = os.path.join(directory, f'{prefix}{stamp}.csv')

    # now saving the obtained data as CSV
    dataframe.to_csv(path,sep=',',index=True, na_rep='NaN', encoding='cp1252')

    print('DataFrame Saved!')

In [5]:
def timestamp():
    '''
    Return the current local date as a zero-padded 'YYYY-MM-DD' string.

    Padding keeps the value consistent with `Params.timestamp`, which is
    compared against the timestamps stored in the database.
    '''

    # Imported locally on purpose: another cell of this notebook runs
    # `from datetime import datetime`, which rebinds the global name.
    import datetime

    return datetime.datetime.now().strftime('%Y-%m-%d')

# Running & Saving

In [2]:
# Scratch frame: a single unnamed column holding four integers.
df = pd.DataFrame([1,23,4,5])

In [None]:
# Scratch call: DataFrame.to_sql needs at least a table name and a connection,
# so this argument-less call cannot succeed; presumably it was only used to
# look at the signature.
df.to_sql()

In [6]:
# Scrape the listing, clean it, stamp it with today's date, then store it both
# as a CSV file and in the PostgreSQL table 'jacotei'.

# obtaining results
results = get_all_jacotei_cellphones()

# treating results and adding the timestamp column; `assign` returns a new
# frame, so no value is set on a slice of another frame
results_treated = treat_columns(results).assign(timestamp=timestamp())

# saving the treated results
save_dataframe(results_treated)

# now save to a PostgreSQL DB
# NOTE(review): the database credentials are hard-coded in this URL; move them
# to configuration or the environment before sharing this notebook.
engine = create_engine('postgresql+psycopg2://postgres:123qweasd@localhost/Smartphones-DB')

# The `with` block returns the connection to the pool even when the insert
# fails; it used to stay open for the rest of the session.
with engine.connect() as conn:
    results_treated.to_sql('jacotei', conn, index=False, if_exists='append')
print('Successfully saved on database!')

loading response...
loading soup...
loading products...


HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))


Dataframe treated successfully!
DataFrame Saved!
successfully saved on database!


# Tests:

In [56]:
# Scratch request to a Stack Overflow question; `response` is not used by any
# later cell visible in this notebook.
response = requests.get(r'https://stackoverflow.com/questions/37400974/unicode-error-unicodeescape-codec-cant-decode-bytes-in-position-2-3-trunca')

In [45]:
# Load a previously saved backup CSV (cp1252, the same encoding
# save_dataframe writes with) from an absolute, machine-specific path.
backup_folder= r"C:\Users\vFarn\Documents\ironhack\personal_projs\phone-price-tracker\data_storage\backup_folder\backup_df.csv"
df = pd.read_csv(backup_folder, encoding='cp1252')
# Placeholder value, only to try out adding a 'timestamp' column.
df['timestamp'] = 'blob'
df

Unnamed: 0,modelo_celular,maior_preco,menor_preco,data_anuncio,destino_do_link,link_aunicio,timestamp
0,"iPhone XS Apple com 512GB, Tela Super Retina H...","R$ 9.529,00","R$ 9.529,00",Mar/2020,www.pontofrio-imagens.com.br,https://track2.jacotei.com.br/r?h=eJxNUU1v2kAQ...,blob
1,iPhone 11 Pro Max 512GB Prateado iOS 4G + Wi-F...,"R$ 8.927,07","R$ 8.927,07",Fev/2020,images-americanas.b2w.io,https://track2.jacotei.com.br/r?h=eJxVUcFq4zAU...,blob
2,"iPhone 11 Pro Max Apple com 512GB, Tela Retina...","R$ 8.648,58","R$ 8.648,58",Mar/2020,www.pontofrio-imagens.com.br,https://track2.jacotei.com.br/r?h=eJxNUctu20AM...,blob
3,"iPhone 11 Pro Max Apple com 512GB, Tela Retina...","R$ 8.447,12","R$ 8.447,12",Nov/2019,www.pontofrio-imagens.com.br,https://track2.jacotei.com.br/r?h=eJxNUUtv2zAM...,blob
4,"iPhone 11 Pro Max Apple com 512GB, Tela Retina...","R$ 8.447,12","R$ 8.447,12",Jan/2020,www.casasbahia-imagens.com.br,https://track2.jacotei.com.br/r?h=eJxNUk1v2zAM...,blob
...,...,...,...,...,...,...,...
1998,Celular Alcatel 1011 Desbloqueado GSM Dual Chi...,"R$ 69,00","R$ 69,00",Nov/2017,jacotei.com.br,https://www.jacotei.com.br/celular-alcatel-101...,blob
1999,Celular Red Mobile Fit Music M011F Desbloquead...,"R$ 79,99","R$ 66,49",Jan/2020,jacotei.com.br,https://www.jacotei.com.br/celular-red-mobile-...,blob
2000,Celular Positivo P25 Desbloqueado GSM Dual Chi...,"R$ 84,90","R$ 66,40",Mai/2017,jacotei.com.br,https://www.jacotei.com.br/celular-positivo-p2...,blob
2001,Celular Positivo P25 Desbloqueado GSM Dual Chi...,"R$ 36,00","R$ 36,00",Ago/2019,jacotei.com.br,https://www.jacotei.com.br/celular-positivo-p2...,blob


In [66]:
# Current local time. NOTE: needs `datetime` bound to the module
# (`import datetime`); once the cell with `from datetime import datetime` has
# run, this line raises AttributeError.
t = datetime.datetime.now()

In [72]:
# Two-character month and day of `t`, left-padded with '0' when needed.
month = f'{t.month}'.zfill(2)
day = f'{t.day}'.zfill(2)

'04-05'

In [73]:
# Build today's date as 'YYYY_MM_DD' from its zero-padded parts.
t = datetime.datetime.now()

year = str(t.year)
month = str(t.month).zfill(2)
day = str(t.day).zfill(2)

'_'.join((year, month, day))

'2020_04_05'

In [75]:
# Scratch check of boolean negation.
not False

True

In [8]:
import os
# List the entries of the storage folder; the path is relative to the
# notebook's working directory.
os.listdir('../data_storage')

['dblogs', 'raw', 'temp', 'transformed']

In [1]:
import sqlalchemy as db

In [15]:
from datetime import datetime
import os

class Params:
    """
    Parameters for the jacotei extraction pipeline.

    Every attribute is a class attribute evaluated once, when the class body
    runs, so the date-dependent values reflect the moment of definition and
    not the moment they are read.
    """
    # time params: the clock is read a single time so that every
    # date-dependent value below agrees, even when defined around midnight
    t = datetime.now()
    year = f'{t.year}'
    month = t.strftime('%m')  # zero-padded, e.g. '04'
    day = t.strftime('%d')    # zero-padded, e.g. '05'

    # YYYY-MM-DD
    timestamp = year + '-' + month + '-' + day

    # dated output files, one per pipeline stage
    raw_data = '../data_storage/raw/jctraw' + timestamp + '.csv'
    trans_data = '../data_storage/transformed/jcttreated' + timestamp + '.csv'
    temp_data = '../data_storage/temp/toDB' + timestamp + '.csv'
    db_data = '../data_storage/dblogs/dblog' + timestamp + '.csv'

    ## Database connection params
    # NOTE(review): the password is hard-coded; move it to configuration or
    # the environment before sharing this notebook.
    user = 'postgres'
    password = '123qweasd'
    host = 'localhost'
    database = 'Smartphones-DB'
    table_name = 'jacotei'
    #table_name='jct' + datetime.now().strftime("%Y-%m-%d")

    # Info specific to the web scraping
    force_execution = False

    # parameters for data_extraction
    page=1
    number_per_page=10000
    url = 'https://www.jacotei.com.br/busca/?cids=57&bids=&fids=&o=2' +  f'&p={page}' + f'&n={number_per_page}'

    # parameters for data_storage
    prefix=r'jct_data_'

In [17]:
# `params` is the Params class itself, not an instance; its class attributes
# are read directly.
params = Params
# Build the PostgreSQL URL from the parameters and open a connection that the
# following cells reuse; it is not closed in the cells shown here.
engine = create_engine(f'postgresql+psycopg2://{params.user}:{params.password}@{params.host}/{params.database}')
conn = engine.connect()

In [25]:
# Distinct values of the 'timestamp' column stored in the 'jacotei' table.
dates = pd.read_sql_query('SELECT DISTINCT timestamp FROM jacotei;', con=conn).loc[:,'timestamp'].unique()
# True when the date in `params.timestamp` is already among the stored values.
params.timestamp in dates


True

In [20]:
# Same membership check as in the previous cell, run again.
params.timestamp in dates


True