In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import pandas as pd
import re
from datetime import datetime

In [3]:
def extract_price_canvas(url):
    
    response = requests.get(url)
    
    if response.status_code != 200:
        print('response not [200]!!!')

    soup = BeautifulSoup(response.content)
    
    model = soup.find_all('h1', attrs = {'itemprop':"name"})[0].text
    scripts = soup.find_all('script', attrs = {'type':'text/javascript'})
    
    storage = []

    for script in scripts:
        if 'function grafico()' in script.text:
            storage.append(script)
        elif 'categories:' in script.text:
            storage.append(script)

    storage2 = []

    for trecho in storage:
        for piece in re.findall('\[(.*?)\]',trecho.text):
            storage2.append(piece)

    numeros = []
    datass = []

    for element in storage2:    
        #detecta 1o e 2o de numeros
        if element.split(',')[0][0] in ['1','2','3','4','5','6','7','8','9']:
            numeros.append(element)

        #detecta se string é um ano, como data
        elif element.split('-')[0][3] in ['1','2','3','4','5','6','7','8','9']:
            datass.append(element)

    df = pd.DataFrame()

    for nlist in numeros:
        minidf = pd.DataFrame([float(x) for x in nlist.split(',')])
        df = pd.concat([df,minidf],axis=1)

    for ndata in datass:
        minidf = pd.DataFrame([x.strip('"') for x in ndata.split(',')])
        df = pd.concat([df,minidf],axis=1)
    
    df.columns = ['maxprice', 'minprice1', 'minprice2', 'date']
    df['model'] = model
    df['url'] = url

    return df

In [4]:
dados = pd.read_csv('../data_storage/transformed/jcttreated2020-05-02.csv', encoding='cp1252')

urls = dados.link_aunicio.unique()

In [5]:
results = pd.DataFrame()

for url in tqdm(urls):
    try:
        data = extract_price_canvas(url)
    except:
        model = dados.loc[dados.link_aunicio == url].iloc[0,0]
        
        data = pd.DataFrame({'maxprice':'not possible', 
                             'minprice1':'not possible', 
                             'minprice2':'not possible',
                             'date':'not possible',
                             'model':model,
                             'url':url}, index=[0])
        
    results = pd.concat([results,data]).reset_index(drop=True)

 48%|████▊     | 1092/2272 [11:51<09:44,  2.02it/s]

response not [200]!!!


100%|██████████| 2272/2272 [25:42<00:00,  1.47it/s]  


In [11]:
results.iloc[1091,:]

maxprice                                               9178.98
minprice1                                              5949.15
minprice2                                              5949.15
date                                                2019-12-08
model        iPhone Apple 11 Pro 64GB Desbloqueado 5.8 Cinz...
url          https://www.jacotei.com.br/iphone-apple-11-pro...
Name: 1091, dtype: object

In [8]:
results.head(20)

Unnamed: 0,maxprice,minprice1,minprice2,date,model,url
0,not possible,not possible,not possible,not possible,"Apple iPhone 8 Plus 5,5 polegadas 4G LTE 12MP ...",https://track2.jacotei.com.br/r?h=eJxVUdGK2zAQ...
1,not possible,not possible,not possible,not possible,"Apple iPhone 8 Plus 5,5 polegadas 4G LTE 12MP ...",https://track2.jacotei.com.br/r?h=eJxVUdGK2zAQ...
2,not possible,not possible,not possible,not possible,iPhone 11 Pro Max 512GB Prateado iOS 4G + Wi-F...,https://track2.jacotei.com.br/r?h=eJxVUcFq4zAU...
3,not possible,not possible,not possible,not possible,"64G 5,0 5,5 polegadas Smart Phone 7+12MP Telef...",https://track2.jacotei.com.br/r?h=eJxVUWFr2zAU...
4,8499,8499,8499,2020-03-13,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...
5,8499,8499,8499,2020-03-23,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...
6,8499,8074.05,8074.05,2020-04-04,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...
7,8499,8074.05,8074.05,2020-04-08,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...
8,8499,7999,7999,2020-04-16,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...
9,8499,7999,7999,2020-04-19,Smartphone Samsung Galaxy S20 Ultra SM-G988B D...,https://www.jacotei.com.br/smartphone-samsung-...


In [12]:
now = datetime.now()
now = str(now.year) +'y-'+ str(now.month) +'m-'+ str(now.day)+'d-'+ str(now.hour)+'h'

results.to_csv('phone_prices'+now+'.csv',sep=',',index=False, na_rep='NaN', encoding='cp1252')

# TESTING

In [189]:
df.link_aunicio.apply(lambda x: x.split('.com')[0].split('//')[1]).unique()

array(['produto.casasbahia', 'www.amazon', 'produto.pontofrio',
       'produto.extra', 'www.carrefour', 'www.fastshop',
       'www.tabaratotolevando', 'www.supermuffato', 'lojasmm',
       'tracker.adtools1', 'www.cissamagazine', 'ad.zanox', 'www.colombo',
       'www.onofreagora', 'www.liberatti', 'clube2.magazineluiza'],
      dtype=object)

In [62]:
lojas = ['www.lojasmm.com','shopfato.vteximg.com.br','29028l.ha.azioncdn.net', 
         'suc.webcontinental.com.br', 'images-shoptime.b2w.io','statics.angeloni.com.br',
         'upload.onofreagora.com.br','a-static.mlcdn.com.br']

links = [df.loc[df.destino_do_link == loja].link_aunicio.iloc[0] for loja in lojas]

In [217]:
websites = ['tabaratotolevando', 'supermuffato', 'cissamagazine', 'americanas',
       'webcontinental', 'shoptime', 'angeloni', 'onofreagora',
       'magazineluiza', 'casasbahia', 'amazon', 'pontofrio', 'extra',
       'carrefour', 'fastshop', 'lojasmm', 'CissaMagazine', 'compracerta',
       'colombo', 'liberatti']

In [216]:
len(websites

23