# Introdução à Ciência dos Dados - Atividade 1

## Sumário

- [Scraping](#Scraping)

## Scraping

#### Objetivo

Observar as ofertas de iPhone na OLX no estado do Ceará.

#### Bibliotecas

In [37]:
# Imports
import requests
from bs4 import BeautifulSoup
import json

import numpy as np
import pandas as pd

# python3 -m pip install beautifulsoup4

#### Ofertas

In [2]:
# Collect the URL of every offer shown on the first MAX_PAGES result pages of
# the search, together with a flag telling whether the ad is a paid promotion.
# The result is accumulated in `offers`, a list of dicts with the keys
# 'url' and 'paidPromotion'.

# Number of result pages to walk through
MAX_PAGES = 30

# Search URL (the page number is appended as the `o` query parameter below)
product = 'iphone'
offers_url = f'https://www.olx.com.br/estado-ce?q={product}'

# Faking a browser
headers = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# One dict per offer found on the listing pages
offers = []

for page in range(1, MAX_PAGES+1):
    # Loading (timeout so that a stalled connection cannot hang the run)
    response = requests.get(f'{offers_url}&o={page}', headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Container of the ads; absent when the page has no ad list
    # (e.g. the response is not the expected listing page)
    ul = soup.find('ul', {'id': 'ad-list'})
    if ul is None:
        print(f'\rPage {page} of {MAX_PAGES}: ad list not found, skipping')
        continue

    # Only the direct <li> children are visited: iterating `ul` itself also
    # yields text nodes, which are not tags and cannot be searched for <a>.
    for li in ul.find_all('li', recursive=False):
        a = li.find('a')

        # Item without a link is not an offer
        if a is None:
            continue

        # Promoted ads carry a span with this aria-label
        paid_promotion = a.find('span', {'aria-label': 'Item impulsionado.'})

        offers.append({
            'url': a.attrs.get('href', None),
            'paidPromotion': paid_promotion is not None
        })

    # Progress, rewritten in place on a single line
    print(f'\rPage {page} of {MAX_PAGES}: {len(offers)} offers', end='')

Page 30 of 30: 1532 offers

#### Capturando dados de cada oferta

In [128]:
# Visit every collected offer URL and copy the fields of interest from the
# `window.dataLayer` JSON embedded in the page into the offer dict.
# Offers already flagged with 'scrapped' are skipped, so this cell can be
# re-run to retry only the offers that failed before.
for i, offer in enumerate(offers):
    # Safety: already processed in a previous run
    if 'scrapped' in offer:
        print(f'\rOffer {i + 1} of {len(offers)}', end='')
        continue

    try:
        # Loading (timeout so that a stalled connection cannot hang the run)
        response = requests.get(offer['url'], headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        # Script tag whose text starts with the data layer assignment
        script = next(
            (s for s in soup.find_all('script')
             if s.text.startswith('window.dataLayer =')),
            None,
        )

        # Safety: page without the data layer (reported 1-based, like the
        # other progress messages)
        if script is None:
            print(f'Offer {i + 1} not available')
            continue

        # Data: everything after the first '=' is parsed as JSON and the
        # first element of the resulting list is used
        _, _, text = script.text.partition('=')
        data = json.loads(text.strip())[0]

        # Top-level fields
        offer['id']       = data['listId']
        offer['state']    = data['state']
        offer['region']   = data['region']
        offer['category'] = data['category']
        offer['pictures'] = data['pictures']

        # Nested sections of the data layer
        page_data = data['page']
        detail    = page_data['detail']
        adDetail  = page_data['adDetail']

        offer['pageType']   = page_data['pageType']
        offer['adDate']     = detail['adDate']
        offer['zipcode']    = detail['zipcode']
        offer['price']      = detail['price']

        offer['sellerName'] = adDetail['sellerName']
        offer['areaCode']   = adDetail['ddd']
        offer['subject']    = adDetail['subject']
        offer['brand']      = adDetail['brand']
        offer['model']      = adDetail['model']
        offer['version']    = adDetail['version']
        offer['gearbox']    = adDetail['gearbox']

        # These keys are read with .get, so a missing one yields None instead
        # of failing the whole offer
        offer['eletronicsBrand']     = adDetail.get('electronics_brand', None)
        offer['eletronicsModel']     = adDetail.get('electronics_model', None)
        offer['eletronicsCondition'] = adDetail.get('electronics_condition', None)
        offer['storage']             = adDetail.get('cellphone_storage', None)
        offer['color']               = adDetail.get('electronics_color', None)
        offer['batteryHealth']       = adDetail.get('electronics_battery_health', None)

        # Mark as done only after every field was read successfully
        offer['scrapped'] = True

        print(f'\rOffer {i + 1} of {len(offers)}', end='')
    except Exception as exc:
        # Best effort: report the cause and move on to the next offer.
        # `Exception` (not a bare except) lets Ctrl+C still interrupt the run.
        print(f'\rError on offer {i + 1} of {len(offers)}: {exc!r}')
        

Error on offer 59 of 1532
Error on offer 68 of 1532
Error on offer 316 of 1532
Error on offer 585 of 1532
Error on offer 593 of 1532
Error on offer 628 of 1532
Error on offer 1021 of 1532
Error on offer 1176 of 1532
Error on offer 1314 of 1532
Offer 1532 of 1532

#### Convertendo para DataFrame

In [225]:
# Build the final table from the scraped offers.
# `columns` fixes the column order; a column that no offer provides is
# created empty by the reindex.
columns = [
    'id', 'pageType', 'adDate', 'state', 'region', 'zipcode', 'areaCode',
    'category', 'paidPromotion', 'subject', 'brand', 'eletronicsBrand',
    'model', 'eletronicsModel', 'storage', 'color', 'batteryHealth',
    'eletronicsCondition', 'version', 'price', 'pictures', 'sellerName',
    'gearbox', 'url', 'scrapped',
]

# Converting
df = pd.DataFrame(offers).reindex(columns=columns)

# Keep only the offers whose details were scraped; rows that never got the
# flag hold a missing value, hence the explicit comparison against True
df = df.loc[df['scrapped'] == True]

# Remove the helper columns that are not part of the final dataset
df = df.drop(
    columns=['pageType', 'brand', 'model', 'version', 'gearbox', 'scrapped']
)

# Quick look at the result
print(df.shape)
df.head(5)

(1523, 19)


Unnamed: 0,id,adDate,state,region,zipcode,areaCode,category,paidPromotion,subject,eletronicsBrand,eletronicsModel,storage,color,batteryHealth,eletronicsCondition,price,pictures,sellerName,url
0,1188658000.0,1684963000.0,CE,Fortaleza,60135270,85,Celulares e telefonia,True,IPhone XR-256gb,APPLE,IPHONE XR,256GB,Preto,Boa (80% até 94%),Usado - Excelente,2800,5.0,Daniel,https://ce.olx.com.br/fortaleza-e-regiao/celul...
1,1187839000.0,1684962000.0,CE,Fortaleza,60440180,85,Celulares e telefonia,True,Iphone 12 128gb muito conservado.,APPLE,IPHONE 12,128GB,Dourado,Boa (80% até 94%),Usado - Excelente,3000,5.0,Emanuel,https://ce.olx.com.br/fortaleza-e-regiao/celul...
2,1175826000.0,1684962000.0,CE,Fortaleza,60810820,85,Celulares e telefonia,True,iPhone 11 128gb único dono com Nota Fiscal,APPLE,IPHONE 11,128GB,Preto,Boa (80% até 94%),Usado - Excelente,2400,6.0,H Colares,https://ce.olx.com.br/fortaleza-e-regiao/celul...
3,1189645000.0,1684968000.0,CE,Fortaleza,60710680,85,Celulares e telefonia,True,Vendo iPhone 13 128gb,APPLE,IPHONE 13,128GB,Preto,Boa (80% até 94%),Usado - Excelente,3800,6.0,Jessica sousa,https://ce.olx.com.br/fortaleza-e-regiao/celul...
4,1189676000.0,1684964000.0,CE,Fortaleza,60332400,85,Celulares e telefonia,False,IPhone 12 256 Gb 86%,APPLE,IPHONE 12,256GB,Azul,Boa (80% até 94%),Novo,3150,2.0,Lucas Avila,https://ce.olx.com.br/fortaleza-e-regiao/celul...


#### Salvando

In [227]:
# Write guard: the CSV is only written when this flag is switched to True,
# so re-running the notebook does not replace the stored file by accident.
OVERWRITE = False

# Saving
if OVERWRITE:
    output_path = '../data/iphone-olx-ce.csv'
    df.to_csv(output_path)
    print('Saved!')