# Define scraping functions

In [5]:
import requests
import time
import re
from bs4 import BeautifulSoup


def fetch_finn_codes(session: requests.Session, first=1, last=5):
    """Collect ad ids from the used-car search result pages.

    Args:
        session: Session used to issue the GET requests.
        first: First result page to request.
        last: Page number at which to stop. Exclusive: pages
            ``first`` .. ``last - 1`` are fetched.

    Returns:
        List with the ``id`` attribute of every ``<a>`` element that carries
        the ``userhistory`` class, in the order the pages were fetched.
    """
    URL_RESULTS = 'http://m.finn.no/car/used/search.html'
    codes = []
    for i in range(first, last):
        print("Page {}/{}".format(i, last))
        result = session.get(URL_RESULTS, params={'page': i})
        doc = BeautifulSoup(result.text, 'lxml')
        for element in doc.find_all('a'):
            classes = element.get('class')
            ad_id = element.get('id')
            # An anchor without an id cannot be turned into an ad URL later,
            # so skip it instead of returning a None entry.
            if classes and 'userhistory' in classes and ad_id:
                codes.append(ad_id)
    return codes


def fetch_ads(session, finn_codes, dt=0.25):
    """Download and parse every ad listed in ``finn_codes``.

    Args:
        session: Session object handed through to ``fetch_finn_code_url``.
        finn_codes: Sequence of ad ids (strings); each is appended to the
            ad URL.
        dt: Seconds to sleep before each request.

    Returns:
        List of the dicts returned by ``fetch_finn_code_url``, each with an
        added ``'id'`` key. Ads whose download or parsing raised are skipped
        (the error is printed), so the list may be shorter than
        ``finn_codes``.
    """
    URL_FETCH_AD = 'http://m.finn.no/car/used/ad.html?finnkode='
    total = len(finn_codes)
    # Report roughly every 1 %. An integer step of at least 1 avoids the
    # float modulo (imprecise for short lists) and any division by zero.
    progress_step = max(1, total // 100)
    ads = []
    for i, finn_code in enumerate(finn_codes):
        if i % progress_step == 0:
            print("Progress: {}/{}".format(i, total))
        time.sleep(dt)
        print(finn_code)
        try:
            ad = fetch_finn_code_url(session, URL_FETCH_AD + finn_code)
            ad['id'] = finn_code
            ads.append(ad)
        # requests' ConnectionError does not derive from the builtin one, so
        # both must be listed for the back-off branch to be reachable.
        except (ConnectionError, requests.exceptions.ConnectionError):
            print('Connection error, sleeping...')
            time.sleep(10)
            print('Continuing')
        except Exception as e:
            # Best effort: a single broken ad must not abort the whole run.
            print(e)
    return ads


def fetch_finn_code_url(session: requests.Session, finn_url):
    """Fetch one ad page and extract its table data, price, title and place.

    Args:
        session: Session used to issue the GET request.
        finn_url: Full URL of the ad page.

    Returns:
        Dict with the keys ``'tabled'`` (dict built from the ``<dt>``/``<dd>``
        pairs of the page's second ``<dl>``), ``'price'`` (digits only, as a
        string), ``'title'`` and ``'place'`` (``''`` when no heading matches).

    Raises:
        IndexError: If the page has fewer than two ``<dl>`` elements, or no
            matching price or title element.
    """
    soup_page = BeautifulSoup(session.get(finn_url).text, 'lxml')
    data = {}

    # Handle tabled data: pair each <dt> with the <dd> at the same position.
    ASSUMED_DL_INDEX = 1
    data_dl_element = soup_page.find_all('dl')[ASSUMED_DL_INDEX]
    data['tabled'] = {
        key.contents[0]: value.contents[0]
        for key, value in zip(data_dl_element.find_all('dt'),
                              data_dl_element.find_all('dd'))
    }

    ## Other data
    # Price: keep the digits only.
    soup_price = soup_page.find_all(
        'div', {'class': 'h1 mtn r-margin', 'data-automation-id': 'value'})
    data['price'] = re.sub(r"\D", "", soup_price[0].contents[0])

    # Brand
    sp_title = soup_page.find_all('h1', {'class': 'h1 word-break mbn'})
    data['title'] = sp_title[0].contents[0]

    # Where: first <h2> whose leading text looks like "<4 digits> <word>".
    place = ''
    for heading in soup_page.find_all('h2'):
        first_child = heading.contents[0] if heading.contents else None
        # Empty headings and headings that start with a nested tag carry no
        # leading text; skip them rather than failing the whole ad.
        if not isinstance(first_child, str):
            continue
        matches = re.findall(r'\d\d\d\d \w+', first_child)
        if matches:
            place = matches[0]
            break
    data['place'] = place

    return data

# Post-processing

In [2]:
import collections

# Apply in this order: flatten() first, then strip_items() on the flattened dicts.

def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Args:
        d: Mapping whose values may themselves be mappings.
        parent_key: Prefix for every key of ``d``; used by the recursion.
        sep: Separator placed between a parent key and a child key.

    Returns:
        New dict in which every nested key is replaced by
        ``parent<sep>child``. Empty nested mappings contribute no entries.
    """
    # The ABCs live in collections.abc; the old collections.MutableMapping
    # alias was removed in Python 3.10.
    from collections.abc import MutableMapping

    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def strip_items(flat_ads):
    """Return copies of the given flat dicts with whitespace-normalised values.

    Each value has its leading and trailing whitespace removed first; any
    newline that remains is then replaced by a single space. Keys are left
    untouched and the input dicts are not modified.
    """
    def _clean(text):
        return text.strip().replace('\n', ' ')

    return [
        {name: _clean(text) for name, text in ad.items()}
        for ad in flat_ads
    ]


# Handle CSV

In [3]:
import csv
import copy
import datetime

def export_data(filename, data):
    """Write a list of dicts to ``filename`` as UTF-8 CSV.

    Args:
        filename: Path of the CSV file to create or overwrite.
        data: Iterable of dicts. The header is the union of all keys; a key
            missing from a row is written as an empty cell.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # column order is the same on every run (a set's order is not).
    fields = list(dict.fromkeys(key for d in data for key in d.keys()))
    # newline='' stops the text layer from translating the writer's own line
    # terminator, as the csv module documentation recommends.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fields, lineterminator='\n')
        writer.writeheader()
        writer.writerows(data)

# Processing

DATETIME_FORMATS = ['%d.%m.%Y', '%m.%Y', '%m %Y', '%Y', '%m/%Y']


def _parse_date(text):
    """Parse ``text`` with the first matching DATETIME_FORMATS entry, else None."""
    for fmt in DATETIME_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            pass
    return None


def standardize_row(row):
    """Return a copy of ``row`` with date and number columns normalised.

    * Keys containing ``'1. gang registrert'``: the value is parsed with the
      first matching format in DATETIME_FORMATS and rewritten in the first
      format (``%d.%m.%Y``).
    * ``'tabled_Pris eks omreg'`` and ``'tabled_Omregistrering'``: every
      non-digit character is dropped and the rest is converted to ``int``.

    Empty values are skipped. A value that cannot be parsed is reported with
    ``print`` and copied through unchanged. ``row`` itself is not modified.
    """
    NUMERIC_KEYS = ['tabled_Pris eks omreg', 'tabled_Omregistrering']
    result = copy.deepcopy(row)
    for key, value in row.items():
        if not value:
            continue
        if '1. gang registrert' in key:
            parsed = _parse_date(value)  # e.g. '20.11.2011'
            if parsed is None:
                print('Could not parse date: %s' % value)
            else:
                result[key] = parsed.strftime(DATETIME_FORMATS[0])
        elif key in NUMERIC_KEYS:
            digits = re.sub(r"\D", "", value)
            try:
                result[key] = int(digits)
            except ValueError:
                print('Could not parse number: %s' % value)
    return result

def standardize_csv(ifn, ofn):
    """Copy CSV file ``ifn`` to ``ofn``, passing each row through ``standardize_row``.

    Args:
        ifn: Path of the UTF-8 CSV file to read; its first line is the header.
        ofn: Path of the UTF-8 CSV file to create or overwrite.

    The output has the same columns, in the same order, as the input. An
    input file without a header line yields an empty output file.
    """
    with open(ifn, encoding='utf-8', newline='') as ifile, \
            open(ofn, 'w', encoding='utf-8', newline='') as ofile:
        reader = csv.DictReader(ifile)
        # Take the header from ``fieldnames``. Calling next(reader) would
        # consume (and lose) the first data row instead.
        fields = reader.fieldnames
        if fields is None:
            return
        writer = csv.DictWriter(ofile, fields, lineterminator='\n')
        writer.writeheader()
        for row in reader:
            writer.writerow(standardize_row(row))

# Download

In [6]:
# Scrape the search result pages, download every listed ad and dump the
# result to output.csv. One session is reused for all requests.
ses = requests.Session()
# `last` is exclusive in fetch_finn_codes, so this requests pages 1-99.
finn_codes = fetch_finn_codes(ses, 1, 100)
ads = fetch_ads(ses, finn_codes)
# Nested 'tabled' dicts become 'tabled_<key>' entries; the values are then
# stripped and have their newlines replaced by spaces.
flattened_ads = [flatten(d) for d in ads]
stripped_ads = strip_items(flattened_ads)
export_data("output.csv", stripped_ads)

Page 1/100
Page 2/100
Page 3/100
Page 4/100
Page 5/100
Page 6/100
Page 7/100
Page 8/100
Page 9/100
Page 10/100
Page 11/100
Page 12/100
Page 13/100
Page 14/100
Page 15/100
Page 16/100
Page 17/100
Page 18/100
Page 19/100
Page 20/100
Page 21/100
Page 22/100
Page 23/100
Page 24/100
Page 25/100
Page 26/100
Page 27/100
Page 28/100
Page 29/100
Page 30/100
Page 31/100
Page 32/100
Page 33/100
Page 34/100
Page 35/100
Page 36/100
Page 37/100
Page 38/100
Page 39/100
Page 40/100
Page 41/100
Page 42/100
Page 43/100
Page 44/100
Page 45/100
Page 46/100
Page 47/100
Page 48/100
Page 49/100
Page 50/100
Page 51/100
Page 52/100
Page 53/100
Page 54/100
Page 55/100
Page 56/100
Page 57/100
Page 58/100
Page 59/100
Page 60/100
Page 61/100
Page 62/100
Page 63/100
Page 64/100
Page 65/100
Page 66/100
Page 67/100
Page 68/100
Page 69/100
Page 70/100
Page 71/100
Page 72/100
Page 73/100
Page 74/100
Page 75/100
Page 76/100
Page 77/100
Page 78/100
Page 79/100
Page 80/100
Page 81/100
Page 82/100
Page 83/100
Page 84/100
P

In [46]:
# Writes stripped_ads to output.csv again; the same call already ends the
# download cell above.
export_data("output.csv", stripped_ads)

In [7]:
# Read output.csv, pass every row through standardize_row and write the
# result to s-output.csv.
standardize_csv('output.csv', 's-output.csv')

Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Inklusiv
Could not parse number: Fritatt
Could not parse number: Inklusiv
Could not parse number: Fritatt
Could 