# Define scraping functions

In [28]:
import requests
import time
import re
from bs4 import BeautifulSoup


def fetch_finn_codes(session: requests.Session, first=1, last=5):
    X_FINNKODE = "//*[@id='page-results']/div[1]/div/div/div/div[2]/div/a"
    URL_RESULTS = 'http://m.finn.no/car/used/search.html'
    codes = []
    for i in range(first, last):
        print("Page {}/{}".format(i, last))
        result = session.get(URL_RESULTS, params={'page': i})
        page = result.text
        doc = BeautifulSoup(page, 'lxml')
        ad_ids = [
            element.get('id')
            for element in doc.find_all('a')
            if element.get('class') and 'userhistory' in element.get('class')
        ]
        codes += ad_ids
    return codes


def fetch_ads(session, finn_codes, dt=0.05):
    URL_FETCH_AD = 'http://m.finn.no/car/used/ad.html?finnkode='
    ads = []
    for i, finn_code in enumerate(finn_codes):
        if not (i % (len(finn_codes)/100)):
            print("Progress: {}/{}".format(i, len(finn_codes)))
        time.sleep(dt)
        print(finn_code)
        try:
            test_ad = fetch_finn_code_url(session, URL_FETCH_AD + finn_code)
            test_ad['id'] = finn_code
            ads.append(test_ad)
        except ConnectionError as e:
            print('Connection error, sleeping...')
            sleep(10)
            print('Continuing')
        except Exception as e:
            print(e)
    return ads


def fetch_finn_code_url(session: requests.Session, finn_url):
    soup_page = BeautifulSoup(session.get(finn_url).text, 'lxml')
    data = {}
    # Handle tabled data
    kvps = {}
    data_dl_elements = soup_page.find_all('dl', attrs={'class': 'r-prl mhn multicol col-count1upto640 col-count2upto768 col-count1upto990 col-count2from990'})
    for elem in data_dl_elements:
        for key, value in zip(elem.find_all('dt'), elem.find_all('dd')):
            kvps[key.contents[0]] = value.contents[0]
    data['tabled'] = kvps
    
    ## Other data
    # Price
    soup_price = soup_page.findAll('div', {'class': 'h1 mtn r-margin', 'data-automation-id': 'value'})
    price = re.sub(r"\D", "", soup_price[0].contents[0])
    data['price'] = price
    
    # Brand
    sp_title = soup_page.findAll('h1', {'class': 'h1 word-break mbn'})
    title = sp_title[0].contents[0]
    data['title'] = title

    # Where
    soup_place = soup_page.findAll('h2')
    for p in soup_place:
        cmp = re.findall(r'\d\d\d\d \w+', p.contents[0])
        if cmp:
            place = cmp[0]
            break
    else:
        place = ''
    data['place'] = place
    
    return data

# Post-processing

In [2]:
import collections

# Order: flatten > strip_items

def flatten(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

def strip_items(flat_ads):
    stripped = []
    for d in flat_ads:
        stripped.append({
            key: value.strip().replace('\n', ' ')
            for key, value in d.items()
        })
    return stripped


# Handle CSV

In [3]:
import csv
import copy
import datetime

def export_data(filename, data):
    fields = set((key for d in data for key in d.keys()))
    with open(filename, 'w', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fields, lineterminator='\n')
        writer.writeheader()
        for d in data:
            writer.writerow(d)

# Processing

DATETIME_FORMATS = ['%d.%m.%Y', '%m.%Y', '%m %Y', '%Y', '%m/%Y']

def standardize_row(row):
    row_cp = copy.deepcopy(row)
    for key, value in row.items():
        if not value:
            continue
        if '1. gang registrert' in key:
            for date_format in DATETIME_FORMATS:
                try:
                    t = datetime.datetime.strptime(value, date_format)  #'20.11.2011'
                except ValueError:
                    continue
                break
            else:
                print('Could not parse date: %s' % value)
                continue
            row_cp[key] = t.strftime(DATETIME_FORMATS[0])
        elif key in ['tabled_Pris eks omreg', 'tabled_Omregistrering']:
            try:
                row_cp[key] = int(re.sub(r"\D", "", value))
            except ValueError:
                print('Could not parse number: %s' % value)
    return row_cp

def standardize_csv(ifn, ofn):
    with open(ifn, encoding='utf-8') as ifile:
        with open(ofn, 'w', encoding='utf-8') as ofile:
            reader = csv.DictReader(ifile)
            fields = next(reader)
            writer = csv.DictWriter(ofile, fields, lineterminator='\n')
            writer.writeheader()
            for row in reader:
                writer.writerow(standardize_row(row))

# Download

In [33]:
t0 = time.perf_counter()

ses = requests.Session()
finn_codes = fetch_finn_codes(ses, 400, 800)
ads = fetch_ads(ses, finn_codes)
flattened_ads = [flatten(d) for d in ads]
stripped_ads = strip_items(flattened_ads)

t1 = time.perf_counter()
print("Elapsed time: {}".format(t1-t0))

Page 400/800
Page 401/800
Page 402/800
Page 403/800
Page 404/800
Page 405/800
Page 406/800
Page 407/800
Page 408/800
Page 409/800
Page 410/800
Page 411/800
Page 412/800
Page 413/800
Page 414/800
Page 415/800
Page 416/800
Page 417/800
Page 418/800
Page 419/800
Page 420/800
Page 421/800
Page 422/800
Page 423/800
Page 424/800
Page 425/800
Page 426/800
Page 427/800
Page 428/800
Page 429/800
Page 430/800
Page 431/800
Page 432/800
Page 433/800
Page 434/800
Page 435/800
Page 436/800
Page 437/800
Page 438/800
Page 439/800
Page 440/800
Page 441/800
Page 442/800
Page 443/800
Page 444/800
Page 445/800
Page 446/800
Page 447/800
Page 448/800
Page 449/800
Page 450/800
Page 451/800
Page 452/800
Page 453/800
Page 454/800
Page 455/800
Page 456/800
Page 457/800
Page 458/800
Page 459/800
Page 460/800
Page 461/800
Page 462/800
Page 463/800
Page 464/800
Page 465/800
Page 466/800
Page 467/800
Page 468/800
Page 469/800
Page 470/800
Page 471/800
Page 472/800
Page 473/800
Page 474/800
Page 475/800
Page 476/800

97916516
72211848
97916003
97915476
97916407
97915233
97916301
97916169
97916128
97915859
97915957
97915662
97907170
95696217
97913685
97916016
97915856
97915952
97915777
97915847
97915730
97915164
97913916
97915714
97915744
97915709
97915542
97915670
98609492
97915193
97915118
94834643
97915440
97911895
94031872
97915279
97915296
70863244
97915270
97915232
97915165
97914459
97915161
97915133
97914531
97905591
97914403
97912880
74880095
97915012
97915043
97915053
97914146
94995076
97907356
97914892
97775329
97913640
95381665
92793212
95721447
97913751
97914677
97361957
97914613
97914621
97914628
97913925
96730381
97914567
97503578
97914481
88071568
97914408
97913971
97052386
97914057
97913543
97914356
Progress: 408/20400
98591751
75977320
97914335
97913151
97913511
97913669
97913659
97909691
97914015
97914024
76667582
97913947
97909800
97913796
97913747
96158484
97580487
97912803
97913583
97913516
97913316
97912349
97913439
97909042
97911854
91954246
97913259
97913293
97912844
97907080

97869634
97873589
97874162
97874860
97873477
97734261
97873442
97873524
97763717
97872990
90806075
97858962
92409606
97855045
97856025
97872986
97873004
97873612
95807958
97868839
97860492
97872505
97869713
97860625
97871848
97870902
90725556
78754153
97871202
19131920
97872592
97870282
97871337
97871670
97833388
97871713
97868481
97865600
97870357
97870215
97869532
75326613
97869632
97866835
96240765
97871671
98418364
97826740
97864875
97277045
97284473
95662968
97865275
66514502
97871199
97871188
97869379
94743435
86752672
97869279
97869024
97867054
97867506
97865256
97860835
97870015
97869897
97865236
97867853
65968446
97867096
97732141
97866244
97869424
97866488
97868097
97866460
97864904
97858036
85479522
95855270
97868639
97863819
87686130
97865138
97864130
97866125
97867313
95549159
97865967
87621748
97862168
84401699
97861322
97799581
97849631
97500826
97441420
97863961
92685053
97694696
97859989
97864336
97857040
67582224
97861175
97864222
97864278
97857500
97862885
97859084
8

97787273
68719291
82438082
97787658
95310481
95440885
97780212
97788684
92998158
97784482
97787578
95841188
97786305
97785085
97785253
97767037
97783754
97783931
97780312
97787159
97786452
97786289
97784601
97547013
97738872
97786010
97784560
97783792
97785400
97763245
97780975
97770923
73104824
97782360
89116825
97784645
97778494
97783246
97782887
97783918
97782259
97780371
91223216
97780335
97784398
97781746
97783983
97783570
87839378
97778148
97782143
97765427
94521016
91386689
97782267
97781096
97782991
97729808
95600552
97731396
97782552
97732917
98638487
97782152
97783900
97624961
97781996
97780236
97779765
97781048
97783157
96365772
97780454
97779440
97780580
97327551
73936598
97782015
97782358
97771459
88992040
90836712
97780002
97780277
97765679
97781987
97779916
97589447
93172573
97781850
97779345
97776564
97775370
97781629
97779964
97778068
97781335
97781242
97778597
97780290
97779225
97779950
97778040
97750242
97779198
67726970
97778520
97779982
97776712
97779191
90038053
9

97694353
97694394
93316050
97694230
97694210
97267995
97692980
97693857
97693855
93509727
97693640
97692620
97693595
93376506
97692772
97693350
80881408
97690678
97691666
97693238
97693150
97692971
97658917
97692823
92918847
97692840
97692726
97690414
Progress: 3060/20400
87139513
97692423
95739304
97692404
97692328
91856796
97691503
93097720
97692141
97692094
93668221
97691996
97686991
97691973
97691160
97691765
97667300
89683971
97691675
97691012
97691600
89189862
97691386
97675960
97691158
97690241
97691102
97691063
97689942
97691071
97689858
97687653
97671241
97690776
97690380
94726673
96866370
93237283
97688543
97688828
97686089
97688799
97309955
97688990
97689849
97689829
92690553
97689336
97689584
97689533
97688403
96130662
97686798
97689328
97689317
76111997
97687567
97688721
97688834
97688809
97687585
97688282
97688176
95344885
95741153
97687901
92364421
97687742
97687504
93939923
97685055
97687303
97687317
93837912
97686419
91741199
95739507
97680215
97685133
97686628
9758807

97639975
97639977
97637592
97639827
97639889
97638293
97639769
97639779
97639797
93063917
97639638
97638486
97636322
81720005
97639568
97633308
97639478
97639524
97639413
92896660
97639335
97639357
97639351
97639301
97639240
97639206
97639243
97639184
97637477
97639009
97639055
97639013
97638119
97638992
97638993
97638948
97637037
97637941
91755123
97637669
97638822
97638887
97638811
97638777
97638814
97638685
97638741
98597845
97638811
97638777
97638814
97638685
97638741
97638597
86946115
97638566
97638498
97638450
97638476
97638438
97638372
97632869
97638364
97630667
97638286
93091510
92242872
97638209
97637388
97638153
97638178
97571342
97638102
97638077
97638027
97637962
97637841
97637429
97637886
97637298
97636246
97637696
83577504
94004975
97528392
97637535
97636688
97637435
97637459
97636775
97637408
97637374
97637364
97633987
97637202
97637181
97636329
97637125
98070354
97634052
97637044
97637086
97636101
97636930
97636826
85897462
97635273
97636557
97635561
97636138
97634185
9

86937633
95127786
86650761
97595485
97596657
92798039
97597087
97585462
97597030
97597056
97593379
97595562
97126809
97595679
97596905
97595157
97596889
97596842
97595164
97596365
97592986
89047872
97593750
97596516
97596537
97153080
97594137
97596507
97596487
97595504
97595149
97596451
97596441
97596431
97593452
97596361
97596355
97595174
97596270
97596244
97596192
97594283
97596036
97595982
97464425
97595955
97595887
97595925
97595823
97594681
97595728
97595700
97595727
97594948
97595683
97595606
97595580
94914310
97595500
94661200
97595457
97595423
97595405
Progress: 4896/20400
95640385
97595131
97592564
97593845
97595257
97595212
97443704
97593529
97594191
97594962
97594993
97594914
97593598
97593404
97593369
97591562
95270707
93024490
97594151
97594629
97594613
97594559
97594512
97594479
97594478
97594451
97592713
97594452
97594434
97594107
97083328
97594318
97594179
97594286
81387829
97593249
97594114
84618323
97593769
97594047
97591519
97593512
78112309
95272873
97593508
9759354

97567701
97567706
86111492
97566718
97567572
97567563
97565134
97567519
97567344
97567312
97567322
97567279
97563136
97567237
97567235
80970731
97567127
97567137
97567129
97566953
97567017
96057317
81977159
97566999
97566925
97566429
97566788
97566712
97566694
97566708
97566618
98569403
97563263
74789420
80004763
85236253
97566453
97566072
97566375
97566389
97566371
97520592
97523846
97566335
97566266
97566281
97566254
97566306
97566285
97566279
97566270
97566257
97566258
97566286
97566272
97565933
97565579
97566221
97566241
97566223
97566227
97565542
97511849
95649886
97564077
97565876
97564944
97565825
97565791
97564037
96590544
97564867
97565493
97564771
97565384
97564000
97564912
97564218
97564144
95588408
95939220
97564758
98531167
84213825
97564706
97564687
97563996
97564636
97563587
97563377
97564276
97564089
97563311
97564073
97563898
97562586
97563806
97563739
97563732
97563607
97563657
97563575
97563605
97563499
97563543
97558624
97563458
97563416
97563353
97563362
97563261
8

97515173
97519243
97519181
97519102
97519012
97457573
97517325
97146834
97518955
97515968
97518920
97518872
97518829
97518861
97517919
97512342
97493025
97518725
91192293
97518646
97518564
97508473
97517902
96492731
97518390
97517844
97517755
97518201
89088173
97518105
95127810
97517459
97517475
97517960
97517879
97517903
90164354
97515935
97514749
97517803
97517854
97517524
97517676
97517512
97507305
97517626
97517522
97517489
90781557
97517439
97516039
97495402
97517219
97516388
97517127
97515859
97516994
97517008
97515981
97515501
97516936
97516928
97516512
97516880
97516748
97516838
97516295
97516794
97516796
97516700
97192704
97516561
97365752
97513884
97516490
97516499
97511177
97516413
92967860
97023510
97516367
97515461
97516188
97515834
97516097
97516061
97516095
97515814
97515969
97516023
97501573
97515864
97510532
97515826
87628987
97504662
96112778
89491561
97515385
Progress: 6732/20400
96770788
97515622
97514523
97515440
97515477
97515390
97511484
97515348
97515328
7628740

97465525
93720925
92465466
97472701
93348172
97472399
97466233
97464467
97471440
97269875
97469993
97466634
97471059
Progress: 7548/20400
95207724
97471906
97470118
97472530
75645172
97471004
97470074
97470381
93478804
96875818
97468349
97469751
97471375
97467851
97469795
92796116
97465386
97470709
97378757
97469859
97468434
92382333
97467380
97470274
91931961
97463435
97468853
97379038
97468694
97469693
97460946
97467925
97468364
97465939
97465478
97467247
97466897
93562182
97468105
95339273
97466426
97467170
97468455
97417164
97466223
97465277
97157654
97467010
97467090
97458292
97464962
95663135
97467533
97467493
97467494
97466297
97464952
97450369
97465298
97467261
97465586
97463533
97447088
97401022
97465208
97456971
84772250
97465895
97458901
97464893
97466206
97464087
94426223
97464723
90247567
97465085
78082772
97464583
97464136
97463202
92278736
97465343
75994717
82081743
97463299
97462531
97465093
97465037
97463661
97462878
95673574
97464251
97463478
97462688
97184581
9746372

91844636
97435554
97434167
97435490
97435339
97435319
97435442
97435389
97435272
97435144
97435169
97435077
97435083
97434613
97435043
97435056
97434959
97434949
97434842
97434898
97434863
97434760
97434753
97433709
97434595
97430578
97433858
97434405
94889388
97434397
97434406
97433169
98694661
97434234
97434268
97434230
97434105
97434035
97433969
97431802
97433908
93312519
97432627
97433846
97433565
97433580
97433588
97432148
97433516
97433511
97433504
97433532
97433496
97433489
97433484
97433376
97433363
97433368
97433423
97432313
97433346
85859141
97433223
97430619
97433160
97433081
97407716
97433021
97431116
97432973
95424788
97432784
97432668
97432659
95296770
97432583
97432446
97432492
97432399
97432376
97432409
97432393
97432418
94257509
97432330
97431533
91030648
97432256
97432275
97432205
97432210
97432220
92362654
71340148
97432089
97429327
97432028
97430967
97428943
97431991
97431391
97431965
97431979
97431866
97430830
97431913
97431899
97429210
97430393
97431734
97431672
9

97381899
97381914
97381321
97381841
97381756
97381789
97380790
97381661
97381698
97381686
97381568
97381534
97381561
97381596
97381448
97381469
97380792
97381328
97381369
97381348
97381108
97381293
97381265
97380416
97381171
97379372
97381059
97379923
97380994
95745507
97380934
97380898
97379713
97379571
97380755
97380784
97377308
97379597
97380498
97380445
97380401
97380356
97378426
97380206
97380151
97380173
97374886
97380112
97378724
Progress: 9384/20400
98073955
97380020
97380051
97380031
93281588
97379986
97379927
97378867
97379168
97379735
74565256
97379438
88806132
97378085
97379300
97379305
97379036
97379213
97379137
97379172
97360856
97379114
97379043
97376879
97374547
97180432
97375821
97378966
97378910
97378902
96290329
97378852
97378839
97378765
97377798
97378723
97375798
97377688
97378594
97377685
97378459
97375536
97378349
97378375
97378399
97378394
76899046
92952575
97375955
97008659
97375730
97074007
97220843
97368211
97378073
97378057
97378001
97369032
97376667
9737675

79351159
97329651
97331398
96905981
97330647
97329267
97327477
97330225
89969085
97329469
97011610
97327040
96904464
97330079
97327870
97329257
97327803
98662661
95804350
97328835
97328760
97325709
97324786
97326798
97328018
97329558
94265070
97327215
97326980
97325636
97328702
93590228
97196828
97326699
92635300
97325934
97327142
83579536
97327475
97322134
95575092
97328400
97322976
97324836
97324962
97324730
93351053
97322965
93155933
89448683
97325955
97327685
97327608
97327545
97323407
97327454
96218381
96968271
68766382
96597687
97320791
97325926
82585735
97322944
97325489
92239415
94801287
97283413
78254369
97190428
97324503
97324397
97321910
97322781
97324811
97322470
71935996
93823136
97317694
89984419
97320917
97321423
97321946
93816923
97321444
94568476
97323020
97321896
97317662
86398645
97321418
97322659
97323410
91607126
97309188
97320115
97320729
97319097
97320852
97319946
97319887
97299729
93984069
94901149
97321433
97321862
97315330
97300848
97318940
93316887
97320488
9

97228002
97227705
93302098
97228966
97229312
97229223
96460221
97228417
97226435
97228325
97228476
83873138
97223673
97228181
96980634
97227779
95085961
97150733
97209774
84231954
85817076
95053002
97227003
97027375
97226341
97093945
97225935
93186922
97226200
97225607
97223795
97184896
97167815
98656280
92584731
88648945
85148154
97224054
97223140
97223184
97224102
95275801
95879235
90260587
96533062
97222683
97222408
97223104
97222979
97222499
97166547
97220521
97220755
97219175
97222635
97217676
97221606
97209294
97222026
97220809
97221428
97220810
97205323
97214484
96548640
97220592
97218457
97092493
97219569
97220742
97218897
94211459
97219793
97220879
97220519
97220914
97217036
97220787
97219875
97084437
97069550
97219109
97219061
97219208
Progress: 11220/20400
96770788
97217176
97219689
97219997
97218612
81636674
97216918
97217091
97098761
94617410
97217217
97217580
97218212
97216936
97211850
89652692
94513564
97218539
97215544
97217706
97214664
97214298
94990272
97217876
971551

85325087
97165807
97163054
97165751
97164030
97164237
95715767
89625195
97089429
92138638
97160709
97165693
97163013
97163922
97164543
92955021
81594008
96047443
97164466
97160434
97165354
97165231
97163340
97163568
97163152
97161057
97163186
97163799
97164606
97164451
97162907
94331356
72925830
97163176
90992912
97164094
97164174
97160277
97158973
85950827
97161062
90589563
97161834
94072238
97159972
97156198
97160493
95337670
97162734
97161401
97162610
98577594
91396479
96823064
90710967
97158893
97160173
97154122
97154785
97162095
97159135
87373767
97159001
97156754
97160159
97161564
97154161
97160458
97160364
97161349
97161144
97160088
93945196
97152569
97160929
94968282
97158784
97160752
97154516
97159902
97158970
97159998
79776282
97159019
97155169
97073723
97157309
97157879
77387742
97157478
97158900
94608180
97158777
97157514
97004213
97158770
97157851
81245107
97154799
97156802
97157367
97154278
97354483
97151652
97155521
97155530
97088545
97141719
97154074
97155170
97154150
9

97070456
97070392
97069194
94705820
97070334
97070315
97058411
97070296
97070291
97070300
97070239
97066891
93034854
97070125
97061617
97067947
97068001
97069862
97054355
97067066
96277251
97069666
97069542
97069525
97069442
97069439
97069441
97069435
97069415
97069382
97069375
97069334
97054429
97064438
97069197
97069178
97069180
97069092
97068918
97061965
97068852
97068818
97068768
97065383
97068476
97068509
97068299
97068313
97068173
97067759
97061739
97067172
97067618
90065172
97066707
97066758
97066834
97065331
97058890
97066145
97065548
97065525
97065347
97065312
97065172
97065242
97065252
97065099
97065122
97065114
97065150
98547316
96989005
97064873
97064910
97064880
97064844
97064643
97064657
97064767
96908151
97062056
97064467
97064555
97064536
97060108
97057166
97054828
97054475
97063821
97063794
96781092
97058960
97063696
97063381
97063425
97063289
97063128
94939734
97062926
97062648
97062558
97062275
97055438
97058085
97059098
97061689
97060436
97061725
97059519
97061492
9

96137802
83231430
96576832
97020139
97020126
97019848
97021666
97018931
97019631
97018597
97016431
97017807
97015268
97018439
67044745
97020622
97017241
95072046
97018670
97020438
91944950
91756808
72973577
97017675
97017631
97000452
97019885
96395504
97017875
97018659
94941175
93254675
97017793
97017612
97014807
97018605
96888717
Progress: 13872/20400
97050586
97018057
97015561
97018956
87351791
97018707
97018674
92426551
97018598
97016915
97016150
88018226
97016166
96938445
97016381
97015995
93420211
95053269
90752715
97015093
97016986
95279837
97012823
97014112
84821633
97014263
97014980
97015038
85892620
97016672
97008483
77257022
97014991
97016335
97014722
97016269
97016282
97016295
97016243
90922163
97016209
97014477
97014775
97013641
97014672
93092146
96963109
97015605
91272633
97015595
95648408
96062000
97013630
97008365
94578455
92400289
97012653
97010779
92041762
97011404
97012581
97010021
97010868
93085176
97011850
94198804
97013412
97013040
97011979
97011325
97013445
969971

96978435
96978383
96978254
96978215
96978209
92175541
96977890
96978091
96978074
96977973
96971196
97613159
96977970
96977632
96977628
96977620
93933390
96977220
96976300
96972178
96977367
96975238
96977317
96976270
96977269
96977245
96977237
96977043
96976757
96976372
96800613
94088264
96976648
94134893
96976552
96976432
96976370
96974935
96976289
96107716
88513538
96976141
96976088
96976075
96975999
96975993
96975083
96975945
96975934
96975912
96874077
96975825
95595199
96975792
96975752
96975637
96975660
93058037
94347755
96975502
96975490
96975471
98532360
96975197
96974971
93285415
96975401
96975339
96975287
96974827
96975167
96975134
96975093
96975086
96974813
96975041
96974962
96974989
96974983
96974538
96974904
96974626
96974852
96974809
96974481
96974715
96974770
96974758
96974628
96971933
96974554
96974576
76202332
96973828
96974493
96974522
96974537
96973446
87034700
96974301
96974264
96974231
96974223
96974183
78806566
96974012
95577429
96920606
96973902
75950593
96973140
9

96921892
96921725
96921705
92065788
96921664
96921611
96921576
96921544
96921555
96921518
96921526
96921091
96921419
96921478
96921350
96917896
96921282
96921109
96920012
96921195
94405120
96919654
96919848
96921071
96920986
96920850
96920889
96919209
93252110
96920809
96909603
96919121
96919763
96918159
95613723
96915602
96920568
96920554
94027022
96917988
96920315
96920353
96920368
74423321
96920192
96920167
96919459
96919229
96920008
96919890
96916041
96919838
96919759
96919765
96919790
96919700
96919728
96919719
96918622
96919573
96919577
96919620
96918738
95642603
96902898
96919449
96919464
96908211
96919308
96919339
85343583
96919152
96919179
96918293
96919098
96919090
96915578
96918921
96918942
Progress: 15708/20400
97004085
96918078
96918833
96918754
96917698
96915258
96918687
96913806
95376564
96916538
96916367
96389921
96918167
92005574
91981786
96916106
96918109
96918045
96917228
96913612
96914570
96917990
96917974
96916731
96914409
95833001
96917421
96917645
96917580
969175

96857281
96855132
96858933
96846597
96858296
68916731
96857967
87178840
96858949
95618278
96851634
83908505
96801712
94758559
96853092
96856144
66921737
96855053
96854408
96855760
96088631
96854483
94875281
96855629
96854637
94385811
96855608
96854626
96855570
94508278
96855112
93732385
96856225
96855697
96854597
92351132
96849897
96852817
96855260
96852401
94824413
96850039
96854060
93738069
96852187
96853877
97074007
96743763
81936884
96850788
96850050
96850632
93120912
96852592
96850059
96842944
96852123
89711250
96852212
96849377
86954131
96852050
96852078
96850114
96849535
96850027
81949709
96850547
96847664
72706670
77917121
96847685
96850879
96677008
96848798
96848347
96848157
96840118
96848464
96847951
96846598
96849606
96848634
96840775
96847279
94412843
96846673
81033366
96845070
96839679
76555835
96845982
96848643
96842474
96844544
96841744
96846605
80680124
96846393
96845950
96714646
96841757
82490036
96845540
96840522
96843628
96844609
96842625
95609617
96842609
96842764
9

94469295
96744305
96745554
88718604
75038110
96743184
96741320
96745361
96711434
95641581
94870221
98574694
82751135
95237415
96678381
96743661
96743231
96744635
96742368
96743347
96744509
96743238
96739701
96742564
89971776
84758056
96742748
96741830
96740986
96739625
96739629
92287905
96741685
96684618
96741396
91758851
96740772
92942516
86686629
96741317
96739439
94778207
96739270
93169612
96690244
96740501
96741200
96739783
96668650
92580038
92575382
91410842
94633896
94363170
94930412
96738750
96737712
96739306
96738110
96739433
96738896
96739328
96770788
96739049
96738927
96738611
96738574
94783353
92894781
96738373
67670027
94337565
96737733
96735069
93681340
96738031
96733633
96737902
96736783
96734997
71823312
96723060
96731211
96735506
96732673
96735851
96734923
82573971
95916963
96735779
96609916
86144757
96734537
96734575
96732293
96734357
96734175
96730040
96732579
96733173
96731487
96732623
list index out of range
96730705
96728391
94436435
96731814
96732062
96731981
9673

96669550
96669393
96669322
94933884
96669208
96669157
96664158
96668045
96668932
96668353
96668849
96668831
96668848
96668770
96668768
96665312
96668620
list index out of range
96666805
70368320
71110738
96668351
96668251
96668169
96668228
74485737
96667942
86621218
96667762
96666310
96667637
96667530
Progress: 18360/20400
98249988
96667476
96665387
96667389
96667436
96667337
96665377
96666856
96666661
96666579
96666110
96666258
96666166
96666177
96666040
96666025
96665949
96665886
96665941
96665948
96626411
96665833
96665773
96665774
96665792
96665721
96665648
85913592
96665539
96665523
96664858
93838321
96665016
96664946
96662926
96664943
96664957
96664934
96664048
96664806
96664808
96664828
96664754
96664577
81455645
90920560
96664350
96664296
96663991
96663968
96664016
97736271
96663836
93090276
96660704
96663771
96663781
96131140
96663741
96663600
96663538
96661888
96663466
96663451
96663376
96663359
96662281
96660937
96663137
96663076
96662987
96662623
96661620
96663002
96661730


96581853
96581837
96465664
96581828
96581800
96581794
96578131
96581697
96581731
96581704
96581702
96581660
96581654
96580784
96580485
96581570
96581601
96581513
96580915
96581443
96581448
96581399
96581393
96579901
96581292
96581239
96581191
96578683
96581053
96580334
96581004
96580942
96580872
96580875
96580901
96580876
92532301
96580838
96580790
96580716
96580718
96579232
96580697
96580636
96580629
96579364
96580414
96580455
96580400
96580143
96580106
96578773
96579588
98610493
96580117
96578982
96576530
96579994
96579959
96572456
96579759
96579665
96573299
96579622
96579615
96579513
96579544
96579462
96579334
96579368
96578459
96579263
96579046
96579242
93432869
96579112
96578603
96578910
96578900
96578169
96578812
96578796
96578835
81779487
96578765
96578736
96578779
96578674
96578647
96578614
96578582
96578565
96577929
96576537
96578507
74569809
96574819
96578245
96578369
96578291
85825737
96578280
96577166
96578177
98107164
96578171
96578069
87722918
96577993
96577934
96577776
9

96509401
96479128
96507162
96506905
96509141
96507409
76476489
96508848
96509024
96508350
96508810
96508916
96508850
96242280
96508644
84727998
96507208
96508683
96508722
84106602
96508722
96507833
96506411
96507654
96508465
96507769
96506716
96508323
96508256
96508298
96508200
96508099
91996468
96508023
96507973
96507717
96506124
96507662
96507517
96507480
94297947
96507330
96504281
96507005
96506701
96507109
86928516
96504561
96506821
83840095
96506572
96506177
96506463
96504903
96506276
96505965
96506304
96506117
96505793
96505652
96505660
96505453
96505362
96504452
96504752
96504717
96502643
96504523
96504536
96504393
Progress: 20196/20400
98315009
96503443
96503498
96503755
96503681
96501482
96503553
96503432
96503385
96503057
96503078
75210950
96500100
96500417
96502702
96502734
96502603
96502513
96502447
96502364
96502312
96502252
96502281
96502192
96501313
96501967
96501892
96501886
96501797
96501719
96500855
96370409
96467860
90446622
96501531
96383078
96501392
96501279
965011

In [34]:
export_data("output-400-800.csv", stripped_ads)

In [38]:
standardize_csv('output-100.csv', 's-output-100.csv')

Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Ikke oppgitt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse date: 9/6-2011
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Ikke oppgitt
Could not parse date: Mars 2012
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse number: Fritatt
Could not parse date: Juni 2012

# Spare

In [11]:
soup_page = BeautifulSoup(ses.get('http://m.finn.no/car/used/ad.html?finnkode=98543643').text, 'lxml')

In [16]:
data_dl_element = soup_page.find_all('dl')[2]
for key, value in zip(data_dl_element.find_all('dt'), data_dl_element.find_all('dd')):
    print(key, value)

<dt data-automation-id="key">Årsmodell</dt> <dd data-automation-id="value">2001</dd>
<dt data-automation-id="key">1. gang registrert</dt> <dd data-automation-id="value">30.04.2001</dd>
<dt data-automation-id="key">Km.stand</dt> <dd data-automation-id="value">227 500 km</dd>
<dt data-automation-id="key">Farge</dt> <dd data-automation-id="value">Sølv</dd>
<dt data-automation-id="key">Girkasse</dt> <dd data-automation-id="value">Manuell</dd>
<dt data-automation-id="key">Hjuldrift</dt> <dd data-automation-id="value">Forhjulsdrift</dd>
<dt data-automation-id="key">Drivstoff</dt> <dd data-automation-id="value">Bensin</dd>
<dt data-automation-id="key">Effekt</dt> <dd data-automation-id="value">105 Hk</dd>
<dt data-automation-id="key">Sylindervolum</dt> <dd data-automation-id="value">1,6 l</dd>
<dt data-automation-id="key">Vekt</dt> <dd data-automation-id="value">1 215 kg</dd>
<dt data-automation-id="key">CO2 utslipp</dt> <dd data-automation-id="value">168 g/km</dd>
<dt data-automation-id="key

In [19]:
b = soup_page.find_all('dl', attrs={'class': 'r-prl mhn multicol col-count1upto640 col-count2upto768 col-count1upto990 col-count2from990'})