In [191]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
from datetime import datetime
from itertools import product
import json
import multiprocessing

# this is just a file with a worker function for multiprocessing
# (otherwise multiprocessing doesn't work in Jupyter on Windows)
import worker 

In [192]:
# brand names have been coppied from auto.ru manually and stored in brands.xlsx
df_brands = pd.read_excel('brands.xlsx')
df_brands

Unnamed: 0,Brands,Count
0,AC,2
1,AMC,2
2,Acura,46
3,Alfa Romeo,27
4,Alpina,2
...,...,...
120,ЛуАЗ,11
121,Москвич,101
122,СМЗ,1
123,ТагАЗ,34


In [193]:
# let's remove rare (<400) and Russian/Soviet brands
df_brands = df_brands[(df_brands['Count'] > 400) & ~(df_brands['Brands'].isin(['LADA (ВАЗ)', 'ГАЗ', 'УАЗ']))]

In [194]:
df_brands

Unnamed: 0,Brands,Count
8,Audi,2750
9,BMW,5283
19,Chevrolet,1352
21,Citroen,692
39,Ford,2017
51,Honda,574
53,Hyundai,3470
54,Infiniti,599
59,Jeep,401
60,Kia,3850


In [231]:
def crawl_auto(brands_lst, year_from=1980, year_to=2021, fr=1, to=10000000, radius=200):
    """
    This function crawls auto.ru. Idea for this function was 
    taken from https://github.com/DarkLabel1/YouTube/blob/master/Auto_ru.py
    
    After that it was slightly modified to fit the requirements of kaggle competition
    
    Arguments:
        brands - a list of brands to process
        year_from and year_to - range of car production years
        fr and to - used for testing (you can slice year_brand list to make the output longer or shorter
        if you need to test something)
        radius - radius from Moscow
    """
    # capitalize brand names
    brands = brands_lst.copy()
    brands = [b.upper() for b in brands]
    
    # all the years to consider
    year_range = list(np.arange(year_from, year_to))
    
    # pairs of year and brand to iterate over them
    year_brand = list(product(year_range, brands))
    
    # these 2 won't change
    URL = 'https://auto.ru/-/ajax/desktop/listing/' #URL for the post request

    # header for the post request
    HEADERS = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Content-Length': '137',
        'content-type': 'application/json',
        'Cookie': 'autoru_gdpr=1; _csrf_token=1c0ed592ec162073ac34d79ce511f0e50d195f763abd8c24; autoru_sid=a%3Ag5e3b198b299o5jhpv6nlk0ro4daqbpf.fa3630dbc880ea80147c661111fb3270%7C1580931467355.604800.8HnYnADZ6dSuzP1gctE0Fw.cd59AHgDSjoJxSYHCHfDUoj-f2orbR5pKj6U0ddu1G4; autoruuid=g5e3b198b299o5jhpv6nlk0ro4daqbpf.fa3630dbc880ea80147c661111fb3270; suid=48a075680eac323f3f9ad5304157467a.bc50c5bde34519f174ccdba0bd791787; from_lifetime=1580933172327; from=yandex; X-Vertis-DC=myt; crookie=bp+bI7U7P7sm6q0mpUwAgWZrbzx3jePMKp8OPHqMwu9FdPseXCTs3bUqyAjp1fRRTDJ9Z5RZEdQLKToDLIpc7dWxb90=; cmtchd=MTU4MDkzMTQ3MjU0NQ==; yandexuid=1758388111580931457; bltsr=1; navigation_promo_seen-recalls=true',
        'Host': 'auto.ru',
        'origin': 'https://auto.ru',
        'Referer': 'https://auto.ru/ryazan/cars/mercedes/all/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'x-client-app-version': '202002.03.092255',
        'x-client-date': '1580933207763',
        'x-csrf-token': '1c0ed592ec162073ac34d79ce511f0e50d195f763abd8c24',
        'x-page-request-id': '60142cd4f0c0edf51f96fd0134c6f02a',
        'x-requested-with': 'fetch'
    }    
    
    result = []
    
    for yb in year_brand[fr:to]:  
        i = 1 # initializing i for pagination 
        l = 1 # initializing l that will be replaced by len(data) below
        while l > 0:
            # Post request parameters are changed within the loop
            PARAMS = {
                 'catalog_filter' : [{"mark": yb[1]}],
                 'section': "all",
                 'category': "cars",
                 'sort': "fresh_relevance_1-desc",
                 'page': i,
                 'geo_radius' : str(radius),
                 'year_from' : str(yb[0]),
                 'year_to' : str(yb[0]),
                 'geo_id' : [213]
                }

            i+=1
            
            response = requests.post(URL, json=PARAMS, headers=HEADERS) 
            try:
                data = response.json()['offers']
            except: 
                print(f'Failed for {yb[1]}, year {yb[0]} page {i} - let us go on!')
                
            l = len(data)
            if l > 0:
                print(f'{yb[1]}, year {yb[0]} : {l} entries')
                for o in data:
                    result.append(o)
    
    print('Crawling done!!!')
    return result;   

In [232]:
def save_to_file(j):
    """
    Saves a json string to a file
    """
    jsonString = json.dumps(j, indent=4)
    with open("data.json", "w") as f:
        f.write(jsonString)
        f.close()

In [233]:
j = crawl_auto(brands_lst=list(df_brands['Brands']), year_from=1980, year_to=2022)
save_to_file(j)

FORD, year 1980 : 1 entries
MITSUBISHI, year 1980 : 1 entries
VOLKSWAGEN, year 1980 : 1 entries
VOLVO, year 1980 : 1 entries
AUDI, year 1981 : 3 entries
BMW, year 1981 : 2 entries
CHEVROLET, year 1981 : 2 entries
OPEL, year 1981 : 1 entries
TOYOTA, year 1981 : 2 entries
BMW, year 1982 : 4 entries
CHEVROLET, year 1982 : 1 entries
TOYOTA, year 1982 : 4 entries
VOLKSWAGEN, year 1982 : 1 entries
AUDI, year 1983 : 3 entries
BMW, year 1983 : 4 entries
FORD, year 1983 : 1 entries
PORSCHE, year 1983 : 1 entries
TOYOTA, year 1983 : 2 entries
VOLKSWAGEN, year 1983 : 3 entries
VOLVO, year 1983 : 2 entries
AUDI, year 1984 : 8 entries
BMW, year 1984 : 2 entries
CHEVROLET, year 1984 : 3 entries
FORD, year 1984 : 1 entries
MAZDA, year 1984 : 2 entries
MITSUBISHI, year 1984 : 1 entries
OPEL, year 1984 : 1 entries
PORSCHE, year 1984 : 2 entries
SKODA, year 1984 : 1 entries
TOYOTA, year 1984 : 3 entries
VOLKSWAGEN, year 1984 : 1 entries
VOLVO, year 1984 : 1 entries
AUDI, year 1985 : 4 entries
BMW, year 

VOLKSWAGEN, year 1998 : 38 entries
VOLKSWAGEN, year 1998 : 22 entries
VOLVO, year 1998 : 17 entries
AUDI, year 1999 : 37 entries
AUDI, year 1999 : 3 entries
BMW, year 1999 : 38 entries
CHEVROLET, year 1999 : 3 entries
CITROEN, year 1999 : 1 entries
FORD, year 1999 : 12 entries
HONDA, year 1999 : 19 entries
HYUNDAI, year 1999 : 10 entries
JEEP, year 1999 : 10 entries
KIA, year 1999 : 2 entries
LEXUS, year 1999 : 7 entries
MAZDA, year 1999 : 14 entries
MITSUBISHI, year 1999 : 38 entries
MITSUBISHI, year 1999 : 13 entries
NISSAN, year 1999 : 37 entries
NISSAN, year 1999 : 12 entries
OPEL, year 1999 : 35 entries
PEUGEOT, year 1999 : 11 entries
RENAULT, year 1999 : 6 entries
SKODA, year 1999 : 6 entries
SUBARU, year 1999 : 19 entries
SUZUKI, year 1999 : 9 entries
TOYOTA, year 1999 : 37 entries
TOYOTA, year 1999 : 24 entries
VOLKSWAGEN, year 1999 : 37 entries
VOLKSWAGEN, year 1999 : 32 entries
VOLVO, year 1999 : 6 entries
AUDI, year 2000 : 37 entries
AUDI, year 2000 : 2 entries
BMW, year 200

HONDA, year 2006 : 32 entries
HYUNDAI, year 2006 : 38 entries
HYUNDAI, year 2006 : 37 entries
HYUNDAI, year 2006 : 37 entries
HYUNDAI, year 2006 : 19 entries
INFINITI, year 2006 : 14 entries
JEEP, year 2006 : 9 entries
KIA, year 2006 : 38 entries
KIA, year 2006 : 37 entries
KIA, year 2006 : 34 entries
LEXUS, year 2006 : 26 entries
MINI, year 2006 : 2 entries
MAZDA, year 2006 : 37 entries
MAZDA, year 2006 : 37 entries
MAZDA, year 2006 : 24 entries
MITSUBISHI, year 2006 : 38 entries
MITSUBISHI, year 2006 : 37 entries
MITSUBISHI, year 2006 : 37 entries
MITSUBISHI, year 2006 : 25 entries
NISSAN, year 2006 : 38 entries
NISSAN, year 2006 : 37 entries
NISSAN, year 2006 : 37 entries
NISSAN, year 2006 : 36 entries
OPEL, year 2006 : 38 entries
OPEL, year 2006 : 32 entries
PEUGEOT, year 2006 : 37 entries
PEUGEOT, year 2006 : 11 entries
PORSCHE, year 2006 : 7 entries
RENAULT, year 2006 : 37 entries
RENAULT, year 2006 : 37 entries
RENAULT, year 2006 : 9 entries
SKODA, year 2006 : 29 entries
SUBARU,

KeyError: 'offers'

In [227]:
len(j)

2527