In [1]:
import time
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Load the source table; the scraping loop below reads its id, address, lat and lng columns.
df = pd.read_csv('dataset_fixed-3.csv', sep=',')

In [3]:
# Create the directory that receives the downloaded pages (-p: no error if it already exists).
!mkdir -p 2gis

### Обкачиваем 2GIS

In [4]:
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [5]:
# Start a Chrome session controlled through Selenium.
driver = webdriver.Chrome()
# Implicit wait of 5 seconds for element lookups on this driver.
driver.implicitly_wait(5)
# Open the Moscow page of 2GIS, which hosts the search box used by the scraping loop.
driver.get("https://2gis.ru/moscow")

In [None]:
# Download one 2GIS result page per dataset row into 2gis/<id>.html.
#
# Resuming after an interruption: assign the id of the first row that still
# has to be fetched to `next_id`; every row before it is skipped and the
# marker is cleared as soon as that row is reached.
address_prev = None

next_id = None


def find_search_box():
    """Return the 2GIS search input, looked up afresh on every call.

    Uses ``find_element(By.NAME, ...)`` because the ``find_element_by_*``
    shortcuts were removed from Selenium 4; this form is accepted by older
    releases as well.
    """
    return driver.find_element(By.NAME, "search[query]")


for r in df.itertuples():
    if next_id is not None and r.id != next_id:
        continue

    next_id = None

    # Consecutive rows that share an address reuse the page already loaded in
    # the browser, so a new search is issued only when the address changes.
    if r.address != address_prev:
        try:
            find_search_box().click()
        except ElementNotVisibleException:
            # The search box is not visible: reload the start page and retry once.
            driver.get("https://2gis.ru/moscow")
            find_search_box().click()

        find_search_box().clear()
        # The query is the coordinate pair, not the address text.
        find_search_box().send_keys(f'{r.lat} {r.lng}')
        find_search_box().send_keys(Keys.ENTER)

        # Randomised pause of 2.0-2.5 s before the page source is saved.
        time.sleep(2 + random.random() * 0.5)

    # NOTE(review): the dump is written with the platform default encoding;
    # the parsing cell reads it back with the same default, so change both
    # together if an explicit encoding is ever introduced.
    with open(f'2gis/{r.id}.html', mode='w') as f_name:
        f_name.write(driver.page_source)

    address_prev = r.address

In [13]:
# quit() ends the whole WebDriver session and stops the driver process;
# close() would only close the current browser window.
driver.quit()

### Обрабатываем скачанные странички

In [1]:
import os
import re

from operator import attrgetter, itemgetter

from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

In [2]:
class UknownCaseException(Exception):
    """Raised by the page parsers when a page does not match the layout they expect."""

In [3]:
def process_building(soup):
    """Extract building attributes from a parsed 2GIS building card.

    Parameters
    ----------
    soup
        Parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup tree in
        this notebook).

    Returns
    -------
    dict
        ``name``   - texts of the listed firms joined with newlines ('' when
                     the firm counter is 0 or absent);
        ``type``   - building description before the last comma ('' when the
                     description has no comma);
        ``floors`` - first run of digits in the part after the last comma;
        ``norgs``  - value of the firms counter, 0 when the section is absent;
        ``rating`` / ``rating_style`` - present only for a positive rating.

    Raises
    ------
    UknownCaseException
        If the page has no ``_purpose_building`` block, or that block holds
        no floor count.
    """
    def class_suffix(tag, prefix):
        # The value is encoded in a CSS class that starts with `prefix`;
        # keep whatever follows the last underscore of that class.
        return next(c for c in tag.attrs['class'] if c.startswith(prefix)).split('_')[-1]

    purpose = soup.find('div', class_='_purpose_building')
    if not purpose:
        raise UknownCaseException('Unknown case')

    parts = purpose.text.rsplit(',', 1)
    if len(parts) == 2:
        type_, floors_text = parts
    else:
        type_, floors_text = '', parts[0]

    # Raw string: '\d' is an invalid escape sequence in a normal literal.
    floors_match = re.search(r'\d+', floors_text)
    if floors_match is None:
        # Previously this surfaced as AttributeError on None.group().
        raise UknownCaseException(f'No floor count in {purpose.text!r}')
    floors = int(floors_match.group(0))

    firms = soup.find('section', class_='card__firmsInBuilding')
    if firms:
        n_orgs = int(firms.find('div', class_='cardInfrastructureItem__counter').text)
    else:
        n_orgs = 0

    if n_orgs > 0:
        items = soup.find_all('li', class_='cardInfrastructureFirmsList__listItem')
        name = '\n'.join(item.text for item in items)
    else:
        name = ''

    rating, rating_style = -1, None

    stars = soup.find('div', class_='customRating__stars')
    if stars:
        rating = int(class_suffix(stars, '_value'))
        rating_style = 'customRating'

    # A plain 'rating' block, when present, overrides the custom stars.
    plain = soup.find('div', class_='rating')
    if plain:
        rating = int(class_suffix(plain, '_value'))
        rating_style = class_suffix(plain, '_style')

    res = {
        'name': name,
        'type': type_,
        'floors': floors,
        'norgs': n_orgs,
    }

    if rating > 0 and rating_style:
        res.update({'rating': rating, 'rating_style': rating_style})

    return res

In [4]:
def process_organisation(soup):
    """Extract attributes from a parsed 2GIS organisation card.

    Parameters
    ----------
    soup
        Parsed page exposing ``find`` (a BeautifulSoup tree in this notebook).

    Returns
    -------
    dict
        ``isorg``  - always 1;
        ``name``   - text of the card header;
        ``type``   - text of the header description, '' when absent;
        ``floors`` - first run of digits in the short building info, 0 when
                     the block is absent or contains no digits;
        ``norgs``  - value of the firms counter, 0 when the section is absent;
        ``rating`` / ``rating_style`` - present only for a positive rating.

    Raises
    ------
    UknownCaseException
        If the page also carries a ``_purpose_building`` block.
    """
    def class_suffix(tag, prefix):
        # The value is encoded in a CSS class that starts with `prefix`;
        # keep whatever follows the last underscore of that class.
        return next(c for c in tag.attrs['class'] if c.startswith(prefix)).split('_')[-1]

    name = soup.find('h1', class_='cardHeader__headerNameText').text

    description = soup.find('div', class_='cardHeader__headerDescriptionText')
    type_ = description.text if description else ''

    floors = 0
    building_info = soup.find('div', class_='_purpose_shortbuildinginfo')
    if building_info:
        # Raw string: '\d' is an invalid escape sequence in a normal literal.
        floors_match = re.search(r'\d+', building_info.text)
        # No digits -> keep the same 0 fallback used when the block is absent
        # (previously AttributeError on None.group()).
        if floors_match is not None:
            floors = int(floors_match.group(0))

    if soup.find('div', class_='_purpose_building'):
        raise UknownCaseException()

    firms = soup.find('section', class_='card__firmsInBuilding')
    if firms:
        n_orgs = int(firms.find('div', class_='cardInfrastructureItem__counter').text)
    else:
        n_orgs = 0

    rating, rating_style = -1, None

    stars = soup.find('div', class_='customRating__stars')
    if stars:
        rating = int(class_suffix(stars, '_value'))
        rating_style = 'customRating'

    # A plain 'rating' block, when present, overrides the custom stars.
    plain = soup.find('div', class_='rating')
    if plain:
        rating = int(class_suffix(plain, '_value'))
        rating_style = class_suffix(plain, '_style')

    res = {
        'isorg': 1,
        'name': name,
        'type': type_,
        'floors': floors,
        'norgs': n_orgs,
    }

    if rating > 0 and rating_style:
        res.update({'rating': rating, 'rating_style': rating_style})

    return res

In [5]:
def process_media_content(soup):
    """Extract attributes from a parsed 2GIS media card.

    Returns a dict with ``isorg`` (always 1), ``name`` (media card header
    text) and ``norgs`` (address link count, 0 when absent); ``rating`` and
    ``rating_style`` are added only when a positive rating was found.
    """
    def class_suffix(tag, prefix):
        # First CSS class starting with `prefix`, reduced to the part after
        # its last underscore.
        matching = (cls for cls in tag.attrs['class'] if cls.startswith(prefix))
        return next(matching).split('_')[-1]

    header = soup.find('h1', class_='mediaCardHeader__cardHeaderName')
    name = header.text

    link_count = soup.find('span', class_='mediaAddress__linkCount')
    n_orgs = 0 if not link_count else int(link_count.text)

    rating = -1
    rating_style = None

    stars = soup.find('div', class_='customRating__stars')
    if stars:
        rating = int(class_suffix(stars, '_value'))
        rating_style = 'customRating'

    # Evaluated second, so a plain 'rating' block overrides the custom stars.
    plain = soup.find('div', class_='rating')
    if plain:
        rating = int(class_suffix(plain, '_value'))
        rating_style = class_suffix(plain, '_style')

    res = {'isorg': 1, 'name': name, 'norgs': n_orgs}

    if rating > 0 and rating_style:
        res['rating'] = rating
        res['rating_style'] = rating_style

    return res

In [6]:
def process_other_case(soup):
    """Extract attributes from a page that matched none of the other layouts.

    Parameters
    ----------
    soup
        Parsed page exposing ``find`` (a BeautifulSoup tree in this notebook).

    Returns
    -------
    dict
        ``name``   - text of the ``card__name`` header;
        ``type``   - building description before the last comma ('' when the
                     description has no comma);
        ``floors`` - first run of digits in the part after the last comma;
        ``norgs``  - value of the firms counter, 0 when the section is absent;
        ``rating`` / ``rating_style`` - present only for a positive rating.

    Raises
    ------
    UknownCaseException
        If the page has no ``_purpose_building`` block, or that block holds
        no floor count.
    """
    def class_suffix(tag, prefix):
        # The value is encoded in a CSS class that starts with `prefix`;
        # keep whatever follows the last underscore of that class.
        return next(c for c in tag.attrs['class'] if c.startswith(prefix)).split('_')[-1]

    name = soup.find('h1', class_='card__name').text

    purpose = soup.find('div', class_='_purpose_building')
    if not purpose:
        raise UknownCaseException('Unknown case')

    parts = purpose.text.rsplit(',', 1)
    if len(parts) == 2:
        type_, floors_text = parts
    else:
        type_, floors_text = '', parts[0]

    # Raw string: '\d' is an invalid escape sequence in a normal literal.
    floors_match = re.search(r'\d+', floors_text)
    if floors_match is None:
        # Previously this surfaced as AttributeError on None.group().
        raise UknownCaseException(f'No floor count in {purpose.text!r}')
    floors = int(floors_match.group(0))

    firms = soup.find('section', class_='card__firmsInBuilding')
    if firms:
        n_orgs = int(firms.find('div', class_='cardInfrastructureItem__counter').text)
    else:
        n_orgs = 0

    rating, rating_style = -1, None

    stars = soup.find('div', class_='customRating__stars')
    if stars:
        rating = int(class_suffix(stars, '_value'))
        rating_style = 'customRating'

    # A plain 'rating' block, when present, overrides the custom stars.
    plain = soup.find('div', class_='rating')
    if plain:
        rating = int(class_suffix(plain, '_value'))
        rating_style = class_suffix(plain, '_style')

    res = {
        'name': name,
        'type': type_,
        'floors': floors,
        'norgs': n_orgs,
    }

    if rating > 0 and rating_style:
        res.update({'rating': rating, 'rating_style': rating_style})

    return res

In [7]:
from multiprocessing import Pool
from contextlib import suppress

In [8]:
def process_dumpfile(filename, root):
    """Parse one saved page and return its record.

    ``filename`` is the dump's file name inside ``root``; the part before the
    extension is converted to int and returned under ``'id'``. The remaining
    keys come from whichever page parser matches the page. When no known
    layout marker is found, ``process_other_case`` is tried and its
    ``UknownCaseException`` is suppressed, leaving a record with ``'id'`` only.
    Any other parser error is re-raised after the id has been printed.
    """
    stem = os.path.splitext(filename)[0]

    with open(os.path.join(root, filename)) as dump:
        soup = BeautifulSoup(dump, 'html.parser')

    dump_id = int(stem)

    # Layout markers, checked in priority order: organisation, building, media.
    has_org = soup.find('div', class_='cardHeader__headerName') is not None
    has_building = soup.find('span', class_='card__namePart') is not None
    has_media = soup.find('div', class_='mediaCardHeader__card') is not None

    parsed = {}

    try:
        if has_org:
            parsed = process_organisation(soup)
        elif has_building:
            parsed = process_building(soup)
        elif has_media:
            parsed = process_media_content(soup)
        else:
            with suppress(UknownCaseException):
                parsed = process_other_case(soup)
    except Exception as exc:
        # Show which dump failed before propagating the error.
        print(stem)
        raise exc

    return {'id': dump_id, **parsed}

In [9]:
# Only the top level of the dump directory is needed, so take the first
# os.walk() entry. A missing directory yields no entries at all; fail with a
# clear message instead of leaving root/filenames undefined.
try:
    root, dirs, filenames = next(os.walk('2gis'))
except StopIteration:
    raise FileNotFoundError("dump directory '2gis' does not exist or cannot be read") from None

# The scraper writes one '<id>.html' file per row. Skip anything else (e.g.
# OS metadata files), whose name would not convert to an integer id.
filenames = [fname for fname in filenames if fname.endswith('.html')]

In [10]:
# Parse a single dump and display its record before processing all files.
process_dumpfile(filenames[0], root)

{'id': 2481,
 'isorg': 1,
 'name': 'Измайлово',
 'type': 'Гостиничный комплекс',
 'floors': 31,
 'norgs': 104,
 'rating': 90,
 'rating_style': 'customRating'}

In [11]:
%%time

from functools import partial
    
pool = Pool(10)
df_names = pool.map(partial(process_dumpfile, root=root), filenames)

CPU times: user 553 ms, sys: 301 ms, total: 855 ms
Wall time: 14min 55s


In [12]:
# Build a table from the per-page records and preview its first ten rows.
df_names = pd.DataFrame(df_names)
df_names.head(10)

Unnamed: 0,floors,id,isorg,name,norgs,rating,rating_style,type
0,31.0,2481,1.0,Измайлово,104.0,90.0,customRating,Гостиничный комплекс
1,9.0,2397,,"Малинэль, парикмахерский магазин, ИП Дмитриева...",8.0,100.0,smallBlue,Жилой дом с административными помещениями
2,9.0,6436,,"Банк ВТБ, ПАО\nБанкомат, Банк ВТБ, ПАО\nСвязис...",13.0,,,Жилой дом с административными помещениями
3,,3314,,,,,,
4,,3337,,,,,,
5,2.0,4601,1.0,Абсолют,14.0,40.0,customRating,Торговая группа
6,2.0,1470,1.0,М5 Молл,183.0,90.0,customRating,Торгово-развлекательный центр
7,,4921,,,,,,
8,,1454,,,,,,
9,5.0,7825,1.0,Варшавская плаза,51.0,100.0,customRating,Бизнес-центр


In [16]:
# Blank out both rating columns for rows whose rating style is 'minicard'.
mask = df_names['rating_style'].eq('minicard')
df_names.loc[mask, ['rating', 'rating_style']] = np.nan

In [20]:
# List the rating styles that remain after the 'minicard' rows were blanked.
df_names['rating_style'].unique()

array(['customRating', 'smallBlue', nan, 'mediumGray'], dtype=object)

In [21]:
# Replacement values for missing data: the column median for the numeric
# building attributes, an empty string for text, and an explicit
# "no rating" marker for the rating columns.
fill_values = {
    'floors': df_names['floors'].median(),
    'norgs': df_names['norgs'].median(),
    'name': '',
    'type': '',
    'rating': 0.0,
    'rating_style': 'noRating',
}
df_names = df_names.fillna(fill_values)
df_names.head()

Unnamed: 0,floors,id,isorg,name,norgs,rating,rating_style,type
0,31.0,2481,1.0,Измайлово,104.0,90.0,customRating,Гостиничный комплекс
1,9.0,2397,,"Малинэль, парикмахерский магазин, ИП Дмитриева...",8.0,100.0,smallBlue,Жилой дом с административными помещениями
2,9.0,6436,,"Банк ВТБ, ПАО\nБанкомат, Банк ВТБ, ПАО\nСвязис...",13.0,0.0,norating,Жилой дом с административными помещениями
3,4.0,3314,,,14.0,0.0,norating,
4,4.0,3337,,,14.0,0.0,norating,


In [24]:
# Confirm that the missing rating styles were replaced by 'noRating'.
df_names['rating_style'].unique()

array(['customRating', 'smallBlue', 'noRating', 'mediumGray'], dtype=object)

In [33]:
# Encode the rating style as the position of its label in this ordered list
# (labels outside the list are coded as -1).
# NOTE: not idempotent - running this cell a second time feeds the integer
# codes back in, and every row becomes -1.
rating_style_order = ['noRating', 'smallBlue', 'mediumGray', 'customRating']
rating_style_cat = pd.Categorical(
    df_names['rating_style'], categories=rating_style_order, ordered=True)
df_names['rating_style'] = rating_style_cat.codes

In [34]:
def process_name(name):
    """Clean a newline-separated list of organisation names.

    Lines containing 'Банкомат, ' are dropped; any remaining line that
    contains 'ст. метро' is replaced by exactly that marker. The surviving
    lines are joined with newlines again.
    """
    kept = []
    for line in name.split('\n'):
        if 'Банкомат, ' in line:
            continue
        kept.append('ст. метро' if 'ст. метро' in line else line)
    return '\n'.join(kept)

In [35]:
def process_type(s):
    r"""Strip a trailing locality suffix from a building type string.

    Everything from a locality abbreviation ('г.', 'Г.', 'Рп.', 'пос.',
    'Пос.') followed by a space up to the end of the string is removed,
    together with a directly preceding ', '.

    The abbreviation has to start at a word boundary (or right after ', ').
    The pattern used to be a normal string literal, where ``\b`` is a
    backspace character, and the boundary group was optional, so the pattern
    also matched inside words: 'Торг. центр' was cut down to 'Тор'.
    """
    return re.sub(r'(?:, |\b)([гГ]|Рп|[пП]ос)\. (?:.*?)$', '', s)

In [36]:
# Clean the two text columns with their dedicated helpers ...
for column, cleaner in (('name', process_name), ('type', process_type)):
    df_names[column] = df_names[column].map(cleaner)

# ... and store the (already filled) numeric building attributes as integers.
df_names = df_names.astype({'floors': int, 'norgs': int})

In [37]:
from collections import Counter

# Frequency of each building type among the rows that have no name,
# most common first.
unnamed_types = df_names.loc[df_names['name'] == '', 'type']
for type_label, count in Counter(unnamed_types).most_common():
    print(f"{type_label:<45s} {count}")

                                              2131
Жилой дом                                     96
Частный дом                                   60
Административное здание                       54
Жилой дом с административными помещениями     32
Малоэтажный жилой дом                         10
Хозяйственный корпус                          9
Производственный корпус                       8
Сооружение                                    4
Ремонтируемое здание                          4
Проходная, КПП                                3
Медицинское учреждение                        3
Коттедж                                       2
Магазин                                       2
Склад                                         2
Строящееся административное здание            1
Общежитие                                     1
Таунхаус                                      1
Гараж                                         1


In [38]:
# Building types seen on rows without a name, minus three types that are
# kept out of the set so they still get appended to the comment below.
typeset = set(df_names.loc[df_names['name'] == '', 'type'].unique()) - {
    'Медицинское учреждение', 'Магазин', 'Проходная, КПП',
}

In [39]:
# The comment is the name; for rows whose type is not in `typeset`, the type
# is appended on a new line.
mask = ~df_names['type'].isin(typeset)
name_with_type = df_names['name'] + '\n' + df_names['type']
df_names['comment'] = df_names['name'].mask(mask, name_with_type)

df_names.head()

Unnamed: 0,floors,id,isorg,name,norgs,rating,rating_style,type,comment
0,31,2481,1.0,Измайлово,104,90.0,3,Гостиничный комплекс,Измайлово\nГостиничный комплекс
1,9,2397,,"Малинэль, парикмахерский магазин, ИП Дмитриева...",8,100.0,1,Жилой дом с административными помещениями,"Малинэль, парикмахерский магазин, ИП Дмитриева..."
2,9,6436,,"Банк ВТБ, ПАО\nСвязист, телекоммуникационная к...",13,0.0,0,Жилой дом с административными помещениями,"Банк ВТБ, ПАО\nСвязист, телекоммуникационная к..."
3,4,3314,,,14,0.0,0,,
4,4,3337,,,14,0.0,0,,


In [40]:
# Drop the 'isorg' column, then write the table to disk without the row index.
df_names = df_names.drop(columns=['isorg'])
df_names.to_csv('dataset_2gis.csv', sep=',', index=False)