In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('all_working_domains.csv')
urls = df['0']
urls[:5]

0    https://1000sotok.ru
1         https://10rg.ru
2      https://12sever.ru
3     https://1470umto.ru
4        https://1lion.ru
Name: 0, dtype: object

In [4]:
PHONE_PATTERNS = [
    r'\(\d{4}\) \d{3}-\d{3}',
    r'\(\d{4}\) \d{2}-\d{2}-\d{2}',
    
    # 1 3 3 (2 2 | 4)
    r'\(?\+?[78][\s-]\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}',

    # 1|0 4 2 (2 2 | 4)
    r'\+\d?[\s-]?\(?\d{4}\)?[\s-]\d{2}[\s-]\d{2}[\s-]?\d{2}',
]

possible_subpages = ['about', 'contacts', 'Контакты', 'О компании', 'company', 'kompaniya', 'o-komanii', 'contact']

In [5]:
from bs4 import BeautifulSoup as bs
import requests
import re

def find_inn(s):
    ans = re.findall(r'[^a-zA-Z0-9\'\"\?\!]\d{10}[^a-zA-Z0-9\'\"\?\!]', s)
    if ans:
        ans = [e.strip() for e in ans]
        return ans
    else:
        return None

def find_ogrn(s):
    ans = tuple(re.findall(r'[^a-zA-Z0-9\'\"\?\!]\d{13}[^a-zA-Z0-9\'\"\?\!]', s))
    if ans:
        ans = [e.strip() for e in ans]
        return ans
    else:
        return None

In [6]:
def extract_address(s):
    pattern = r'\г\.?\s[А-Я][а-я]+.*'
    ans = list(re.findall(pattern, s))
    if ans:
        return list([e.split()[:10] for e in re.findall(pattern, s)])
    else:
        return []

extract_address('Московская область, г. Домодедово, Каширское шоссе, 7, Кабинеты 405(А), 405(Б)')

[['г.',
  'Домодедово,',
  'Каширское',
  'шоссе,',
  '7,',
  'Кабинеты',
  '405(А),',
  '405(Б)']]

In [7]:
def extract_mail(line):
    match = re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', line)
    return match.group(0)

def get_phone_numbers(line):
    global PHONE_PATTERNS
    for pattern in PHONE_PATTERNS:
        if len(re.findall(pattern, line)) > 0:
            return np.unique(re.findall(pattern, line))
        
def get_mails(text):
    if '@' in text:
        try:
            return extract_mail(text)
        except:
            pass
        
def standart_phone(t):
    if t != None:
        res = []
        for p in t:
            r = ''
            for c in p:
                if c.isdigit():
                    r+=c
            res.append(r) 
        return np.unique(res)
    else:
        return None
        
urls = pd.read_csv('all_working_domains.csv', header=None)
urls = urls[urls.columns[0]]
urls = urls[1:]

In [8]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def parse_one_url(url):
    
    session = requests.Session()
    retry = Retry(connect=2, backoff_factor=0)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        req = session.get(url)
        soup = bs(req.text, 'html.parser')
        texts = soup.find_all(text=True)
        texts = u" ".join(t.strip() for t in texts)
        l = [url,  get_phone_numbers(texts), get_mails(texts), find_inn(texts), find_ogrn(texts), extract_address(texts)]
        global possible_subpages
        for sub in possible_subpages:
                r = session.get(url)
                soup = bs(r.text, 'html.parser')
                ts = soup.find_all(text=True)
                ts = u" ".join(t.strip() for t in ts)
                if len(l[1]) < 1:
                    l[1] = get_phone_numbers(ts)
                if l[2] == None:
                    l[2] = get_mails(ts)
            
                if not l[3]:
                    l[3] = find_inn(ts)
                    if not l[4]:
                        l[4] = find_ogrn(ts)
                    
                l[5].append(extract_address(ts))
                l[1] = np.unique([''.join(re.findall(r'\d+', e)) for e in l[1]])
        return l
    except:
        return [None for _ in range(6)]


In [9]:
parse_one_url('https://kraszem.ru')

['https://kraszem.ru',
 array(['74957252603', '74959692929'], dtype='<U11'),
 'info@kraszem.ru',
 None,
 ['_1676460961150{', '_1676460961150:'],
 [['г.',
   'Москва,',
   'пр-т',
   'Андропова,',
   'д.',
   '22,',
   '16',
   'этаж',
   'info@kraszem.ru',
   'Оставить'],
  [['г.',
    'Москва,',
    'пр-т',
    'Андропова,',
    'д.',
    '22,',
    '16',
    'этаж',
    'info@kraszem.ru',
    'Оставить']],
  [['г.',
    'Москва,',
    'пр-т',
    'Андропова,',
    'д.',
    '22,',
    '16',
    'этаж',
    'info@kraszem.ru',
    'Оставить']],
  [['г.',
    'Москва,',
    'пр-т',
    'Андропова,',
    'д.',
    '22,',
    '16',
    'этаж',
    'info@kraszem.ru',
    'Оставить']],
  [['г.',
    'Москва,',
    'пр-т',
    'Андропова,',
    'д.',
    '22,',
    '16',
    'этаж',
    'info@kraszem.ru',
    'Оставить']],
  [['г.',
    'Москва,',
    'пр-т',
    'Андропова,',
    'д.',
    '22,',
    '16',
    'этаж',
    'info@kraszem.ru',
    'Оставить']],
  [['г.',
    'Москва,',
    'пр

In [10]:
from p_tqdm import p_map

num_cpus = 16

results = p_map(parse_one_url, urls[:100], **{"num_cpus": num_cpus})

100%|██████████| 100/100 [00:57<00:00,  1.74it/s]


In [11]:
results

[['https://1000sotok.ru',
  array(['88125150070', '89013021152', '89013021312', '89013038264',
         '89013052188', '89013075315'], dtype='<U11'),
  None,
  None,
  None,
  [[], [], [], [], [], [], [], []]],
 [None, None, None, None, None, None],
 ['https://12sever.ru',
  array(['8362335588', '8362458854'], dtype='<U10'),
  'seversg@bk.ru',
  ['1681765606', '1681765606', '1681765606'],
  ['_1535552722157{',
   '_1538396594893{',
   '_1490364209546{',
   '_1532518586805{',
   '_1530521315572{',
   '_1530521407191{',
   '_1530521464885{',
   '_1536305760231{'],
  [['г.',
    'Йошкар-Ола,',
    'ул.',
    'Чернякова,',
    'поз.',
    '25',
    '(1',
    'этап',
    'строительства)',
    'Объект:'],
   [['г.',
     'Йошкар-Ола,',
     'ул.',
     'Чернякова,',
     'поз.',
     '25',
     '(1',
     'этап',
     'строительства)',
     'Объект:']],
   [['г.',
     'Йошкар-Ола,',
     'ул.',
     'Чернякова,',
     'поз.',
     '25',
     '(1',
     'этап',
     'строительства)',
     'О

In [13]:
df = pd.DataFrame(results, columns =['url', 'tel', 'mail', 'inn', 'ogrn', 'address'], dtype = float)
df

  df = pd.DataFrame(results, columns =['url', 'tel', 'mail', 'inn', 'ogrn', 'address'], dtype = float)


Unnamed: 0,url,tel,mail,inn,ogrn,address
0,https://1000sotok.ru,"[88125150070, 89013021152, 89013021312, 890130...",,,,"[[], [], [], [], [], [], [], []]"
1,,,,,,
2,https://12sever.ru,"[8362335588, 8362458854]",seversg@bk.ru,"[1681765606, 1681765606, 1681765606]","[_1535552722157{, _1538396594893{, _1490364209...","[[г., Йошкар-Ола,, ул., Чернякова,, поз., 25, ..."
3,https://1470umto.ru,[3022217719],info@1470umto.ru,,,"[[г., Москва,, Очаковское, шоссе,, д., 28, с.1..."
4,https://1lion.ru,"[79788046535, 89788580000]",,,,"[[г., Алушта, ул., Вл., Хромых,, 29,, 2, эт., ..."
...,...,...,...,...,...,...
95,https://atlant-complex.ru,[4822631010],,,"[.8355645678836,]","[[г., Тверь,, ул., 15, лет, Октября,, д., 52,,..."
96,https://atlas-realty.ru,"[78624440312, 88002010303, 88624440312, 886244...",fancybox@3.5.7,,,"[[г., Сочи., Дом, утопает, в, зелени, субтропи..."
97,,,,,,
98,https://atmosfera32.ru,[4832770307],info@atmosfera32.ru,,,"[[г., Брянск,, ул., Войстроченко,, д., 3, оста..."
