In [84]:
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
import re
from requests.auth import HTTPDigestAuth
import os
import tqdm
import json
import datetime
from pandas.core.common import flatten
import traceback 
import webbrowser
from time import sleep

In [85]:
# HTTP headers passed to every requests call made by the UrlParse*/parse_* functions below.
# The User-Agent string identifies the client as a desktop Chrome browser.
headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  
          }

In [86]:
def proceed_data(file:str, pat_gost='(ГОСТ[ ]*[0-9]+)|(ИСО[ ]*[0-9]+)', pat_name='([а-яА-ЯёЁ]{4,})'):
    """
    =====================================================================
    
    Prepare the product nomenclature so that it yields useful search
    queries for the parsers.
    
    The workbook is read with the '№' column as index and the
    'Наименование' column renamed to 'FullName'. Two columns are added:
    'Gost' (one normalised standard designation such as 'ГОСТ 7798') and
    'Titles' (the descriptive words of the name, capitalised).
    
    Parameters
    ----------
    file : path to the Excel workbook with the nomenclature of products to find.
    pat_gost : regular expression used to detect GOST/ISO designations.
    pat_name : regular expression used to detect the words of the product title.
    
    Returns
    -------
    pd.DataFrame with the columns 'FullName', 'Gost' and 'Titles'
    (plus any other column present in the workbook).
    
    =====================================================================
    """
    nomenclature = pd.read_excel(file, index_col='№').rename(columns={'Наименование':'FullName'})
    nomenclature['FullName'] = (
        nomenclature['FullName'].str.strip('*').str.strip().str.replace('\xa0', '', regex=False)
    )

    def _standards(full_name):
        # re.findall returns tuples when the pattern has several groups and
        # plain strings otherwise; keep only the non-empty matched pieces.
        found = []
        for match in re.findall(pat_gost, full_name):
            pieces = match if isinstance(match, tuple) else (match,)
            found.extend(piece for piece in pieces if piece)
        return found

    def _main_standard(full_name):
        found = _standards(full_name)
        if not found:
            return ''
        # A single designation is used as is; when several are present the
        # second one is kept (same choice as the original implementation).
        chosen = found[0] if len(found) == 1 else found[1]
        # Drop all whitespace, then re-insert exactly one space between the
        # letters and the number: 'ГОСТ   7798' / 'ГОСТ7798' -> 'ГОСТ 7798'.
        compact = ''.join(chosen.split())
        return ' '.join(re.split(r'(\d.*)', compact)).strip()

    def _title(full_name):
        # The bare word 'ГОСТ' is not part of the title. Filtering (instead of
        # pop(index(...))) also works for names that do not contain it at all.
        words = [word for word in re.findall(pat_name, full_name) if word != 'ГОСТ']
        return ' '.join(words).lower().capitalize()

    # Iterate over the values positionally so that the result does not depend
    # on the '№' index being exactly 1..n.
    full_names = list(nomenclature['FullName'])
    nomenclature['Gost'] = [_main_standard(name) for name in full_names]
    nomenclature['Titles'] = [_title(name) for name in full_names]
    return nomenclature

In [87]:
# Load the nomenclature workbook and derive the 'Gost' and 'Titles' columns.
data = proceed_data('poiskpostav_v1.xlsx')

In [134]:
def searchQuery(nomenclature:pd.DataFrame, mode='all'):
    """
    =====================================================================
    
    Build the list of search queries (one per product) used by the parsers.
    
    Parameters
    ----------
    nomenclature : transformed nomenclature of products to find; must have
        the columns 'FullName', 'Gost' and 'Titles'.
    mode : {'all', 'gost', 'reasonable'}, default 'all'
        Optional parameter for constructing search queries.
        If 'all' uses FullName of product for search query.
        If 'gost' uses Gost of product for search query.
        If 'reasonable' uses combination of Gost and Titles for search query.
    
    Returns
    -------
    list of str, in the row order of `nomenclature`.
    
    Raises
    ------
    ValueError : if `mode` is not one of the supported values.
    
    =====================================================================
    
    """
    if mode == 'all':
        return list(nomenclature.FullName)
    if mode == 'gost':
        return list(nomenclature.Gost)
    if mode == 'reasonable':
        # strip() avoids a leading/trailing blank when Gost or Titles is empty
        return [' '.join(pair).strip() for pair in zip(nomenclature.Gost, nomenclature.Titles)]
    raise ValueError(f"Unsupported mode {mode!r}; expected 'all', 'gost' or 'reasonable'")


In [89]:
# Build one search query per product from its GOST designation only.
searcher = searchQuery(data,mode='gost')

In [90]:
def UrlParseApi(searcher, headers, log=True):
    """
    =====================================================================
    
    Collect urls of products from https://apipost.ru/ to use them to parse products' information
    
    Parameters
    ----------
    searcher : list of search queries created by searchQuery()
    headers : headers that are used to connect to the site
    log : if True (default) show a tqdm progress bar while searching
    
    Returns
    -------
    dict mapping the position of a query in `searcher` to the list of advert
    urls found for it. Queries without any result have no key.
    
    =====================================================================
    """
    from urllib.parse import quote_plus

    products = dict()
    for product, query in enumerate(tqdm.tqdm(searcher, disable=not log)):
        # quote_plus keeps the original "space -> +" convention and also
        # escapes characters such as '/', '&' or '#' that would break the url.
        search_url = f'https://apipost.ru/obyavleniya/catalog/param_text_search={quote_plus(query)}'
        page = BeautifulSoup(rq.get(search_url, headers=headers).text, 'html.parser')
        for row in page.find_all('tr', {'class':'tr_tab'}):
            link = row.find('a', {'class':'zot'})
            href = link.get('href') if link is not None else None
            if href:  # rows without an advert link are skipped
                products.setdefault(product, []).append(href)
    return products


In [91]:
def UrlParseOpt(searcher, headers, log=True):
    """
    =====================================================================
    
    Collect urls of products from https://www.opt-union.ru/ to use them to parse products' information
    
    Parameters
    ----------
    searcher : list of search queries created by searchQuery()
    headers : headers that are used to connect to the site
    log : if True (default) show a tqdm progress bar while searching
    
    Returns
    -------
    dict mapping the position of a query in `searcher` to the list of
    absolute advert urls found for it. Queries without any result have no key.
    
    =====================================================================
    """
    from urllib.parse import quote, urljoin

    base_url = 'https://www.opt-union.ru'
    products = dict()
    for product, query in enumerate(tqdm.tqdm(searcher, disable=not log)):
        # Only the first result page is requested.
        for page in range(1,2):
            search_url = f'{base_url}/search.php?words={quote(query, safe="")}&searchtype=3&page={page}'
            soup = BeautifulSoup(rq.get(search_url, headers=headers).text, 'html.parser')
            for block in soup.find_all('div', {'class': 'data-block bordered'}):
                heading = block.find('div', {'class': 'h2'})
                link = heading.find('a') if heading else None
                href = link.get('href') if link is not None else None
                if href:
                    # urljoin copes with relative as well as absolute hrefs
                    products.setdefault(product, []).append(urljoin(base_url, href))
    return products

In [110]:
def UrlParseOb(searcher, headers, log=True):
    """
    =====================================================================
    
    Collect urls of products from https://www.oborudunion.ru/ to use them to parse products' information
    
    Parameters
    ----------
    searcher : list of search queries created by searchQuery()
    headers : headers that are used to connect to the site
    log : if True (default) show a tqdm progress bar while searching
    
    Returns
    -------
    dict mapping the position of a query in `searcher` to the list of
    advert urls found for it. Queries without any result have no key.
    
    =====================================================================
    """
    from urllib.parse import quote, urljoin

    base_url = 'https://www.oborudunion.ru'
    products = dict()
    for product, query in enumerate(tqdm.tqdm(searcher, disable=not log)):
        # Only the first result page is requested.
        for page in range(1,2):
            search_url = f'{base_url}/search.php?words={quote(query, safe="")}&searchtype=3&page={page}'
            soup = BeautifulSoup(rq.get(search_url, headers=headers).text, 'html.parser')
            for block in soup.find_all('div', {'class': 'data-block bordered'}):
                heading = block.find('div', {'class': 'h2'})
                link = heading.find('a') if heading else None
                href = link.get('href') if link is not None else None
                if href:
                    # An absolute href is returned unchanged by urljoin; a
                    # relative one is resolved against the site root, so the
                    # stored value can always be requested directly.
                    products.setdefault(product, []).append(urljoin(base_url, href))
    return products

In [92]:
# Collect opt-union.ru advert urls for every search query.
productsOpt = UrlParseOpt(searcher, headers)

100%|██████████| 40/40 [00:52<00:00,  1.31s/it]


In [93]:
# Collect apipost.ru advert urls for every search query.
productsApi = UrlParseApi(searcher, headers)

100%|██████████| 40/40 [00:11<00:00,  3.60it/s]


In [111]:
# Collect oborudunion.ru advert urls for every search query.
productsOb = UrlParseOb(searcher,headers)

100%|██████████| 40/40 [00:15<00:00,  2.55it/s]


In [94]:
# GOST designation of every product, in row order; the parse_* functions use it to key their results.
gosts = list(data.Gost)

In [172]:
def parse_optunion(products, gosts, headers):
    """    
    =====================================================================
    
    Collect products' information from https://www.opt-union.ru/
    
    Every advert url is downloaded and its title, location, firm, phone
    number and description are extracted. The result is also written to
    'resultOPT.json' in the current directory.
    
    Parameters
    ----------
    products : dict {query index: [urls]} gained from UrlParseOpt()
    gosts : GOSTs' names to structure the return value; indexed by query index
    headers : headers that are used to connect to the site
    
    Returns
    -------
    dict {gost: {url position: record}} where record has the keys
    'ad_name', 'url', 'location', 'firm', 'phone_number' and 'desc'.
    Urls answering with HTTP 404 have no record.
    
    =====================================================================
    """

    dct = dict()
    start_time = datetime.datetime.now()
    # `products` only has keys for queries that produced results, so the keys
    # are sparse; iterate over the keys themselves instead of range(len(...)),
    # which silently ignored every key >= len(products).
    for done, product in enumerate(sorted(products), start=1):
        pr_start = datetime.datetime.now()
        urls = products[product]
        dct1 = dict()
        for x, url in enumerate(urls):
            response = rq.get(url, headers=headers)
            if response.status_code == 404:
                # Nothing to extract; do not reuse the fields of the previous advert.
                print(f'{x+1}/{len(urls)} urls of product {product+1} skipped (404)')
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            desc_tag = soup.find('div', {'class':'description'})
            firm_tag = soup.find('div', {'class': 'title'}).find('a')
            items = soup.find_all('div', {'class': 'warp_item'})
            # The position of the location/phone rows depends on how many
            # 'warp_item' rows the advert page contains.
            if len(items) == 4:
                loc_idx, phone_idx = 2, 3
            elif len(items) == 3:
                loc_idx, phone_idx = 1, 2
            else:
                loc_idx, phone_idx = 3, 4
            location_tag = items[loc_idx].find('div', {'class':'itemRight'})
            phone_tag = items[phone_idx].find('div', {'class':'itemRight'})
            name_tag = soup.find('table', {'class':'main'}).find('h1')
            dct1[x] = {'ad_name': name_tag.text,
                       'url': url,
                       'location': location_tag.text,
                       'firm': firm_tag.text if firm_tag is not None else '',
                       'phone_number': phone_tag.text,
                       'desc': desc_tag.text.strip()}
            print(f'{x+1}/{len(urls)} urls of product {product+1} parsed')
        dct[gosts[product]] = dct1
        pr_end = datetime.datetime.now()
        print(f'{done}/{len(products)} products parsed, it took {round((pr_end-pr_start).total_seconds()/60, 1)} minutes')
    with open('resultOPT.json', 'w', encoding='UTF-8') as file:
        json.dump(dct, file, indent=4, ensure_ascii=False)
    end_time = datetime.datetime.now()
    total_time = end_time-start_time
    print(total_time)
    return dct

In [62]:
# Scrape the collected opt-union.ru adverts; the call also writes resultOPT.json.
parse_optunion(productsOpt, gosts, headers)

In [173]:
def parse_apipost(products, gosts, headers):
    """    
    =====================================================================
    
    Collect products' information from https://apipost.ru/
    
    Every advert url is downloaded and its title, location, firm, phone
    number and description are extracted. The result is also written to
    'resultAPI.json' in the current directory.
    
    Parameters
    ----------
    products : dict {query index: [urls]} gained from UrlParseApi()
    gosts : GOSTs' names to structure the return value; indexed by query index
    headers : headers that are used to connect to the site
    
    Returns
    -------
    dict {gost: {url position: record}} where record has the keys
    'ad_name', 'url', 'location', 'firm', 'phone_number' and 'desc'.
    'phone_number' is None when no number is found on the page.
    
    =====================================================================
    """
    phone_pattern = r'[0-9]{10,}'
    dct = dict()
    # `products` only has keys for queries that produced results, so the keys
    # are sparse; iterate over the keys themselves instead of range(len(...)),
    # which silently ignored every key >= len(products).
    for product in tqdm.tqdm(sorted(products)):
        dct1 = dict()
        for x, url in enumerate(products[product]):
            soup = BeautifulSoup(rq.get(url, headers=headers).text, 'html.parser')
            firm_tag = soup.find('span', {'style': 'color: #144986; font-size: 20px;'})
            contact_table = soup.find('table', {'border':"0", 'cellpadding':"3", 'cellspacing':"0", 'width':"100%"})
            contact_cell = contact_table.find('td') if contact_table is not None else None
            contact_text = contact_cell.text if contact_cell is not None else ''
            phones = re.findall(phone_pattern, contact_text)
            location_tag = soup.find('span', {'style': 'color: #dd0000; font-size: 18px;'})
            desc_tag = soup.find('div', {'class':'opis-ads'})
            name_tag = soup.find('h1', {'class':'items'})
            # Price and registration date are not part of the stored record,
            # so they are no longer scraped (the lookups could only fail).
            dct1[x] = {'ad_name': name_tag.text,
                       'url': url,
                       'location': location_tag.text.strip(),
                       'firm': firm_tag.text,
                       'phone_number': phones[0] if phones else None,
                       'desc': desc_tag.text.replace('<br>', ' ').strip()}
        dct[gosts[product]] = dct1
    # Written once after the loop (as the sibling parsers do), so the file
    # exists and is up to date even when `products` is empty.
    with open('resultAPI.json', 'w', encoding='UTF-8') as file:
        json.dump(dct, file, indent=4, ensure_ascii=False)
    return dct

In [80]:
# Scrape the collected apipost.ru adverts; the call also writes resultAPI.json.
parse_apipost(productsApi, gosts, headers)

100%|██████████| 34/34 [00:10<00:00,  3.30it/s]


{'ГОСТ 22704': {0: {'ad_name': 'Манжеты шевронные ГОСТ 22704-77 в Самаре',
   'url': 'https://apipost.ru/obyavleniya/items/manzhety_shevronnye_gost_22704_77_v_samare_153619',
   'location': 'Самара, м. Алабинская',
   'price': '100 руб.',
   'firm': 'Антон Владимирович',
   'reg_date': '09.02.2022',
   'phone_number': '79967354339',
   'desc': 'Шевронные уплотнительные манжеты: отечественные изготовленные в соответствии с ГОСТ 22704-77 и импортные, разработанные и произведенные по европейским стандартам DIN и ISO, предназначены для уплотнения цилиндров и поршней (штоков, плунжеров) гидравлических устройств работающих с высокими эксплуатационными нагрузками. Конструктивно, шевронное уплотнение состоит из опрного кольца КО, нажимного кольца КН и пакета манжет – от двух до тридцати штук (чаще всего – три). Составная конструкция позволяет уплотнению работать в тяжелых условиях, поддерживать герметичность при повреждениях уплотняемой поверхности, работать в средах загрязненных абразивными п

In [174]:
def parse_ob(products, gosts, headers):
    """
    =====================================================================
    
    Collect products' information from https://www.oborudunion.ru/
    
    Every advert url is downloaded and its title, location, firm, phone
    number and description are extracted. The result is also written to
    'resultOB.json' in the current directory.
    
    Parameters
    ----------
    products : dict {query index: [urls]} gained from UrlParseOb()
    gosts : GOSTs' names to structure the return value; indexed by query index
    headers : headers that are used to connect to the site
    
    Returns
    -------
    dict {gost: {url position: record}} where record has the keys
    'ad_name', 'url', 'location', 'firm', 'phone_number' and 'desc'.
    Urls answering with HTTP 404 have no record.
    
    =====================================================================
    """
    dct = dict()
    start_time = datetime.datetime.now()
    # `products` only has keys for queries that produced results, so the keys
    # are sparse; iterate over the keys themselves instead of range(len(...)),
    # which silently ignored every key >= len(products).
    for done, product in enumerate(sorted(products), start=1):
        pr_start = datetime.datetime.now()
        urls = products[product]
        dct1 = dict()
        for x, url in enumerate(urls):
            response = rq.get(url, headers=headers)
            if response.status_code == 404:
                # Nothing to extract; do not reuse the fields of the previous advert.
                print(f'{x+1}/{len(urls)} urls of product {product+1} skipped (404)')
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            desc_tag = soup.find('div', {'class':'description'})
            firm_tag = soup.find('div', {'class': 'title'}).find('a')
            items = soup.find_all('div', {'class': 'warp_item'})
            # The position of the location/phone rows depends on how many
            # 'warp_item' rows the advert page contains.
            if len(items) == 4:
                loc_idx, phone_idx = 2, 3
            elif len(items) == 3:
                loc_idx, phone_idx = 1, 2
            else:
                loc_idx, phone_idx = 3, 4
            location_tag = items[loc_idx].find('div', {'class':'itemRight'})
            phone_tag = items[phone_idx].find('div', {'class':'itemRight'})
            name_tag = soup.find('div', {'class':'main-content__left-long'}).find('h1')
            dct1[x] = {'ad_name': name_tag.text,
                       'url': url,
                       'location': location_tag.text,
                       'firm': firm_tag.text if firm_tag is not None else '',
                       'phone_number': phone_tag.text,
                       'desc': desc_tag.text.strip()}
            print(f'{x+1}/{len(urls)} urls of product {product+1} parsed')
        dct[gosts[product]] = dct1
        pr_end = datetime.datetime.now()
        print(f'{done}/{len(products)} products parsed, it took {round((pr_end-pr_start).total_seconds()/60, 1)} minutes')
    with open('resultOB.json', 'w', encoding='UTF-8') as file:
        json.dump(dct, file, indent=4, ensure_ascii=False)
    end_time = datetime.datetime.now()
    total_time = end_time-start_time
    print(total_time)
    return dct

In [None]:
# Scrape the collected oborudunion.ru adverts; the call also writes resultOB.json.
parse_ob(productsOb, gosts, headers)

In [201]:
def concatJSON(json1, json2, json3):
    """
    Load three JSON result files and stack them into one DataFrame.

    Each file is read as UTF-8, turned into a DataFrame with its index
    reset, and the three frames are concatenated row-wise; the combined
    frame gets a fresh 0..n-1 index.
    """
    frames = []
    for path in (json1, json2, json3):
        with open(path, 'r', encoding='UTF-8') as handle:
            payload = json.load(handle)
        frames.append(pd.DataFrame(payload).reset_index(drop=True))
    return pd.concat(frames).reset_index(drop=True)

In [202]:
def parse(nomenclature:pd.DataFrame, headers, mode='all', source='all', 
          pages=1, info='all', status_log=True):
    """
    =====================================================================
    
    Parse particular resources
    
    Parameters
    ----------
    nomenclature : pd.DataFrame, transformed nomenclature of products need to find
    headers : dict, headers that is used to connect to the site
    mode : {'all', 'gost', 'reasonable'}, default 'all'
        Optional parameter for constructing search queries.
        If 'all' uses FullName of product for search query.
        If 'gost' uses Gost of product for search query.
        If 'reasonable' uses combination of Gost and Name for search query.
    source : {'all', 'apipost', 'oborudunion', 'opt-union'}, default 'all'.
        Optional parameter for searching on particular sources.
        If 'all' uses all sources to parse.
        If 'apipost' uses apipost.ru site to parse.
        If 'oborudunion' uses oborudunion.ru site to parse.
        If 'opt-union' uses opt-union.ru site to parse.
    pages : accepted for compatibility; currently not used
    info : accepted for compatibility; currently not used
    status_log : accepted for compatibility; currently not used
    
    Returns
    -------
    For source='all': pd.DataFrame built by concatJSON() from the three
    result*.json files written by the site parsers.
    For a single source: dict {gost: column-oriented dict of the adverts},
    with '' for a GOST that has no parsed adverts.
    
    Raises
    ------
    NameError : if `source` is not supported (kept for backward compatibility).
    
    =====================================================================
    """
    
    searcher = searchQuery(nomenclature, mode=mode)
    gosts = list(nomenclature.Gost)
    
    if source == 'all':
        
        productsApi = UrlParseApi(searcher, headers)
        productsOb = UrlParseOb(searcher, headers)
        productsOpt = UrlParseOpt(searcher, headers)
        # The return values are not needed here: every parser also writes its
        # result to a json file, and those files are combined below.
        parse_apipost(productsApi, gosts, headers)
        parse_ob(productsOb, gosts, headers)
        parse_optunion(productsOpt, gosts, headers)
        
        return concatJSON('resultAPI.json', 'resultOB.json', 'resultOPT.json')
    
    # (url collector, advert parser) for every individually selectable site
    pipelines = {
        'apipost': (UrlParseApi, parse_apipost),
        'oborudunion': (UrlParseOb, parse_ob),
        'opt-union': (UrlParseOpt, parse_optunion),
    }
    if source not in pipelines:
        raise NameError('Option is not supported')
    
    collect_urls, parse_ads = pipelines[source]
    parsed = parse_ads(collect_urls(searcher, headers), gosts, headers)
    
    overall = dict()
    for gost in gosts:
        # A GOST without any search result is absent from `parsed`.
        overall[gost] = pd.DataFrame(parsed[gost]).T.reset_index(drop=True).to_dict() if gost in parsed else ''
    
    return overall
        
    

In [203]:
# Full run over all sources with "GOST + title" search queries.
dct = parse(data, headers=headers, mode='reasonable', source='all')
# Bare expression: display the result as the cell output.
dct

100%|██████████| 40/40 [00:06<00:00,  6.16it/s]
100%|██████████| 40/40 [00:14<00:00,  2.67it/s]
100%|██████████| 40/40 [00:12<00:00,  3.14it/s]
100%|██████████| 10/10 [00:01<00:00,  9.44it/s]


1/21 urls of product 1 parsed
2/21 urls of product 1 parsed
3/21 urls of product 1 parsed
4/21 urls of product 1 parsed
5/21 urls of product 1 parsed
6/21 urls of product 1 parsed
7/21 urls of product 1 parsed
8/21 urls of product 1 parsed
9/21 urls of product 1 parsed
10/21 urls of product 1 parsed
11/21 urls of product 1 parsed
12/21 urls of product 1 parsed
13/21 urls of product 1 parsed
14/21 urls of product 1 parsed
15/21 urls of product 1 parsed
16/21 urls of product 1 parsed
17/21 urls of product 1 parsed
18/21 urls of product 1 parsed
19/21 urls of product 1 parsed
20/21 urls of product 1 parsed
21/21 urls of product 1 parsed
1/40 products parsed, it took 0.1 minutes
1/21 urls of product 2 parsed
2/21 urls of product 2 parsed
3/21 urls of product 2 parsed
4/21 urls of product 2 parsed
5/21 urls of product 2 parsed
6/21 urls of product 2 parsed
7/21 urls of product 2 parsed
8/21 urls of product 2 parsed
9/21 urls of product 2 parsed
10/21 urls of product 2 parsed
11/21 urls of p

21/21 urls of product 12 parsed
12/40 products parsed, it took 0.1 minutes
1/21 urls of product 13 parsed
2/21 urls of product 13 parsed
3/21 urls of product 13 parsed
4/21 urls of product 13 parsed
5/21 urls of product 13 parsed
6/21 urls of product 13 parsed
7/21 urls of product 13 parsed
8/21 urls of product 13 parsed
9/21 urls of product 13 parsed
10/21 urls of product 13 parsed
11/21 urls of product 13 parsed
12/21 urls of product 13 parsed
13/21 urls of product 13 parsed
14/21 urls of product 13 parsed
15/21 urls of product 13 parsed
16/21 urls of product 13 parsed
17/21 urls of product 13 parsed
18/21 urls of product 13 parsed
19/21 urls of product 13 parsed
20/21 urls of product 13 parsed
21/21 urls of product 13 parsed
13/40 products parsed, it took 0.1 minutes
1/21 urls of product 14 parsed
2/21 urls of product 14 parsed
3/21 urls of product 14 parsed
4/21 urls of product 14 parsed
5/21 urls of product 14 parsed
6/21 urls of product 14 parsed
7/21 urls of product 14 parsed
8/

13/21 urls of product 24 parsed
14/21 urls of product 24 parsed
15/21 urls of product 24 parsed
16/21 urls of product 24 parsed
17/21 urls of product 24 parsed
18/21 urls of product 24 parsed
19/21 urls of product 24 parsed
20/21 urls of product 24 parsed
21/21 urls of product 24 parsed
24/40 products parsed, it took 0.1 minutes
1/21 urls of product 25 parsed
2/21 urls of product 25 parsed
3/21 urls of product 25 parsed
4/21 urls of product 25 parsed
5/21 urls of product 25 parsed
6/21 urls of product 25 parsed
7/21 urls of product 25 parsed
8/21 urls of product 25 parsed
9/21 urls of product 25 parsed
10/21 urls of product 25 parsed
11/21 urls of product 25 parsed
12/21 urls of product 25 parsed
13/21 urls of product 25 parsed
14/21 urls of product 25 parsed
15/21 urls of product 25 parsed
16/21 urls of product 25 parsed
17/21 urls of product 25 parsed
18/21 urls of product 25 parsed
19/21 urls of product 25 parsed
20/21 urls of product 25 parsed
21/21 urls of product 25 parsed
25/40 

5/21 urls of product 36 parsed
6/21 urls of product 36 parsed
7/21 urls of product 36 parsed
8/21 urls of product 36 parsed
9/21 urls of product 36 parsed
10/21 urls of product 36 parsed
11/21 urls of product 36 parsed
12/21 urls of product 36 parsed
13/21 urls of product 36 parsed
14/21 urls of product 36 parsed
15/21 urls of product 36 parsed
16/21 urls of product 36 parsed
17/21 urls of product 36 parsed
18/21 urls of product 36 parsed
19/21 urls of product 36 parsed
20/21 urls of product 36 parsed
21/21 urls of product 36 parsed
36/40 products parsed, it took 0.1 minutes
1/21 urls of product 37 parsed
2/21 urls of product 37 parsed
3/21 urls of product 37 parsed
4/21 urls of product 37 parsed
5/21 urls of product 37 parsed
6/21 urls of product 37 parsed
7/21 urls of product 37 parsed
8/21 urls of product 37 parsed
9/21 urls of product 37 parsed
10/21 urls of product 37 parsed
11/21 urls of product 37 parsed
12/21 urls of product 37 parsed
13/21 urls of product 37 parsed
14/21 urls 

1/21 urls of product 8 parsed
2/21 urls of product 8 parsed
3/21 urls of product 8 parsed
4/21 urls of product 8 parsed
5/21 urls of product 8 parsed
6/21 urls of product 8 parsed
7/21 urls of product 8 parsed
8/21 urls of product 8 parsed
9/21 urls of product 8 parsed
10/21 urls of product 8 parsed
11/21 urls of product 8 parsed
12/21 urls of product 8 parsed
13/21 urls of product 8 parsed
14/21 urls of product 8 parsed
15/21 urls of product 8 parsed
16/21 urls of product 8 parsed
17/21 urls of product 8 parsed
18/21 urls of product 8 parsed
19/21 urls of product 8 parsed
20/21 urls of product 8 parsed
21/21 urls of product 8 parsed
8/40 products parsed, it took 0.1 minutes
1/21 urls of product 9 parsed
2/21 urls of product 9 parsed
3/21 urls of product 9 parsed
4/21 urls of product 9 parsed
5/21 urls of product 9 parsed
6/21 urls of product 9 parsed
7/21 urls of product 9 parsed
8/21 urls of product 9 parsed
9/21 urls of product 9 parsed
10/21 urls of product 9 parsed
11/21 urls of p

16/21 urls of product 19 parsed
17/21 urls of product 19 parsed
18/21 urls of product 19 parsed
19/21 urls of product 19 parsed
20/21 urls of product 19 parsed
21/21 urls of product 19 parsed
19/40 products parsed, it took 0.1 minutes
1/21 urls of product 20 parsed
2/21 urls of product 20 parsed
3/21 urls of product 20 parsed
4/21 urls of product 20 parsed
5/21 urls of product 20 parsed
6/21 urls of product 20 parsed
7/21 urls of product 20 parsed
8/21 urls of product 20 parsed
9/21 urls of product 20 parsed
10/21 urls of product 20 parsed
11/21 urls of product 20 parsed
12/21 urls of product 20 parsed
13/21 urls of product 20 parsed
14/21 urls of product 20 parsed
15/21 urls of product 20 parsed
16/21 urls of product 20 parsed
17/21 urls of product 20 parsed
18/21 urls of product 20 parsed
19/21 urls of product 20 parsed
20/21 urls of product 20 parsed
21/21 urls of product 20 parsed
20/40 products parsed, it took 0.1 minutes
1/21 urls of product 21 parsed
2/21 urls of product 21 pars

8/21 urls of product 31 parsed
9/21 urls of product 31 parsed
10/21 urls of product 31 parsed
11/21 urls of product 31 parsed
12/21 urls of product 31 parsed
13/21 urls of product 31 parsed
14/21 urls of product 31 parsed
15/21 urls of product 31 parsed
16/21 urls of product 31 parsed
17/21 urls of product 31 parsed
18/21 urls of product 31 parsed
19/21 urls of product 31 parsed
20/21 urls of product 31 parsed
21/21 urls of product 31 parsed
31/40 products parsed, it took 0.1 minutes
1/21 urls of product 32 parsed
2/21 urls of product 32 parsed
3/21 urls of product 32 parsed
4/21 urls of product 32 parsed
5/21 urls of product 32 parsed
6/21 urls of product 32 parsed
7/21 urls of product 32 parsed
8/21 urls of product 32 parsed
9/21 urls of product 32 parsed
10/21 urls of product 32 parsed
11/21 urls of product 32 parsed
12/21 urls of product 32 parsed
13/21 urls of product 32 parsed
14/21 urls of product 32 parsed
15/21 urls of product 32 parsed
16/21 urls of product 32 parsed
17/21 ur

Unnamed: 0,ГОСТ 22704,ГОСТ 7338,ГОСТ 5398,ГОСТ 18698,ГОСТ 8752,ГОСТ 9356,ГОСТ 7796,ИСО 4014,ГОСТ 5915,ГОСТ 4028,ГОСТ 7817,ГОСТ 7805,ГОСТ 7808,ГОСТ 5916,ГОСТ 15521,ГОСТ 10450,ГОСТ 6958,ГОСТ 7798
0,{'ad_name': 'Манжеты шевронные ГОСТ 22704-77 в...,{'ad_name': 'Резинотехнические пластины ГОСТ 7...,{'ad_name': 'Пожарные рукава - в Санкт-Петербу...,{'ad_name': 'Рукав напорный с текстильным карк...,,,,,,,,,,,,,,
1,"{'ad_name': 'Срочное изготовление уплотнений, ...",,,,,,,,,,,,,,,,,
2,{'ad_name': 'Уплотнения шевронные резинотканев...,{'ad_name': 'Техническая пластина ГОСТ 7338-90...,{'ad_name': 'Рукав напорно-всасывающий Б-2-65...,{'ad_name': 'Рукав резиновый напорный с тексти...,{'ad_name': 'Манжеты резиновые армированные дл...,{'ad_name': 'Рукав резиновый напорный ГОСТ 93...,"{'ad_name': 'Болт с шестигранной головкой, оци...",{'ad_name': 'Глубокая шестигранная головка для...,"{'ad_name': 'Гайка шестигранная, ГОСТ 5915-70,...","{'ad_name': 'Строительный гвоздь креп-комп 2, ...","{'ad_name': 'Болт с шестигранной головкой, оци...",{'ad_name': 'Болт А2 М 10х25 нержавеющая сталь...,"{'ad_name': 'Болт с шестигранной головкой, оци...","{'ad_name': 'Гайка шестигранная низкая, с фаск...","{'ad_name': 'класс прочности 4.8, оцинкованная...","{'ad_name': 'Шайба плоская, ГОСТ 11371-80', 'u...",{'ad_name': 'DIN 9021 Шайба плоская увеличенна...,"{'ad_name': '""Болт ГОСТ 7798-70, 7805-70 (в т...."
3,{'ad_name': 'Уплотнение ГОСТ 22704-77 из полиу...,{'ad_name': 'Пластина техническая маслобензост...,{'ad_name': 'Рукав напорно-всасывающий (гофра)...,{'ad_name': 'Рукава напорные с текстильным кар...,{'ad_name': 'Манжеты армированные по ГОСТ 8752...,"{'ad_name': 'Рукав III-9,0-2,0 ГОСТ 9356-75', ...","{'ad_name': 'Гайка ГОСТ 5915-70, 5927 (в т.ч. ...",{'ad_name': 'Глубокая шестигранная головка для...,"{'ad_name': 'Гайка шестигранная ГОСТ 5927-70, ...","{'ad_name': 'Строительный гвоздь креп-комп 3, ...","{'ad_name': 'Гайка ГОСТ 5915-70, 5927 (в т.ч. ...","{'ad_name': 'Болт М 12х35 ГОСТ 7805 (кг)', 'ur...","{'ad_name': 'Гайка ГОСТ 5915-70, 5927 (в т.ч. ...","{'ad_name': 'Гайка ГОСТ 5916-70 (низкая)', 'ur...",{'ad_name': 'Шестигранная гайка зубр мастер го...,"{'ad_name': 'Шайба плоская ГОСТ 11371-78, DIN ...",{'ad_name': 'ГОСТ 6958-78 – шайба плоская увел...,{'ad_name': 'Оцинкованный болт с шестигранной ...
4,{'ad_name': 'Уплотнения шевронные резинотканев...,{'ad_name': 'Техпластина 2Н -I -ТМКЩ-С 5 мм ГО...,{'ad_name': 'рукав напорно-всасывающий ГОСТ 53...,{'ad_name': 'Напорные рукава с текстильным кар...,{'ad_name': 'Сальник (армированная манжета) фи...,{'ad_name': 'рукава для газовой сварки ГОСТ 93...,"{'ad_name': 'Болт с шестигранной головкой, оци...",{'ad_name': 'Глубокая шестигранная головка для...,{'ad_name': 'DIN 934 Гайка шестигранная с круп...,{'ad_name': 'Строительный гвоздь креп-комп 3х7...,"{'ad_name': 'Болт с шестигранной головкой, оци...",{'ad_name': 'Болт А2 М 6х30 нержавеющая сталь ...,"{'ad_name': 'Болт с шестигранной головкой, оци...","{'ad_name': 'Гайка шестигранная, ГОСТ 5915-70,...","{'ad_name': '""Гайка ГОСТ 10605-94 (в т.ч. клас...","{'ad_name': 'DIN 125 Шайба плоская ISO 7089, Г...","{'ad_name': 'Шайба плоская, ГОСТ 11371-80', 'u...",{'ad_name': 'Болт с шестигранной головкой зубр...
5,{'ad_name': 'Комплект сит для твердого биотопл...,"{'ad_name': 'Пластины маслостойкие, маслобензо...",{'ad_name': 'шланг резиновый всасывающий ГОСТ ...,{'ad_name': 'Рукава резиновые напорные с текст...,"{'ad_name': 'Манжета армированная (сальник)', ...",{'ad_name': 'Напорные рукава для резки металло...,{'ad_name': 'Болт класс прочности 5.8 ГОСТ 779...,{'ad_name': 'Глубокая шестигранная головка для...,{'ad_name': 'Гайка шестигранная оцинкованная М...,"{'ad_name': 'Строительный гвоздь креп-комп 2, ...","{'ad_name': 'класс прочности 4.8, оцинкованная...","{'ad_name': 'Болт М 8х10 цинк ГОСТ 7805 (кг)',...",{'ad_name': 'Болт класс прочности 8.8. ГОСТ 77...,"{'ad_name': 'Гайка шестигранная ГОСТ 5927-70, ...",{'ad_name': 'Гайка класс прочности 8.0. ГОСТ 5...,"{'ad_name': 'Шайба плоская ГОСТ 11371-70', 'ur...",{'ad_name': 'Шайба увеличенная - ЕКТ МШ9021 (D...,{'ad_name': 'Болт с шестигранной головкой зубр...
6,{'ad_name': 'Комплект сит для огнеупорных глин...,"{'ad_name': 'Дробь чугунная литая, колотая ГОС...",{'ad_name': 'Рукава резиновые напорно-всасываю...,{'ad_name': 'Рукава резиновые с текстильным ка...,"{'ad_name': 'Трубка резиновая ГОСТ 5496-78', '...",{'ad_name': 'Рукава напорные для газовой сварк...,{'ad_name': 'Болт с шестигранной головкой зубр...,{'ad_name': 'Глубокая шестигранная головка для...,"{'ad_name': 'Гайка шестигранная ГОСТ 5915-70',...",{'ad_name': 'Строительный гвозди зубр мастер г...,{'ad_name': 'Болт с шестигранной головкой зубр...,{'ad_name': 'Болт А2 М 8х20 нержавеющая сталь ...,{'ad_name': 'Болт класс прочности 5.8 ГОСТ 779...,{'ad_name': 'Гайка шестигранная оцинкованная М...,"{'ad_name': 'Гайка ГОСТ 5915-70, 5927 (в т.ч. ...",{'ad_name': 'DIN 9021 Шайба плоская увеличенна...,"{'ad_name': 'Шайба М 30 цинк ГОСТ 6958 (кг)', ...","{'ad_name': 'Болт 10*35 ГОСТ 7798-70', 'url': ..."
7,{'ad_name': 'Комплект сит для карбоната кальци...,"{'ad_name': 'Техпластина МБС ГОСТ 7338-90', 'u...",{'ad_name': 'Рукава напорно-всасывающие ГОСТ 5...,{'ad_name': 'Рукава резиновые напорные с текст...,"{'ad_name': 'Трубка резиновая ГОСТ 5496', 'url...",{'ad_name': 'Рукава для газовой сварки ГОСТ 93...,{'ad_name': 'Болт с шестигранной головкой зубр...,{'ad_name': 'Глубокая шестигранная головка для...,"{'ad_name': 'Гайка шестигранная, ГОСТ 5915-70,...","{'ad_name': '5х150 ГОСТ 4028-63 (5 кг)', 'url'...",{'ad_name': 'Болт с шестигранной головкой зубр...,{'ad_name': 'Болт М 8х100 цинк ГОСТ 7805 кл. п...,{'ad_name': 'Болт с шестигранной головкой зубр...,{'ad_name': 'Гайка шестигранная М20 кл.пр.10 Г...,{'ad_name': 'Гайка Гост 52645-06 класс прочнос...,{'ad_name': 'Шайба плоская Гост 52646-2006 выс...,"{'ad_name': 'Шайба М 27 ГОСТ 6958 (кг)', 'url'...",{'ad_name': 'Болт М 8х100 цинк ГОСТ 7798 (кг)'...
8,{'ad_name': 'Комплект сит для удобрений и изве...,{'ad_name': 'Пластина техническая пористая ТУ ...,{'ad_name': 'Рукава напорно-всасывающие ГОСТ 5...,{'ad_name': 'Рукава резиновые напорные с текст...,"{'ad_name': 'Резиновая трубка ГОСТ 5496 78', '...",{'ad_name': 'Рукава для газовой сварки ГОСТ 93...,{'ad_name': 'Шестигранная гайка зубр мастер го...,{'ad_name': 'Головка торцевая ударная глубокая...,"{'ad_name': 'Гайка шестигранная ГОСТ 5915-70',...","{'ad_name': '6х200 ГОСТ 4028-63 (5 кг)', 'url'...",{'ad_name': 'Шестигранная гайка зубр мастер го...,{'ad_name': 'Болт А4 М 6х20 нержавеющая сталь ...,{'ad_name': 'Болт с шестигранной головкой зубр...,"{'ad_name': 'Гайка низкая ГОСТ 5916-70, 5929-7...",{'ad_name': 'Болт с шестигранной головкой зубр...,"{'ad_name': 'Шайба ГОСТ 10450', 'url': 'https:...","{'ad_name': 'Шайба плоская ГОСТ 11371-70', 'ur...",{'ad_name': 'Болт М 12х35 ГОСТ 7798 кл. пр. 8....
9,{'ad_name': 'Комплект лабораторных сит для гру...,{'ad_name': 'Пластина техническая МБС (техплас...,{'ad_name': 'Рукава напорно-всасывающие с мета...,{'ad_name': 'Рукава резиновые с текстильным ка...,{'ad_name': 'Трубка резиновая ТМКЩ ГОСТ 5496-7...,{'ad_name': 'Рукава для газовой сварки ГОСТ 93...,{'ad_name': 'Болт М 12х35 цинк ГОСТ 7796 (кг)'...,{'ad_name': 'Головка торцевая ударная глубокая...,"{'ad_name': 'гайка м 10 14Х17Н2 ГОСТ 5915', 'u...","{'ad_name': '4х100 ГОСТ 4028-63 (5 кг)', 'url'...",{'ad_name': 'Болт ГОСТ 7817-80. Изготовление и...,{'ad_name': 'Болт А4 М 8х16 нержавеющая сталь ...,{'ad_name': 'Шестигранная гайка зубр мастер го...,{'ad_name': 'DIN 934 Гайка шестигранная с круп...,{'ad_name': 'Болт с шестигранной головкой зубр...,"{'ad_name': 'шайба плоская ГОСТ 11371-78', 'ur...","{'ad_name': 'Шайба плоская ГОСТ 11371-78, DIN ...","{'ad_name': 'Болт ГОСТ 7798-70', 'url': 'https..."


In [223]:
def identify(data, GOST):
    """
    =====================================================================
    
    Collect names of firms and their head office city for one GOST
    
    Parameters
    ----------
    data : pd.DataFrame, parsed data returned by the parse() function
    GOST : str, particular GOST of product we want to get
    
    Returns
    -------
    companies : list of str, the 'firm' value of every offer found
    cities : list of str, text before the first comma of 'location',
        or '' when that text contains the character '7'
    
    =====================================================================
    """
    
    companies = []
    cities = []
    # Walk the values directly: one data[GOST] lookup, no dependence on
    # integer labels, and no call to range(), which another cell of this
    # notebook rebinds to a scoring function.
    for offer in data[GOST]:
        # Cells that hold no offer dict (e.g. NaN padding of a shorter
        # column) have no firm to report.
        if not isinstance(offer, dict):
            continue
        
        companies.append(offer['firm'])
        city = offer['location'].split(',')[0]
        # NOTE(review): presumably filters out phone numbers ("+7 ...") that
        # appear in place of a city - confirm; it also drops any city whose
        # text happens to contain the digit 7.
        cities.append(city if '7' not in city else '')
        
    return companies, cities


In [230]:
_EGRUL_URL = "https://egrul.nalog.ru"
# Upper bound for every status-polling loop, so a stuck request cannot hang
# the notebook forever.
_EGRUL_MAX_POLLS = 120


def _egrul_headers(extra=None):
    """Return the browser-like headers shared by every request, plus `extra`."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
    }
    headers.update(extra or {})
    return headers


def _egrul_post_search(session, query):
    """POST the search form for one query and return the decoded JSON reply."""
    reply = session.post(
        _EGRUL_URL + '/',
        # A dict makes requests url-encode the values; company names contain
        # quotes, spaces and '&', which broke the hand-concatenated body.
        data={
            'vyp3CaptchaToken': '',
            'page': '',
            'query': query,
            'region': '',
            'PreventChromeAutocomplete': '',
        },
        headers=_egrul_headers({
            "Host": "egrul.nalog.ru",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Referer": "https://egrul.nalog.ru/index.html",
            "Content-Type": "application/x-www-form-urlencoded",
            "X-Requested-With": "XMLHttpRequest",
        }),
    )
    return json.loads(reply.text)


def _egrul_captcha_required(payload):
    """Return True when the search reply carries a non-empty captcha error."""
    errors = payload.get("ERRORS") if isinstance(payload, dict) else None
    return isinstance(errors, dict) and errors.get("captchaSearch", '') != ''


def _egrul_submit_captcha(session):
    """Open the captcha image in a browser, read the answer and submit it."""
    dialog = session.get(
        _EGRUL_URL + '/captcha-dialog.html',
        headers=_egrul_headers({
            "Host": "egrul.nalog.ru",
            "Referer": "https://egrul.nalog.ru/index.html",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
        }),
    )
    image_src = (BeautifulSoup(dialog.content.decode(), features="lxml")
                 .find('div', class_='field-data').find('img').get('src'))
    webbrowser.open(_EGRUL_URL + image_src)
    captcha_token = image_src.split('?a=')[1].split('&')[0]
    captcha = input('Введите капчу: ')

    # The original built this request but then re-sent the search request
    # instead, so the typed answer never reached the server.
    # NOTE(review): the search form has a 'vyp3CaptchaToken' field; confirm
    # whether the reply of this call has to be passed there.
    session.post(
        _EGRUL_URL + '/captcha-proc.json',
        data={'captcha': captcha, 'captchaToken': captcha_token},
        headers=_egrul_headers({
            "Host": "egrul.nalog.ru",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "https://egrul.nalog.ru/index.html",
        }),
    )


def _egrul_fetch_search_result(session, token):
    """Fetch the search result, re-requesting it while the status is 'wait'."""
    url = _EGRUL_URL + "/search-result/" + str(token)
    headers = _egrul_headers({"Referer": "https://egrul.nalog.ru/index.html"})
    polls = 0
    while True:
        found = json.loads(session.get(url, headers=headers).text)
        polls += 1
        if found.get('status') != 'wait' or polls >= _EGRUL_MAX_POLLS:
            return found
        sleep(0.2)


def _egrul_pick_row(rows, city):
    """Return the last row whose address has `city` as a word, else rows[0]."""
    # NOTE(review): the address is split into single words, so a multi-word
    # city name can never match and falls back to the first row.
    wanted = city.lower()
    chosen = rows[0]
    for row in rows:
        words = (row.get("a") or "").replace(",", " ").replace(".", " ").lower().split()
        if wanted in words:
            chosen = row
    return chosen


def _egrul_save_extract(session, item):
    """Write the text summary of `item` and download its PDF extract."""
    lines = [
        item['n'],
        item['g'],
        'Адрес: ' + item['a'],
        'ИНН: ' + item['i'],
        'ОГРН: ' + item['o'],
        'КПП: ' + item['p'],
        'Дата регистрации: ' + item['r'],
    ]
    try:
        lines.append('ДАТА ПРЕКРАЩЕНИЯ ДЕЯТЕЛЬНОСТИ: ' + item['e'])
    except (KeyError, TypeError):
        # The termination date is optional; rows without it are still saved.
        pass
    summary = '\n'.join(lines) + '\n'

    if str(item['tot']) == '0':
        return

    # Short names become the folder name, long ones are replaced by the INN;
    # characters that are illegal in Windows file names are substituted.
    name = str(item['n']) if len(item['n']) < 50 else str(item['i'])
    name = name.replace('"',"'").replace('\\','⧵').replace('/','⁄').replace('|','¦').replace(':',';').replace('*','✱').replace('?','').replace('<','«').replace('>','»')

    stamp = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=3))).strftime('%x %X %Z')
    try:
        os.mkdir(name)
    except OSError:
        # Folder already exists (or cannot be created under that name):
        # fall back to a time-stamped sibling.
        name = name + ' ' + stamp.replace('/','.').replace(':','-')
        os.mkdir(name)

    with open(os.path.join(name, name + '.txt'), 'w', encoding='utf-8') as summary_file:
        summary_file.write('по состоянию на ' + stamp.replace('/','.') + '\n' + summary)

    token = str(item['t'])
    headers = _egrul_headers({"Referer": "https://egrul.nalog.ru/index.html"})

    session.get(_EGRUL_URL + "/vyp-request/" + token, headers=headers)
    sleep(0.5)
    polls = 0
    while True:
        status = json.loads(session.get(_EGRUL_URL + "/vyp-status/" + token, headers=headers).text)['status']
        if status == 'ready':
            break
        polls += 1
        if polls >= _EGRUL_MAX_POLLS:
            raise TimeoutError('EGRUL extract was not ready after %d polls' % polls)
        sleep(0.5)

    extract = session.get(_EGRUL_URL + "/vyp-download/" + token, headers=headers)
    with open(os.path.join(name, name + ' выписка.pdf'), 'wb') as pdf_file:
        pdf_file.write(extract.content)

    # Pause between companies, kept from the original flow.
    sleep(5)


def CompanyGetInfo(querys, cities, method_search='city'):
    
    """
    =====================================================================
    
    Search companies on egrul.nalog.ru, save a text summary and the PDF
    extract of the matched record for each of them
    
    Parameters
    ----------
    querys : list of str, companies to match
    cities : str or list of str, city used to pick the matching record;
        a single string is applied to every query, a list is matched to
        querys by position
    method_search : str, 'city' keeps the record whose address contains
        the city (the first record when none does); any other value
        keeps the search result as returned
    
    Returns
    -------
    dict or None : search result JSON of the last processed query, None
        when no query produced a result. With one query (the way the
        notebook calls it) this is the same value as before.
    
    =====================================================================
    """
    
    session = rq.Session()
    result = None

    for position, query in enumerate(querys):

        # A plain string is ONE city; indexing it would yield one letter.
        city = cities if isinstance(cities, str) else cities[position]

        # The landing page is requested before every search, as in the
        # original flow (presumably to obtain session cookies - TODO confirm).
        session.get(_EGRUL_URL + "/index.html", headers=_egrul_headers())

        payload = _egrul_post_search(session, query)
        try:
            while _egrul_captcha_required(payload):
                _egrul_submit_captcha(session)
                payload = _egrul_post_search(session, query)
        except Exception:
            # Best effort, as before: a broken captcha dialog must not stop
            # the batch, but the reason is no longer hidden.
            traceback.print_exc()

        token = payload.get('t') if isinstance(payload, dict) else None
        if token is None:
            print('EGRUL search for %r returned no result token: %r' % (query, payload))
            continue

        sleep(0.5)
        jsn = _egrul_fetch_search_result(session, token)

        if method_search == "city":
            rows = jsn.get("rows") or []
            if len(rows) == 0:
                continue
            chosen = _egrul_pick_row(rows, city)
            chosen["pg"] = '1'
            # Any non-zero total makes the save step run; the exact value is
            # arbitrary as far as this code shows.
            chosen["tot"] = '8487671'
            jsn["rows"] = [chosen]

        result = jsn

        try:
            _egrul_save_extract(session, jsn["rows"][0])
        except Exception:
            traceback.print_exc()

    return result

In [None]:
def range(company_info, output_path="company_info.xlsx"):
    """
    =====================================================================
    
    Score companies from 0 to 5 in proportion to their authorised capital
    and save the scored table to an Excel file
    
    NOTE: the function keeps its original name, which shadows the builtin
    range() for every cell executed after this one. The body therefore
    avoids range() entirely. TODO: rename together with its call site.
    
    Parameters
    ----------
    company_info : list of dict or pd.DataFrame with a column
        "Уставной капитал" (numbers or numeric strings)
    output_path : str or None, Excel file to write; None skips the export
    
    Returns
    -------
    pd.DataFrame : copy of the input with "Уставной капитал" as float64
        and a new column "баллы"; the largest capital gets 5 points,
        unparseable capital gets 0
    
    =====================================================================
    """
    scored = pd.DataFrame(company_info, copy=True)

    # Values scraped from the PDF may be empty strings; treat them as unknown
    # instead of failing the whole conversion.
    capital = pd.to_numeric(scored["Уставной капитал"], errors="coerce").astype("float64")
    scored["Уставной капитал"] = capital

    largest = capital.max()
    if largest > 0:
        # Vectorised replacement for the element-wise chained assignment.
        scored["баллы"] = (capital * 5 / largest).fillna(0.0)
    else:
        # Empty input, all-unknown or all-zero capital: nothing to scale by.
        scored["баллы"] = 0.0

    if output_path is not None:
        # The original passed the attribute `company_info.xls`, which does not
        # exist; a file name was clearly intended. The .xlsx extension is used
        # because recent pandas versions can no longer write .xls.
        scored.to_excel(output_path)

    return scored

In [None]:
# Browser-like request headers passed to parse() further down.
# This is the same literal as the `headers` cell near the top of the notebook.
headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  
          }

In [None]:
# Read the nomenclature workbook and pre-process it with proceed_data().
data = proceed_data('poiskpostav_v1.xlsx')

In [None]:
# searchQuery() is defined outside this section; presumably it builds search
# queries from the nomenclature in 'gost' mode - TODO confirm.
# NOTE(review): `searcher` is not referenced by any later cell in this section.
searcher = searchQuery(data,mode='gost')

In [None]:
# parse() is defined outside this section; its result is consumed by
# identify() in the next cell, which indexes it by GOST.
dct = parse(data, headers=headers, mode='reasonable', source='all')

In [None]:
# Firm names and their cities for the offers found under one GOST.
querys, cities = identify(dct, 'ГОСТ 22704')

In [None]:
# Wrap every firm name in its own one-element list, because CompanyGetInfo()
# takes a list of queries.
queries = [[x] for x in querys]

In [None]:
# Look up only the first firm; cities[0] is a single city string.
jsn = CompanyGetInfo(queries[0], cities[0], method_search='city')

In [None]:
import PyPDF2

# Merge the first pages of every PDF in the working directory into one file.
MERGED_PDF = 'all-small.pdf'
MAX_PAGES_PER_FILE = 6

# NOTE(review): CompanyGetInfo() stores each extract inside a per-company
# sub-directory, which os.listdir('.') does not descend into - confirm the
# PDFs are copied next to the notebook before this cell runs.
# The merged output is excluded so that re-running the cell does not feed the
# previous result back into itself.
pdfFiles = sorted(
    filename for filename in os.listdir('.')
    if filename.endswith('.pdf') and filename != MERGED_PDF
)

pdfWriter = PyPDF2.PdfFileWriter()
openedFiles = []

try:
    for filename in pdfFiles:
        # Source files must stay open until the writer has produced the output.
        pdfFileObj = open(filename, 'rb')
        openedFiles.append(pdfFileObj)
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Take at most MAX_PAGES_PER_FILE pages; shorter documents used to
        # raise an index error. A while loop is used because this notebook
        # rebinds the name `range` to a scoring function.
        lastPage = min(MAX_PAGES_PER_FILE, pdfReader.getNumPages())
        pageNum = 0
        while pageNum < lastPage:
            pdfWriter.addPage(pdfReader.getPage(pageNum))
            pageNum += 1

    with open(MERGED_PDF, 'wb') as pdfOutput:
        pdfWriter.write(pdfOutput)
finally:
    for pdfFileObj in openedFiles:
        pdfFileObj.close()

In [None]:
import fitz 
import os

# Report every page of the merged PDF that contains the search term.
filename = "all-small.pdf"  
search_term = 'П'  

# The context manager closes the document, which the original left open.
# pages() / search_for() are the snake_case API already used by the text
# extraction cell; enumerate() avoids `range`, which this notebook rebinds.
with fitz.open(filename) as pdf_document:
    for current_page, page in enumerate(pdf_document.pages()):
        if page.search_for(search_term):
            print("%s найдено на странице %i" % (search_term, current_page+1))

In [None]:
# Map each 0-based page number of the merged PDF to the text of that page.
with fitz.open('all-small.pdf') as doc:
    text = {num: page.get_text() for num, page in enumerate(doc.pages())}
        

In [None]:
import re

# Field patterns. The page dict is turned into its repr below and every
# backslash is removed, so each line break of the PDF text shows up as a bare
# letter 'n' - that is why the patterns are delimited by 'n'.
pattern_company = 'nСокращенное наименование на русскомnязыкеnООО (.*?)n'
pattern_ustcapital = "nРазмер в рубляхn(.*?)n" 
pattern_mesto = 'nМесто нахождения юридического лицаn(.*?)n'
pattern_data = 'nДата регистрацииn(.*?)n'

# Flatten all pages into one string and strip backslashes and parentheses.
ter = str(text)
for unwanted in ('\\', ')', '('):
    ter = ter.replace(unwanted, '')

# Output column -> pattern that extracts it from one extract.
field_patterns = {
    "Компания": pattern_company,
    "Уставной капитал": pattern_ustcapital,
    "Место нахождения юридического лица": pattern_mesto,
    "Дата регистрации": pattern_data,
}

# One dict per extract: the text between the title and the register-records
# section; all matches of a field are concatenated into one string.
company_info = [
    {field: ''.join(re.findall(field_pattern, extract))
     for field, field_pattern in field_patterns.items()}
    for extract in re.findall("(?s)ВЫПИСКА(.*?)Сведения о записях, внесенных в Единый государственный реестр юридических лиц", ter)
]
print(company_info)

# NOTE(review): `pattern` is not assigned anywhere in this cell - confirm it
# is defined earlier in the notebook, otherwise this line raises NameError.
re.findall(pattern, ter)

In [None]:
# Append one extra record per extract holding the list of registration dates
# found in it (the key spelling is kept as is).
pattern_data = 'nДата регистрацииn(.*?)n'
extract_texts = re.findall("(?s)ВЫПИСКА(.*?)Сведения о записях, внесенных в Единый государственный реестр юридических лиц", ter)
company_info.extend(
    {"Дата регестрации": re.findall(pattern_data, extract_text)}
    for extract_text in extract_texts
)
print(company_info)


In [None]:

# Append one extra record per extract holding the list of legal addresses
# found in it.
extract_texts = re.findall("(?s)ВЫПИСКА(.*?)Сведения о записях, внесенных в Единый государственный реестр юридических лиц", ter)
company_info.extend(
    {"Место нахождения": re.findall(pattern_mesto, extract_text)}
    for extract_text in extract_texts
)
print(company_info)


In [None]:
# Score the companies parsed from the PDF extracts. `jsn` (the EGRUL search
# JSON) has no "Уставной капитал" column, so passing it raised a KeyError;
# the parsed `company_info` records are what the scoring function expects.
info = range(company_info)

In [None]:
# `company_info` is a list of dicts, so a column cannot be assigned to it
# directly; build a DataFrame under a new name and leave the list untouched.
company_scores = pd.DataFrame(company_info).assign(**{"баллы": 0})
company_scores