In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import re
import encodings

In [2]:
number_pattern = re.compile("\w*\d+\w*")
article_pattern = re.compile("\w*[d+,-.]\w+")

In [3]:
df = pd.read_csv("/home/rsolomatin/metalurgi/companies_info_v2.csv", index_col=0)
df["ИНН"] = df["ИНН"].astype(str)
df.shape

(7321, 8)

In [28]:
df.head()

Unnamed: 0,Индекс,Название,ИНН,ОГРН,Путь,ВК,Статус,Время
0,1,"ЕВРОКОЛОР, ООО",352526947648.0,1153525047092,"Sber_parser/1-101/data/ЕВРОКОЛОР, ООО_0",,-1,0.0
1,2,"ВЮМ, ООО",772503503336.0,1147746090887,"Sber_parser/1-101/data/ВЮМ, ООО_0",,1,1.529426
2,3,"НОПРОФ, ООО",781110795023.0,1037825017340,"Sber_parser/1-101/data/НОПРОФ, ООО_0",,1,5.654249
3,4,"СТАЙЕР, ООО МПК",741111765876.0,1117411000882,"Sber_parser/1-101/data/СТАЙЕР, ООО МПК_0",,-1,0.0
4,5,"ЗАМОК, ООО",590400687580.0,1105904005833,"Sber_parser/1-101/data/ЗАМОК, ООО_0",,-1,0.0


In [5]:
len(set(df["ИНН"])), len(set(df["Название"]))

(6617, 6391)

In [5]:
%%time

path = "/home/rsolomatin/metalurgi/Sber_parser"
path_inn = {path: inn for path, inn in zip(df["Путь"], df["ИНН"])}
data = []
for parent_directory in os.listdir(path):
    main_dir = os.path.join(path, f"{parent_directory}/data")
    for directory in tqdm(os.listdir(main_dir), desc=parent_directory):
        df_path = os.path.join("Sber_parser", parent_directory, "data", directory)
        inn = path_inn.get(df_path, None)
        subdir = os.path.join(main_dir, directory)
        for file in os.listdir(subdir):
            file_path = os.path.join(subdir, file)
            if not os.path.isfile(file_path):
                continue
            with open(file_path) as f:
                lines = f.readlines()
                for i, line in enumerate(lines):
                    words = line.split()
                    if len(words) <= 3:
                        continue
                    final_line = []
                    for word in words:
                        word = number_pattern.sub(r"", word)
                        word = article_pattern.sub(r"", word)
                        final_line.append(word)
                    data.append(
                        {
                            "path": file_path,
                            "inn": inn,
                            "sent_num": i,
                            "line": " ".join(final_line),
                        }
                    )

801-901: 100%|██████████| 42/42 [00:04<00:00,  8.48it/s]
6401-6601: 100%|██████████| 71/71 [00:16<00:00,  4.37it/s]
4001-4201: 100%|██████████| 88/88 [00:22<00:00,  3.86it/s]
6201-6401: 100%|██████████| 78/78 [00:08<00:00,  9.33it/s]
5201-5401: 100%|██████████| 78/78 [00:12<00:00,  6.24it/s]
3401-3601: 100%|██████████| 102/102 [00:26<00:00,  3.92it/s]
1201-1401: 100%|██████████| 96/96 [00:22<00:00,  4.32it/s]
1-101: 100%|██████████| 48/48 [00:20<00:00,  2.33it/s]
1001-1201: 100%|██████████| 105/105 [00:54<00:00,  1.94it/s]
201-301: 100%|██████████| 34/34 [00:05<00:00,  6.13it/s]
4801-5001: 100%|██████████| 75/75 [00:10<00:00,  7.18it/s]
5401-5601: 100%|██████████| 78/78 [00:17<00:00,  4.37it/s]
1601-1801: 100%|██████████| 86/86 [00:24<00:00,  3.57it/s]
2401-2601: 100%|██████████| 88/88 [00:27<00:00,  3.25it/s]
4201-4401: 100%|██████████| 82/82 [01:03<00:00,  1.28it/s]
401-501: 100%|██████████| 35/35 [00:14<00:00,  2.39it/s]
5001-5201: 100%|██████████| 85/85 [00:23<00:00,  3.63it/s]
680

CPU times: user 12min 20s, sys: 16.6 s, total: 12min 37s
Wall time: 15min 22s





In [6]:
len(data)

27415533

In [7]:
%%time
pd.DataFrame(data).to_csv("parsed_metalurgs.csv", index=False)

CPU times: user 2min 5s, sys: 8.7 s, total: 2min 14s
Wall time: 3min 2s


In [52]:
def get_company_first_sent(df):
    sent = df.groupby("path")["line"].first().reset_index()
    sent["company"] = sent["path"].str.split("/").str[-2]
    sent["folder"] = sent["path"].str.split("/").str[5]
    sent = sent.groupby("company").first().reset_index()
    sent["id"] = sent["company"].str.split("_").str[-2:]
    return sent

In [54]:
%%time
bad_companies = get_company_first_sent(pd.read_csv("parsed_metalurgs-Copy1.csv"))
good_companies = get_company_first_sent(pd.read_csv("parsed_metalurgs.csv"))

CPU times: user 2min 5s, sys: 5.12 s, total: 2min 11s
Wall time: 2min 22s


In [55]:
good_companies

Unnamed: 0,company,path,line,folder,id
0,"2 КУЛЬТУРЫ, ООО_1207800171061_0",/home/rsolomatin/metalurgi/Sber_parser/3601-38...,""" культуры Производство и оптовые поставки тов...",3601-3801,"[1207800171061, 0]"
1,"268 МЗ, ООО_1163702081476_0",/home/rsolomatin/metalurgi/Sber_parser/1001-12...,"""Главная, Металлообрабатывающий завод""",1001-1201,"[1163702081476, 0]"
2,"4 ИН, ООО_1207700112047_0",/home/rsolomatin/metalurgi/Sber_parser/4201-44...,""" — поставки металлорежущего инструмента ISCAR...",4201-4401,"[1207700112047, 0]"
3,"6 СОТОК, ООО СК_1187746311697_0",/home/rsolomatin/metalurgi/Sber_parser/1401-16...,"""Заборы в Москве с установкой под ключ | Заказ...",1401-1601,"[1187746311697, 0]"
4,"7 КООРДИНАТ, ООО_1185029003290_0",/home/rsolomatin/metalurgi/Sber_parser/2601-28...,"""Производство металлоизделий и металлоконструк...",2601-2801,"[1185029003290, 0]"
...,...,...,...,...,...
2860,"ЯРКА-СТРОЙ, ООО_1186658075702_0",/home/rsolomatin/metalurgi/Sber_parser/1001-12...,"""Изготовление металлоконструкций от производит...",1001-1201,"[1186658075702, 0]"
2861,"ЯРОСЛАВСКАЯ МЕТАЛЛООБРАБОТКА, ООО_1207600010760_0",/home/rsolomatin/metalurgi/Sber_parser/5801-60...,"""Проверены временем, воодушевлены работой""",5801-6001,"[1207600010760, 0]"
2862,"ЯРОСЛАВСКАЯ ТАРА, ООО_1027600514831_0",/home/rsolomatin/metalurgi/Sber_parser/201-301...,"""ÐÐ°Ð½Ð½ÑÐ¹ Ð´Ð¾Ð¼ÐµÐ½ Ð²Ð¾Ð·Ð¼Ð¾Ð¶Ð½Ð¾ Ð¿Ñ...",201-301,"[1027600514831, 0]"
2863,"ЯРСТИЛЬМЕТАЛЛ, ООО_1217600004797_0",/home/rsolomatin/metalurgi/Sber_parser/1201-14...,"""Опоры мебельные от производителя - Ножки от А...",1201-1401,"[1217600004797, 0]"


In [93]:
bad_companies.shape, good_companies.shape

((2865, 5), (2865, 5))

In [56]:
pairs = []
for _, good_comp in tqdm(good_companies.iterrows(), total=good_companies.shape[0]):
    for _, bad_comp in bad_companies.iterrows():
        if (
            good_comp["line"] == bad_comp["line"]
            and good_comp["folder"] == bad_comp["folder"]
        ):
            pairs.append((good_comp["company"], bad_comp["company"]))
            if (
                good_comp["company"] == bad_comp["company"]
                or good_comp["id"] == bad_comp["id"]
            ):
                break

100%|██████████| 2865/2865 [05:01<00:00,  9.50it/s]


In [57]:
pairs = pd.DataFrame(pairs)

In [89]:
pairs.to_csv("pairs.csv")

In [74]:
pairs["good"].nunique(), pairs["bad"].nunique()

(2864, 2864)

In [4]:
pairs = pd.read_csv("pairs.csv", usecols=["good", "bad"])

In [72]:
pairs.groupby("good").filter(lambda x: x["bad"].count() > 1)["good"].unique()

array([], dtype=object)

In [115]:
for _, good in tqdm(good_companies.iterrows(), total=good_companies.shape[0]):
    if not good["company"] in pairs["good"].values:
        print(good)

100%|██████████| 2865/2865 [00:00<00:00, 7059.96it/s]

company    ЭЙДЖЕС. САНКТ-ПЕТЕРБУРГСКАЯ ИСТОРИЧЕСКАЯ КОЛЛЕ...
path       /home/rsolomatin/metalurgi/Sber_parser/2201-24...
line                           "Оловянные солдатики —  AGES"
folder                                             2201-2401
id                                        [5067847018553, 0]
Name: 2779, dtype: object





In [113]:
for _, bad in tqdm(bad_companies.iterrows(), total=bad_companies.shape[0]):
    if not bad["company"] in pairs["bad"].values:
        print(bad)

 83%|████████▎ | 2374/2865 [00:00<00:00, 7985.89it/s]

company                    ДОН-КОМПЛЕКТ, ООО_1106193003300_0
path       /home/rsolomatin/metalurgi/Sber_parser/3401-36...
line                       "Чугунный патрубок  конец ПФГ - "
folder                                             3401-3601
id                                        [1106193003300, 0]
Name: 303, dtype: object


100%|██████████| 2865/2865 [00:00<00:00, 7941.58it/s]


In [17]:
df["company"] = df["Путь"].str.split("/").str[-1]
df["folder"] = df["Путь"].str.split("/").str[1]

In [19]:
pairs["inn"] = np.nan
for index, row in tqdm(pairs.iterrows(), total=pairs.shape[0]):
    pairs.loc[index, "inn"] = df[df["company"] == row["good"]]["ИНН"].values[0]
    pairs.loc[
        index, "old_path"
    ] = f'/home/rsolomatin/metalurgi/Sber_parser/{df[df["company"]==row["good"]]["folder"].values[0]}/data/{row["bad"]}'

100%|██████████| 2864/2864 [00:04<00:00, 597.69it/s]


In [25]:
pairs.to_csv("pairs_with_inn.csv")