In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; every data path used below lives under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from typing import Dict
import json
import pickle


def clean_json(json_str: str) -> list:
    """Parse a JSON array whose tail may be cut off or missing its closing bracket.

    Everything after the last ``}`` is discarded and a ``]`` is appended, so
    only the objects that are complete in the text are kept.

    Args:
        json_str: Text of a JSON array of objects, possibly truncated.

    Returns:
        The parsed list. It is empty when ``json_str`` contains no ``}`` at
        all (for example ``"[]"``, ``""`` or an array cut inside its first
        object).

    Raises:
        json.JSONDecodeError: If the repaired text is still not valid JSON.
    """
    last_brace = json_str.rfind("}")
    if last_brace == -1:
        # No complete object to keep. This case used to raise ValueError.
        return []
    return json.loads(json_str[:last_brace + 1] + "]")


def get_characts(obj):
    """Collect the (Name, Value) characteristic pairs of one item.

    Args:
        obj: Mapping-like item record that has a 'Характеристики СТЕ' entry
            holding either a missing value or JSON text.

    Returns:
        A set of ``(Name, Value)`` tuples. It is empty when the entry is
        missing; parsed entries lacking either key are ignored.
    """
    raw_characts = obj['Характеристики СТЕ']
    if pd.isna(raw_characts):
        return set()

    pairs = set()
    for entry in clean_json(raw_characts):
        try:
            pair = (entry['Name'], entry['Value'])
        except KeyError:
            # Entry without a name or a value carries no usable characteristic.
            continue
        pairs.add(pair)
    return pairs


def one_based_connected(id: int, topn: int):
    """Rank items related to ``id`` and return the ids of the ``topn`` best.

    Candidates come from two sources:

    * every item that ``categories`` lists for the categories which ``top8``,
      ``top5`` and ``top3`` map the item's own 'Категория' to;
    * every ``OtherSkuId`` found in the item's
      'Другая продукция в контрактах' field.

    A candidate's score is the square root of the number of (Name, Value)
    characteristic pairs it shares with the item, divided by 8, 5 or 3 for
    the respective tier and left undivided for contract products.

    Args:
        id: Key of the query item in ``id2obj``.
        topn: Maximum number of candidate ids to return.

    Returns:
        Candidate ids sorted by descending score; ties keep the order in
        which the candidates were first seen.

    Raises:
        KeyError: If ``id`` is not in ``id2obj`` or a tier/category lookup
            fails.
    """
    # NOTE(review): nothing here excludes `id` itself from the candidates;
    # confirm whether an item may appear in its own category listing.
    obj = id2obj[id]
    characts = get_characts(obj)
    candidates = {}
    overlap_cache = {}

    def _overlap_score(cand):
        # A candidate can show up in several tiers; parse its characteristics once.
        if cand not in overlap_cache:
            overlap_cache[cand] = len(characts & get_characts(id2obj[cand])) ** 0.5
        return overlap_cache[cand]

    # Order matters: a later tier overwrites the score an earlier tier gave
    # the same candidate, so the top3 weight (1/3) is applied last.
    for tier, divisor in ((top8, 8), (top5, 5), (top3, 3)):
        for category in tier[obj['Категория']]:
            for cand in categories[category]:
                candidates[cand] = _overlap_score(cand) / divisor

    # Products from the contracts field get the undivided score and therefore
    # override any tier score assigned above.
    st_others = obj['Другая продукция в контрактах']
    if not pd.isna(st_others) and len(st_others.strip()) > 0:
        for prod in clean_json(st_others):
            try:
                connected_obj = prod['OtherSkuId']
                connected_characts = get_characts(decoder[connected_obj])
            except KeyError:
                # e.g. no 'OtherSkuId' in the entry, or an id absent from `decoder`.
                continue
            candidates[connected_obj] = len(characts & connected_characts) ** 0.5

    # sorted() is stable even with reverse=True, so ties keep insertion order.
    ranked = sorted(candidates, key=candidates.get, reverse=True)
    return ranked[:topn]


# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you produced yourself.
_PICKLE_DIR = '/content/drive/MyDrive/tenderhack'


def _load_pickle(name):
    """Return the object stored in ``<_PICKLE_DIR>/<name>.pickle``."""
    with open(f'{_PICKLE_DIR}/{name}.pickle', 'rb') as handle:
        return pickle.load(handle)


# Lookup tables read by one_based_connected() and the matrix cells below.
id2obj = _load_pickle('id2obj')          # indexed by item id, yields the item record
categories = _load_pickle('categories')  # indexed by category, yields candidate ids
connected_keys = _load_pickle('connected_keys')  # NOTE(review): not referenced in the visible cells
decoder = _load_pickle('decoder')        # indexed by 'OtherSkuId', yields an item record
top3 = _load_pickle('top3')              # indexed by category, yields related categories
top5 = _load_pickle('top5')
top8 = _load_pickle('top8')

# USAGE EXAMPLE
ids = one_based_connected(34172198, 10)


In [None]:
# Load the base dataset from Drive into a DataFrame.
data = pd.read_excel('/content/drive/MyDrive/tenderhack/dataset_base.xlsx')  # FILEPATH

In [None]:
# Item identifier column; the next cell turns it into an id -> position lookup.
matrix_keys = data['Идентификатор СТЕ']

In [None]:
# Map every item identifier to its position in `matrix_keys`. With duplicate
# identifiers the last position wins. Iterating the Series yields Python
# scalars, which hash and compare equal to the numpy scalars used for lookups.
matrix_decoder = {key: position for position, key in enumerate(matrix_keys)}

In [None]:
# real_matr[i][j] == 1 when the item at dataset position i lists the item at
# position j in its 'Другая продукция в контрактах' field.
MATRIX_SIZE = 2000  # only dataset positions 0..1999 fit into the matrix
real_matr = [[0] * MATRIX_SIZE for _ in range(MATRIX_SIZE)]
for _, obj in data.iterrows():
    row = matrix_decoder.get(obj['Идентификатор СТЕ'])
    if row is None or row >= MATRIX_SIZE:
        # No cell of the matrix belongs to this item, so skip parsing it.
        continue
    st_others = obj['Другая продукция в контрактах']
    if pd.isna(st_others) or len(st_others.strip()) == 0:
        continue
    for prod in clean_json(st_others):
        try:
            connected_obj = prod['OtherSkuId']
        except KeyError:
            continue
        # Products unknown to `decoder` were skipped before (through a
        # swallowed KeyError); keep that filter without parsing their
        # characteristics, which this cell never used.
        if connected_obj not in decoder:
            continue
        col = matrix_decoder.get(connected_obj)
        if col is not None and col < MATRIX_SIZE:
            real_matr[row][col] = 1

In [None]:
# Persist the matrix to Drive as CSV (to_csv also writes the index and header by default).
pd.DataFrame(real_matr).to_csv('/content/drive/MyDrive/tenderhack/real_matr.csv')

In [10]:
# top3_matr[i][j] == 1 when the item at position j belongs to one of the
# categories that `top3` maps the category of the item at position i to.
MATRIX_SIZE = 2000  # only dataset positions 0..1999 fit into the matrix
top3_matr = [[0] * MATRIX_SIZE for _ in range(MATRIX_SIZE)]
for _, obj in data.iterrows():
    row = matrix_decoder[obj['Идентификатор СТЕ']]
    if row >= MATRIX_SIZE:
        # Nothing can be written for this item; skip its candidate scan.
        continue
    for category in top3[obj['Категория']]:
        for cand in categories[category]:
            col = matrix_decoder[cand]
            if col < MATRIX_SIZE:
                top3_matr[row][col] = 1
pd.DataFrame(top3_matr).to_csv('/content/drive/MyDrive/tenderhack/top3_matr.csv')

In [None]:
# top5_matr[i][j] == 1 when the item at position j belongs to one of the
# categories that `top5` maps the category of the item at position i to.
MATRIX_SIZE = 2000  # only dataset positions 0..1999 fit into the matrix
top5_matr = [[0] * MATRIX_SIZE for _ in range(MATRIX_SIZE)]
for _, obj in data.iterrows():
    row = matrix_decoder[obj['Идентификатор СТЕ']]
    if row >= MATRIX_SIZE:
        # Nothing can be written for this item; skip its candidate scan.
        continue
    for category in top5[obj['Категория']]:
        for cand in categories[category]:
            col = matrix_decoder[cand]
            if col < MATRIX_SIZE:
                top5_matr[row][col] = 1
pd.DataFrame(top5_matr).to_csv('/content/drive/MyDrive/tenderhack/top5_matr.csv')

In [None]:
# top8_matr[i][j] == 1 when the item at position j belongs to one of the
# categories that `top8` maps the category of the item at position i to.
MATRIX_SIZE = 2000  # only dataset positions 0..1999 fit into the matrix
top8_matr = [[0] * MATRIX_SIZE for _ in range(MATRIX_SIZE)]
for _, obj in data.iterrows():
    row = matrix_decoder[obj['Идентификатор СТЕ']]
    if row >= MATRIX_SIZE:
        # Nothing can be written for this item; skip its candidate scan.
        continue
    for category in top8[obj['Категория']]:
        for cand in categories[category]:
            col = matrix_decoder[cand]
            if col < MATRIX_SIZE:
                top8_matr[row][col] = 1
pd.DataFrame(top8_matr).to_csv('/content/drive/MyDrive/tenderhack/top8_matr.csv')

In [13]:
# charact_matr[i][j] is the number of (Name, Value) characteristic pairs shared
# by the items at dataset positions i and j.
MATRIX_SIZE = 2000  # only dataset positions 0..1999 are compared
# Parse each item's characteristics once; doing it inside the double loop
# repeated the JSON parsing for every one of the MATRIX_SIZE**2 pairs.
charact_sets = [get_characts(id2obj[matrix_keys[i]]) for i in range(MATRIX_SIZE)]
charact_matr = [
    [len(left & right) for right in charact_sets]
    for left in charact_sets
]

pd.DataFrame(charact_matr).to_csv('/content/drive/MyDrive/tenderhack/charact_matr.csv')

In [None]:
# One record per ordered pair (i, j), in row-major order, holding that pair's
# value from each matrix: [real, top3, top5, top8, charact].
layers = (real_matr, top3_matr, top5_matr, top8_matr, charact_matr)
edges = [
    [layer[src][dst] for layer in layers]
    for src in range(2000)
    for dst in range(2000)
]

In [None]:
# One column per matrix, in the same order the values were placed in each `edges` row.
edges_df = pd.DataFrame(edges, columns = ['real', 'top3', 'top5', 'top8', 'charact'])

In [None]:
# Pairwise correlation between the five edge columns; displayed as the cell output.
edges_df.corr()