In [29]:
import pandas as pd
import numpy as np
import difflib as dl
from spacy.lang.ru.stop_words import STOP_WORDS
from spacy.lang.ru import Russian
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pickle

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Register the `progress_apply` methods on pandas objects.
tqdm.pandas()
# Russian spaCy language object; `normalize` calls it to tokenize text.
lang = Russian()
# Do not truncate long strings when displaying frames.
pd.set_option('max_colwidth', None)

In [7]:
# Load the input table from the Excel workbook using the openpyxl engine.
# Later cells read the 'Отрасль' ("industry") and 'наименование деятельности'
# ("activity name") columns from this frame.
df = pd.read_excel('all_df.xlsx', engine='openpyxl')

### 1. Сходство по целым строкам

In [9]:
def _string_ratio(pair):
    """Return the difflib similarity ratio of the two strings held in ``pair``."""
    left, right = pair
    return dl.SequenceMatcher(None, left, right).ratio()


# Whole-string similarity between the two text columns, computed row by row.
df['String_similarity'] = df[['Отрасль', 'наименование деятельности']].apply(_string_ratio, axis=1)

### 2. Максимальное сходство по словам

In [10]:
def normalize(text):
    """Tokenize ``text`` and return its lower-cased content tokens.

    The text is passed through the module-level ``lang`` object; tokens flagged
    as stop words or punctuation are dropped, and the remaining tokens are
    returned as lower-cased strings in their original order.
    """
    return [
        str(token).lower()
        for token in lang(text)
        if not (token.is_stop or token.is_punct)
    ]

In [11]:
def similarity(s1, s2):
    """Find, for every word of ``s2``, its best fuzzy match among the words of ``s1``.

    Both strings are tokenized with ``normalize``. Each word of ``s2`` is
    compared against every word of ``s1`` with ``difflib.SequenceMatcher`` and
    the highest ratio is kept.

    Returns:
        A list with one ratio per normalized word of ``s2``, in order. If
        ``s1`` has no words left after normalization every ratio is 0.0; if
        ``s2`` has none the list is empty.
    """
    # Tokenize s1 once: it does not change while iterating over the words of s2.
    words_s1 = normalize(s1)
    return [
        max(
            (dl.SequenceMatcher(None, word_s1, word_s2).ratio() for word_s1 in words_s1),
            default=0.0,  # nothing to compare against -> no similarity, not a ValueError
        )
        for word_s2 in normalize(s2)
    ]

In [12]:
# Per-row list of best-match ratios: one entry for each normalized word of
# 'наименование деятельности', matched against the words of 'Отрасль'.
df['similarity'] = df[['Отрасль', 'наименование деятельности']].apply(
    lambda pair: similarity(pair.iloc[0], pair.iloc[1]),
    axis=1,
)

In [13]:
# Spot check: highest word ratio for the row labelled 581934 (hard-coded label).
max(df.loc[581934, 'similarity'])

0.8888888888888888

In [14]:
# Highest word-level ratio per row. An empty ratio list (no content words in
# 'наименование деятельности') yields 0.0 instead of raising ValueError.
df['Max_similarity'] = df['similarity'].apply(lambda ratios: max(ratios, default=0.0))

### 3. Среднее сходство по словам 

In [16]:
# Mean of the word-level ratios stored in each row's 'similarity' list.
df['Average_similarity'] = df['similarity'].map(np.average)

### 4. Количество слов, сходство которых не меньше среднего

In [17]:
def more_then_average(sim):
    """Count the ratios in ``sim`` that are greater than or equal to its mean.

    Args:
        sim: sequence of numeric similarity ratios.

    Returns:
        The number of elements ``>=`` the average of ``sim``; 0 for an empty
        sequence.
    """
    if len(sim) == 0:
        return 0
    # Compute the mean once instead of once per element.
    mean_ratio = np.average(sim)
    return sum(1 for ratio in sim if ratio >= mean_ratio)

In [18]:
# Number of word ratios per row that reach at least the row's own mean ratio.
df['More_then_average'] = df['similarity'].map(more_then_average)

### 5. Количество слов в первой и второй строке (без стоп-слов)

In [19]:
# Token counts of each text column after `normalize` has removed stop words
# and punctuation.
df['String_1'] = df['Отрасль'].map(normalize).str.len()
df['String_2'] = df['наименование деятельности'].map(normalize).str.len()

In [20]:
# Drop the index column carried over from the Excel file and the intermediate
# per-word ratio lists. `errors='ignore'` keeps the cell re-runnable: a second
# execution (or an input without 'Unnamed: 0') no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'similarity'], errors='ignore')

### 6. BERT + Cosine similarity

In [21]:
# Load the sentence-embedding model from a local directory and place it on the
# GPU; `device='cuda'` is requested explicitly.
model = SentenceTransformer('../bert_for_sentence', device='cuda')



In [24]:
%%time
df['cosine_similarity'] = df[['Отрасль', 'наименование деятельности']].progress_apply(lambda x: cosine_similarity(model.encode(x[0]).reshape(1, -1), model.encode(x[1]).reshape(1, -1)).squeeze(), axis=1)

100%|██████████| 581935/581935 [3:20:46<00:00, 48.31it/s]   

CPU times: user 3h 3min 33s, sys: 5min 4s, total: 3h 8min 37s
Wall time: 3h 20min 46s





In [46]:
df.sort_values(['predict'], ascending=False).to_excel('final_df.xlsx')

In [26]:
df.to_excel('581_935_features.xlsx')

In [30]:
random_forest = pickle.load(open('RandomForest_model.sav', 'rb'))

In [31]:
lgbm = pickle.load(open('lgbm_model.sav', 'rb'))

In [33]:
df['rf_pred'] = random_forest.predict_proba(df[['String_similarity', 'Max_similarity', 'Average_similarity',
                                'More_then_average', 'String_1', 'String_2', 'cosine_similarity']])[:, 1]

In [41]:
df['cosine_similarity'] = df['cosine_similarity'].astype('float64')

In [42]:
df['predict'] = lgbm.predict_proba(df[['String_similarity', 'Max_similarity', 'Average_similarity',
                                'More_then_average', 'String_1', 'String_2', 'cosine_similarity', 'rf_pred']])[:, 1]