In [3]:
import io

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

# from textblob import TextBlob

from nltk import word_tokenize

from nltk.corpus import stopwords

from pymystem3 import Mystem  # lemmatizing from yandex
import spacy
import es_core_news_sm
import en_core_web_sm
import fr_core_news_sm

import requests, matplotx
import pandas as pd
import matplotlib.pyplot as plt

import razdel

import string

import math as m

import re
import seaborn as sns
import numpy as np

from scipy import spatial
from matplotlib import pyplot as plt

import razdel
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Предобработка текста

In [4]:
spec_chars = string.punctuation + '\n\xa0«»\t—'

In [5]:
def extract_text_from_pdf(pdf_path):
    """
    Функция считывающая текст из pdf-файла в строку python
    
    Ввод: путь до файла на компьютере
    Вывод: строка python
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()

    converter.close()
    fake_file_handle.close()

    if text:
        return text
    
    
def remove_chars_from_text(text, chars):
    return "".join([ch for ch in text if ch not in chars])




def count_syllables(text, lang):
    if lang == 'rus':
        literas = 'аоиыуэ'
    elif lang == 'spa':
        literas = 'aеiоuáíóúé'
    elif lang == 'eng':
        literas = 'aeiou'
    elif lang == 'fra':
        literas = 'aeiouáíóúé'
    else:
        raise ValueError
        
    count = 0
    for i in text:
        if i in literas:
            count += 1
    return count


def del_stopwords(lang, text_tokens):
    if lang == 'rus':
        lang_stopwords = stopwords.words("russian")
    elif lang == 'spa':
        lang_stopwords = stopwords.words("spanish")
    elif lang == 'eng':
        lang_stopwords = stopwords.words("english")
    elif lang == 'fra':
        lang_stopwords = stopwords.words('french')
    else:
        raise ValueError

    filtered_tokens = []

    for token in text_tokens:
        if token not in lang_stopwords:
            filtered_tokens.append(token)

    #     print(filtered_tokens)
    return (filtered_tokens)


def lemmatize(lang, text):
    if lang == 'rus':
        m = Mystem()
        lemmas = m.lemmatize(text)
        lemmatize_str = "".join(lemmas).strip()
    elif lang == 'spa':
        nlp = es_core_news_sm.load()
        list = []
        for token in nlp(text):
            list.append(token.lemma_)
        lemmatize_str = ' '.join(list)
    elif lang == 'eng':
        nlp = en_core_web_sm.load()
        list = []
        for token in nlp(text):
            list.append(token.lemma_)
        lemmatize_str = ' '.join(list)
    elif lang == 'fra':
        nlp = fr_core_news_sm.load()
        list = []
        for token in nlp(text):
            list.append(token.lemma_)
        lemmatize_str = ' '.join(list)
    return lemmatize_str




In [6]:
def text_preprocessing(path, lang):
    
    text = extract_text_from_pdf(path) # Прочитали текст из PDF
    sent_text = list(x.text for x in razdel.sentenize(text)) # Разделили строку на массив строк - предложений
    
    num_of_sent = len(sent_text)
    print("Number of sentences: ",  num_of_sent)
    
    text = remove_chars_from_text(text, spec_chars) # Удалили из текста специальные символы
    text_without_spaces = remove_chars_from_text(text, ' ') # Удалили из текста пробелы
    num_of_symbols = len(text_without_spaces)
    print("Number of symbols: ",  num_of_symbols)
    
    
    text = text.lower() # Привели текст к нижнему регистру
    
    num_of_syllables = count_syllables(text_without_spaces, lang) # Количество слогов во всем тексте
    print("Number of syllables: ",  num_of_syllables)
    
    text_tokens = word_tokenize(text) # Разделили строку на массив строк - слов
    count_tokens_with_stopwords = len(text_tokens) # Количество слов во всем тексте
    print("Number of tokens with stopword: ",  count_tokens_with_stopwords)
    
    count_words_w_3_syllables = 0
    for word in text_tokens:
        w_cnt_syl_3 = count_syllables(str(word), lang)
        if w_cnt_syl_3 >= 3:
            count_words_w_3_syllables += 1
            
    print("Number of tokens with 3 syllables: ",  count_words_w_3_syllables) # Количество слов с 3 слогами или больше
    
    filtered_tokens = del_stopwords(lang, text_tokens) # Удалили из массива слов стоп-слова
    count_tokens = len(filtered_tokens)
    print("Number of tokens without stopword: ",  count_tokens)
    
    filtered_string = ' '.join(filtered_tokens) # Преобразовали список в строку
    
    lemmatize_str = lemmatize(lang, filtered_string) # Привели слова к леммам
    lemmatize_tokens = lemmatize_str.split() 
    
    unique_tokens = list(set(lemmatize_tokens))
    unique_tokens_with_stopwords = list(set(text_tokens))
    count_unique_tokens_with_stopwords = len(unique_tokens_with_stopwords)
    count_unique_tokens = len(unique_tokens)
    print("Number of unique tokens without stopword: ",  count_unique_tokens)
    print("Number of unique tokens with stopword: ",  count_unique_tokens_with_stopwords)
    
    res = {
        'num_of_sent' : num_of_sent,
        'num_of_symbols' : num_of_symbols,
        'num_of_syllables' : num_of_syllables,
        'count_tokens_with_stopwords' : count_tokens_with_stopwords,
        'count_words_w_3_syllables' : count_words_w_3_syllables,
        'count_tokens' : count_tokens,
        'count_unique_tokens' : count_unique_tokens,
        'count_unique_tokens_with_stopwords' : count_unique_tokens_with_stopwords,
        'lemmatize_tokens' : lemmatize_tokens,
        'unique_tokens' : unique_tokens,
        'text_tokens' : text_tokens
    }

    return res
    

In [7]:
link = "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/5_Непрерывность_парков.pdf" 
lang = "rus"
year = 1959

res = text_preprocessing(link, lang)

Number of sentences:  27
Number of symbols:  2477
Number of syllables:  746
Number of tokens with stopword:  459
Number of tokens with 3 syllables:  103
Number of tokens without stopword:  307
Number of unique tokens without stopword:  237
Number of unique tokens with stopword:  326


# Подсчет частотности

In [224]:
lemmatize_tokens_dict = {}
for word in res['lemmatize_tokens']:
        if word in lemmatize_tokens_dict:
            lemmatize_tokens_dict[word] = lemmatize_tokens_dict[word] + 1
        else:
            lemmatize_tokens_dict[word] = 1

In [225]:
def batch(iterable, n=50):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]


In [226]:
def do_dict_freq(unique_tokens, lang, year):
    inf_unique_tokens = [x + '_INF' for x in unique_tokens]
    list_batches = batch(inf_unique_tokens)
    num_batches = (len(list(list_batches)))
    list_batches = batch(inf_unique_tokens)
    cnt = 0
    dict_freq = {}
    for item in list_batches:
        cnt += 1
        print('Batch', cnt, 'of',  num_batches)
        inf_unique_tokens_str = ','.join(item)

        if lang == 'rus':
            params = {
                "content": inf_unique_tokens_str,
                "year_start": str(year - 1),
                "year_end": str(year),
                "corpus": "ru-2019"
            }
        elif lang == 'spa':
            params = {
                "content": inf_unique_tokens_str,
                "year_start": str(year - 1),
                "year_end": str(year),
                "corpus": "es-2019"
            }
        elif lang == 'eng':
            params = {
                "content": inf_unique_tokens_str,
                "year_start": str(year - 1),
                "year_end": str(year),
                "corpus": "en-2019"
            }
        elif lang == 'fra':
            params = {
                "content": inf_unique_tokens_str,
                "year_start": str(year - 1),
                "year_end": str(year),
                "corpus": "fr-2019"
            }

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.125 Safari/537.36",
        }

        r = requests.get("https://books.google.com/ngrams/json", params=params, headers=headers, timeout=1000)
        html = r.text
        time_series = pd.read_json(html, typ="series")

        for i in time_series:
            if i['type'] == 'EXPANSION' and len(i['timeseries']) == 2:
                if i['parent'][:int(i['parent'].find("_"))] not in dict_freq:
                    dict_freq[i['parent'][:int(i['parent'].find("_"))]] = [i['timeseries'][1]*100, 1]
                else:
                    dict_freq[i['parent'][:int(i['parent'].find("_"))]][0] = dict_freq[i['parent'][:int(i['parent'].find("_"))]][0] \
                                                                            + i['timeseries'][1]*100
                    dict_freq[i['parent'][:int(i['parent'].find("_"))]][1] = dict_freq[i['parent'][:int(i['parent'].find("_"))]][1] + 1


    return dict_freq


In [227]:
dict_freq = do_dict_freq(res['unique_tokens'], lang, year)

Batch 1 of 5
Batch 2 of 5
Batch 3 of 5
Batch 4 of 5
Batch 5 of 5


In [228]:
words_calculated_freq = {}
for i in dict_freq:
    words_calculated_freq[i] = dict_freq[i][0]/dict_freq[i][1]
    

def most_least_common(words_calculated_freq, n=10):
    sorted_freq = dict(sorted(words_calculated_freq.items(), key=lambda item: item[1]))
    from collections import Counter
    d = Counter(words_calculated_freq)


    sum1 = 0
    for k, v in d.most_common(n):
        sum1 += v
        
    most_common = d.most_common(n)
    least_common = d.most_common()[:-n-1:-1]
    
    return most_common, least_common

most_common, least_common = most_least_common(words_calculated_freq)

df_most_common = pd.DataFrame(most_common, columns =['Word', 'Frequency'])
display(df_most_common)

df_least_common = pd.DataFrame(least_common, columns =['Word', 'Frequency'])
display(df_least_common)

Unnamed: 0,Word,Frequency
0,женщина,0.014834
1,северный,0.012934
2,кресло,0.009383
3,весь,0.007758
4,другой,0.007524
5,входить,0.00723
6,самый,0.006995
7,щека,0.00679
8,слово,0.006776
9,человек,0.006611


Unnamed: 0,Word,Frequency
0,строка,6.378881e-08
1,мир,1.080981e-07
2,кто,1.107733e-07
3,голова,1.439822e-07
4,мужчина,1.454684e-07
5,дверь,2.981213e-07
6,дело,2.989573e-07
7,жажда,3.071346e-07
8,смысл,3.728413e-07
9,лестница,3.95586e-07


In [229]:
sum = 0
n = 0
for i in words_calculated_freq:
    sum = sum + lemmatize_tokens_dict[i] * words_calculated_freq[i]
    n = n + lemmatize_tokens_dict[i]

avg_freq = sum / n
print('The average frequency of use in the corpus: {:.10f} %'.format(avg_freq)) 

The average frequency of use in the corpus: 0.0012439060 %


# Расчет статистических характеристик

In [8]:
num_of_sent = res['num_of_sent']
num_of_symbols = res['num_of_symbols']
num_of_syllables = res['num_of_syllables']
count_tokens_with_stopwords = res['count_tokens_with_stopwords']
count_words_w_3_syllables = res['count_words_w_3_syllables']
count_tokens = res['count_tokens']
count_unique_tokens = res['count_unique_tokens']
count_unique_tokens_with_stopwords = res['count_unique_tokens_with_stopwords']
lemmatize_tokens = res['lemmatize_tokens']
unique_tokens = res['unique_tokens']
text_tokens = res['text_tokens']

In [10]:
Lp = count_tokens_with_stopwords / num_of_sent
print("Average sentence length:", round(Lp, 2))
Lpp = num_of_symbols / count_tokens_with_stopwords
print("Average word length:", round(Lpp, 2))
Pp = (count_tokens_with_stopwords - count_tokens) / count_tokens_with_stopwords
print("Frequency of use of stop words:", round(Pp, 2))
TTR = count_unique_tokens_with_stopwords / count_tokens_with_stopwords
print("Lexical diversity (Type-Token Ratio):", round(TTR, 2))
R =  count_unique_tokens_with_stopwords / m.sqrt(count_tokens_with_stopwords)
print("Lexical diversity (Guiraud's Root TTR):", round(R, 2))
U = (m.log(count_tokens_with_stopwords))**2 / (m.log(count_tokens_with_stopwords) - m.log(count_unique_tokens_with_stopwords))
print("Lexical diversity (Dugast's Uber Index):", round(U, 2))

Average sentence length: 17.0
Average word length: 5.4
Frequency of use of stop words: 0.33
Lexical diversity (Type-Token Ratio): 0.71
Lexical diversity (Guiraud's Root TTR): 15.22
Lexical diversity (Dugast's Uber Index): 109.79


In [11]:
result = {}    
for word in text_tokens:
    result[word] = result[word] + 1 if word in result else 1

df_res = pd.DataFrame(result.items(), columns=['word', 'count'])
a_df = df_res.groupby(['count']).count()
a_df = a_df.reset_index()
# display(a_df)
# print(a_df.shape)
k_sum = 0
for i in range(a_df.shape[0]):
#     print(a_df.loc[i, 'count'])
    k_sum = k_sum + int(a_df.loc[i, 'word']) * ((int(a_df.loc[i, 'count'])/count_tokens_with_stopwords))**2
    
K = 10**4 *(-1/count_tokens_with_stopwords+k_sum)
print("Lexical diversity (Yule's K):", round(K, 2))

Lexical diversity (Yule's K): 59.05


In [12]:
FK = 0.39 * (count_tokens_with_stopwords/num_of_sent) + 11.8 * (num_of_syllables / count_tokens_with_stopwords) - 15.59
print("Readability Index (Flesch–Kincaid):", round(FK, 2))
G = 0.4 * ((count_tokens_with_stopwords/num_of_sent) + 100 * ( count_words_w_3_syllables / count_tokens_with_stopwords))
print("Readability Index (Gunning Fog):", round(G, 2))

Readability Index (Flesch–Kincaid): 10.22
Readability Index (Gunning Fog): 15.78


# Растчет коэффициента семантической схожести текстов

In [87]:
model_st = SentenceTransformer('distiluse-base-multilingual-cased')

In [171]:
def extract_text_from_pdf(pdf_path):
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

            text = fake_file_handle.getvalue()

        converter.close()
        fake_file_handle.close()

        if text:
            return text
        
def clean_text(text):
        new_text = re.sub('\n', ' ', text)
        new_text = re.sub(" +", " ", new_text)
        sent = list(x.text for x in razdel.sentenize(new_text))
        return sent
    
    
def get_batch(iter1, iter2, batch_size):
        l1 = len(iter1)
        l2 = len(iter2)
        k = int(round(batch_size * l2/l1))    
        kdx = 0 - k
        for ndx in range(0, l1, batch_size):
            kdx += k
            yield iter1[ndx:min(ndx + batch_size, l1)], iter2[kdx:min(kdx + k, l2)]
            
            
def get_sim_matrix(vec1, vec2, window=10):
    sim_matrix=np.zeros((len(vec1), len(vec2)))
    k = len(vec1)/len(vec2)
    for i in range(len(vec1)):
        for j in range(len(vec2)):
            if (j*k > i-window) & (j*k < i+window):
              sim = 1 - spatial.distance.cosine(vec1[i], vec2[j])
              sim_matrix[i,j] = sim
    return sim_matrix
    
def get_pairs(ru_lines, de_lines, sim_matrix, threshold):
        ru = []
        de = []
        sims = []
        for i in range(sim_matrix.shape[0]):
            for j in range(sim_matrix.shape[1]):
                if sim_matrix[i,j] >= threshold:
                    ru.append(ru_lines[i])
                    de.append(de_lines[j])
                    sims.append(sim_matrix[i,j])                
        return ru, de, sims
    
    


In [172]:
def calculate_sim(path1, path2):
    text1 = extract_text_from_pdf(path1)
    text2 = extract_text_from_pdf(path2)
    
    sent1 = clean_text(text1)
    sent2 = clean_text(text2)
    
    
    batch_number = 0
    total_pairs = 0
    batch_size = 50
    window = 10
    threshold = 0.3
    
    vectors1, vectors2 = [], []
    
    for lines1_batch, lines2_batch in get_batch(sent1, sent2, batch_size):
        batch_number += 1
        vectors1 = [*vectors1, *model_st.encode(lines1_batch)]
        vectors2 = [*vectors2, *model_st.encode(lines2_batch)]
        
    sim_matrix = get_sim_matrix(vectors1, vectors2, window)    
    sim_matrix_best = np.zeros_like(sim_matrix)
    sim_matrix_best[range(len(sim_matrix)), sim_matrix.argmax(1)] = sim_matrix[range(len(sim_matrix)), sim_matrix.argmax(1)]
    

    res1, res2, sims = get_pairs(sent1, sent2, sim_matrix_best, threshold)
    sim_sum = 0
    counter = 0
    for x, y, s in zip(res1, res2, sims):
        counter += 1
        sim_sum += s
        
    similarity = round(sim_sum/counter, 2)
    return similarity

In [173]:
paths = [
    "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/1_Continuidad_de_los_parques.pdf",
    "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/2_Continuité-des-Parcs.pdf",
    "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/3_the-continuity-of-parks.pdf",
    "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/4_Непрерывность_парков.pdf",
    "C:/Users/smile/PycharmProjects/VKR/necessary_docs/Непрерывность_парков/5_Непрерывность_парков.pdf"    
]

In [174]:
res_matrix = []
for p1 in paths:
    res_list = []
    for p2 in paths:
        res_list.append(calculate_sim(p1, p2))
    res_matrix.append(res_list) 

        

In [130]:
res_matrix

[[1.0, 0.79, 0.85, 0.78, 0.71],
 [0.71, 1.0, 0.69, 0.66, 0.59],
 [0.85, 0.76, 1.0, 0.77, 0.69],
 [0.78, 0.72, 0.77, 1.0, 0.77],
 [0.7, 0.65, 0.69, 0.78, 1.0]]

In [133]:
for i in range(len(res_matrix)):
    for j in range(len(res_matrix)):
        res_matrix[i][j] = res_matrix[j][i] = max(res_matrix[i][j], res_matrix[j][i])

In [134]:
res_matrix

[[1.0, 0.79, 0.85, 0.78, 0.71],
 [0.79, 1.0, 0.76, 0.72, 0.65],
 [0.85, 0.76, 1.0, 0.77, 0.69],
 [0.78, 0.72, 0.77, 1.0, 0.78],
 [0.71, 0.65, 0.69, 0.78, 1.0]]

# Корреляционный анализ

In [13]:
data = {'Lp': [1.7, 3.73, 2.39, 2.46, 5.43, 0.67, 0.76, 6.12, 6.19, 0.07],
       'Lpp': [0.35, 0.32, 0.63, 0.63, 0.03, 0.98, 0.98, 0.95, 0.95, 0],
       'Pp': [0.04, 0.02, 0.17, 0.16, 0.06, 0.13, 0.12, 0.19, 0.18, 0.01],
       'TTR': [0.04, 0.02, 0.23, 0.27, 0.02, 0.27, 0.23, 0.25, 0.21, 0.04],
       'R': [0.13, 0.47, 3.95, 3.1, 0.34, 3.82, 2.97, 3.48, 2.63, 0.85],
       'U': [3.57, 0.2, 69.2, 49.21, 3.37, 72.77, 52.78, 69.4, 49.41, 19.99],
       'K': [11.79, 55.59, 64.94, 60.47, 67.38, 53.15, 60.47, 120.53, 116.03, 4.47],
       'FK': [6.89, 6.66, 4.55, 3.86, 0.23, 2.34, 3.03, 2.11, 2.8, 0.69],
       'G': [6.77, 4.06, 3.22, 3.15, 2.71, 3.55, 3.62, 0.84, 0.91, 0.07],
       'F': [0.00291, 0.00508, 0.00086, 0.00059, 0.00217, 0.00214, 0.00232, 0.00422, 0.00449, 0.00027],
       'SIM': [0.79, 0.85, 0.78, 0.71, 0.76, 0.72, 0.65, 0.77, 0.69, 0.78]}

df = pd.DataFrame(data, columns = ['Lp', 'Lpp', 'Pp', 'TTR', 'R', 'U', 'K', 'FK', 'G', 'F', 'SIM'], 
                 index = ['1-2', '1-3', '1-4', '1-5', '2-3', '2-4', '2-5', '3-4', '3-5', '4-5'])

df

Unnamed: 0,Lp,Lpp,Pp,TTR,R,U,K,FK,G,F,SIM
1-2,1.7,0.35,0.04,0.04,0.13,3.57,11.79,6.89,6.77,0.00291,0.79
1-3,3.73,0.32,0.02,0.02,0.47,0.2,55.59,6.66,4.06,0.00508,0.85
1-4,2.39,0.63,0.17,0.23,3.95,69.2,64.94,4.55,3.22,0.00086,0.78
1-5,2.46,0.63,0.16,0.27,3.1,49.21,60.47,3.86,3.15,0.00059,0.71
2-3,5.43,0.03,0.06,0.02,0.34,3.37,67.38,0.23,2.71,0.00217,0.76
2-4,0.67,0.98,0.13,0.27,3.82,72.77,53.15,2.34,3.55,0.00214,0.72
2-5,0.76,0.98,0.12,0.23,2.97,52.78,60.47,3.03,3.62,0.00232,0.65
3-4,6.12,0.95,0.19,0.25,3.48,69.4,120.53,2.11,0.84,0.00422,0.77
3-5,6.19,0.95,0.18,0.21,2.63,49.41,116.03,2.8,0.91,0.00449,0.69
4-5,0.07,0.0,0.01,0.04,0.85,19.99,4.47,0.69,0.07,0.00027,0.78


In [14]:
list_idx = ['Lp', 'Lpp', 'Pp', 'TTR', 'R', 'U', 'K', 'FK', 'G', 'F', 'SIM']

In [234]:
df.columns

Index(['Lp', 'Lpp', 'Pp', 'TTR', 'R', 'U', 'K', 'FK', 'G', 'F', 'SIM'], dtype='object')

In [16]:
corr_df = pd.DataFrame()
for i in df.columns:
    list_corr = []
    for j in df.columns:
        list_corr.append(df[i].corr(df[j]))
#     print(i)
#     display(pd.DataFrame(list_corr, index = list_idx ))
    corr_df[i] = pd.DataFrame(list_corr, index = list_idx )
    
display(corr_df)

Unnamed: 0,Lp,Lpp,Pp,TTR,R,U,K,FK,G,F,SIM
Lp,1.0,0.11457,0.372758,-0.0046,-0.026132,-0.030518,0.808243,-0.119856,-0.314134,0.635022,0.104984
Lpp,0.11457,1.0,0.81608,0.878382,0.824821,0.827391,0.627353,0.060906,-0.054355,0.287979,-0.621243
Pp,0.372758,0.81608,1.0,0.905508,0.881746,0.868975,0.775619,-0.13465,-0.277395,0.050292,-0.546212
TTR,-0.0046,0.878382,0.905508,1.0,0.966775,0.951001,0.536449,-0.117982,-0.192045,-0.129891,-0.647046
R,-0.026132,0.824821,0.881746,0.966775,1.0,0.987074,0.522675,-0.155043,-0.258362,-0.168986,-0.528606
U,-0.030518,0.827391,0.868975,0.951001,0.987074,1.0,0.513216,-0.224386,-0.314349,-0.160372,-0.532731
K,0.808243,0.627353,0.775619,0.536449,0.522675,0.513216,1.0,-0.205493,-0.425754,0.535288,-0.3028
FK,-0.119856,0.060906,-0.13465,-0.117982,-0.155043,-0.224386,-0.205493,1.0,0.743585,0.332144,0.385538
G,-0.314134,-0.054355,-0.277395,-0.192045,-0.258362,-0.314349,-0.425754,0.743585,1.0,0.071376,0.169431
F,0.635022,0.287979,0.050292,-0.129891,-0.168986,-0.160372,0.535288,0.332144,0.071376,1.0,0.201004


In [17]:
! pip freeze

absl-py==1.4.0
altgraph==0.17.3
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
astunparse==1.6.3
attrs==22.2.0
auto-py-to-exe==2.27.0
backcall==0.2.0
beautifulsoup4==4.11.1
bert-for-tf2==0.14.9
bleach==5.0.1
blis==0.7.9
bottle==0.12.23
bottle-websocket==0.2.9
cachetools==5.3.0
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==2.1.1
click==8.1.3
colorama==0.4.6
comm==0.1.2
confection==0.0.4
contourpy==1.0.6
cycler==0.11.0
cymem==2.0.7
debugpy==1.6.5
decorator==5.1.1
defusedxml==0.7.1
Eel==0.14.0
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl
entrypoints==0.4
es-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.4.0/es_core_news_sm-3.4.0-py3-none-any.whl
et-xmlfile==1.1.0
executing==1.2.0
fastjsonschema==2.16.2
filelock==3.11.0
flatbuffers==23.3.3
fonttools==4.38.0
fqdn==1.5.1
fr-core-news-sm @ h