In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
from pdb import set_trace

import warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def group_and_get_unique(words_lst):
    full_lst = ', '.join(words_lst)
    all_words = full_lst.split(', ')
    return ' '.join(set(all_words))

In [3]:
e_df = pd.read_csv('../data/esg_words_e.csv')
e_df = e_df.groupby('topic_name').agg({'words': group_and_get_unique})
e_df.head()

Unnamed: 0_level_0,words
topic_name,Unnamed: 1_level_1
E,объективность фабрикация теплоэнергетика биогр...
Биоразнообразие,разграничить ограничение площадь геофизическии...
Вода,речнои единство муниципалитет океан водопровод...
Газ,бензин добыча топливныи газоснабжение таманьне...
Климат,казахстан разграничить рисковои карелэнерго ха...


In [5]:
s_df = pd.read_csv('../data/esg_words_s.csv')
s_df = s_df.groupby('topic_name').agg({'words': group_and_get_unique})
s_df.head()

Unnamed: 0_level_0,words
topic_name,Unnamed: 1_level_1
Безопасность и охрана здоровья,рисковои инженерия эколог страховщик импортиро...
Безопасность продукта,гибкость рисковои кредитныи характеризовать фу...
Благотворительность,лига олимпиискии реконструкция монастырь образ...
Инвестиции и капитальные вложения,связанныи реконструкция повторяться критически...
Налоги,туркменистан таманьнефтегаз норматив налогообл...


In [6]:
g_df = pd.read_csv('../data/esg_words_g.csv')
g_df = g_df.groupby('topic_name').agg({'words': group_and_get_unique})
g_df.head()

Unnamed: 0_level_0,words
topic_name,Unnamed: 1_level_1
Антикоррупция,межрегиональныи разграничить управленческии по...
Дивиденды и акционеры,мосбиржа столица гибкость назначать управленче...
Инновации,инновационность контент привилегия лицензирова...
Лидерство,повторяться критическии неядерныи сорт дикии п...
Отчетность и прозрачность,рисковои повторяться проверка стена журналист ...


In [7]:
reports_df = pd.read_csv('../data/report_pages_ds.csv', index_col=0)
reports_df = reports_df.sort_values(by=['report_num', 'page_num']).reset_index(drop=True)
reports_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/report_pages_ds.csv'

In [14]:
len(reports_df.report_num.unique())

1187

In [15]:
def join(series):
    return ' '.join(series)

In [16]:
reports_df = reports_df.drop(['page_num'], axis=1).groupby('report_num').agg({'text': join}).reset_index()
reports_df.head()

Unnamed: 0,report_num,text
0,1,информация утверждение утвердить годовой собра...
1,2,энергия регион о настоящий годовой пао россеть...
2,3,зелёный свет зелёный энергетика интегрировать ...
3,4,содержание годовой о содержание настоящий инте...
4,5,надёжность доступность эффективность утвердить...


In [17]:
pages_words = reports_df['text'].tolist()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(pages_words)

<1187x8978 sparse matrix of type '<class 'numpy.float64'>'
	with 3007796 stored elements in Compressed Sparse Row format>

In [18]:
def compute_cosine_similarity(text1, text2, vectorizer=vectorizer):
    # converts text into vectors with the TF-IDF 
    tfidf_text1 = vectorizer.transform([text1])
    tfidf_text2 = vectorizer.transform([text2])
    
    # computes the cosine similarity
    cs_score = cosine_similarity(tfidf_text1, tfidf_text2)
    
    return np.round(cs_score[0][0], 2)

In [25]:
e_rating = defaultdict(int)
for _, report in reports_df.iterrows():
    for topic_name, secondary_topic in e_df.iterrows():
        res = compute_cosine_similarity(report.text, secondary_topic.words)
        e_rating[(topic_name, report.report_num)] += res

In [26]:
s_rating = defaultdict(int)
for _, report in reports_df.iterrows():
    for topic_name, secondary_topic in s_df.iterrows():
        res = compute_cosine_similarity(report.text, secondary_topic.words)
        s_rating[(topic_name, report.report_num)] += res

In [27]:
g_rating = defaultdict(int)
for _, report in reports_df.iterrows():
    for topic_name, secondary_topic in g_df.iterrows():
        res = compute_cosine_similarity(report.text, secondary_topic.words)
        g_rating[(topic_name, report.report_num)] += res

In [28]:
reports_info = pd.read_csv('rspp_reports.csv', index_col=0)
reports_info = reports_info[reports_info.index.isin(reports_df.report_num)]
reports_info.head()

Unnamed: 0,компания,сектор,год,тип отчета,ссылка на отчет
1,"ОАО ""МРСК Урала""",Энергетика,2021,ИО,/download/af503e07dd6b861d1ed3048c36868cc9/
2,"ПАО ""Россети Сибирь""",Энергетика,2021,ИО,/download/57becde4be827f45bedf2a46f58d793a/
3,"ПАО ""Россети Юг""",Энергетика,2021,ИО,/download/851fbaea09387885cefa38bd3b6838f3/
4,ПАО «Россети Ленэнерго»,Энергетика,2021,ИО,/download/2034b8c8e84e4bba78049bdd976fc122/
5,ПАО «Россети Кубань»,Энергетика,2021,ИО,/download/9462e171afc761c3f098fce4f993e2a8/


In [29]:
def get_e_topic_value(report_id, topic_name):
    return e_rating[(topic_name, int(report_id))]

for topic_name, _ in e_df.iterrows():
    reports_info[f'E_{topic_name}'] = reports_info.apply(lambda x: get_e_topic_value(x.name, topic_name), axis=1)
    
reports_info.rename(columns={'E_E': 'E_Общие', 'E_Биоразнообразие ': 'E_Биоразнообразие'}, inplace=True)
reports_info.head()

Unnamed: 0,компания,сектор,год,тип отчета,ссылка на отчет,E_Общие,E_Биоразнообразие,E_Вода,E_Газ,E_Климат,E_Отходы,E_Экологический менеджмент,E_Энергия
1,"ОАО ""МРСК Урала""",Энергетика,2021,ИО,/download/af503e07dd6b861d1ed3048c36868cc9/,0.02,0.01,0.0,0.0,0.02,0.0,0.02,0.03
2,"ПАО ""Россети Сибирь""",Энергетика,2021,ИО,/download/57becde4be827f45bedf2a46f58d793a/,0.01,0.01,0.0,0.0,0.02,0.0,0.01,0.04
3,"ПАО ""Россети Юг""",Энергетика,2021,ИО,/download/851fbaea09387885cefa38bd3b6838f3/,0.01,0.01,0.0,0.0,0.01,0.0,0.01,0.03
4,ПАО «Россети Ленэнерго»,Энергетика,2021,ИО,/download/2034b8c8e84e4bba78049bdd976fc122/,0.02,0.01,0.0,0.0,0.02,0.0,0.02,0.08
5,ПАО «Россети Кубань»,Энергетика,2021,ИО,/download/9462e171afc761c3f098fce4f993e2a8/,0.02,0.01,0.0,0.0,0.02,0.01,0.01,0.04


In [30]:
def get_s_topic_value(report_id, topic_name):
    return s_rating[(topic_name, int(report_id))]

for topic_name, _ in s_df.iterrows():
    reports_info[f'S_{topic_name}'] = reports_info.apply(lambda x: get_s_topic_value(x.name, topic_name), axis=1)
reports_info.rename(columns={}, inplace=True)
reports_info.head()

Unnamed: 0,компания,сектор,год,тип отчета,ссылка на отчет,E_Общие,E_Биоразнообразие,E_Вода,E_Газ,E_Климат,...,S_Благотворительность,S_Инвестиции и капитальные вложения,S_Налоги,S_Обучение и развитие,S_Оплата труда,S_Отношения с потребителями,S_Отношения с работниками,S_Охрана здоровья,S_Профсоюзы и коллективные договоры,S_Трудовые отношения
1,"ОАО ""МРСК Урала""",Энергетика,2021,ИО,/download/af503e07dd6b861d1ed3048c36868cc9/,0.02,0.01,0.0,0.0,0.02,...,0.02,0.05,0.01,0.01,0.03,0.03,0.03,0.0,0.02,0.05
2,"ПАО ""Россети Сибирь""",Энергетика,2021,ИО,/download/57becde4be827f45bedf2a46f58d793a/,0.01,0.01,0.0,0.0,0.02,...,0.02,0.03,0.01,0.01,0.02,0.02,0.02,0.0,0.02,0.03
3,"ПАО ""Россети Юг""",Энергетика,2021,ИО,/download/851fbaea09387885cefa38bd3b6838f3/,0.01,0.01,0.0,0.0,0.01,...,0.02,0.03,0.01,0.01,0.01,0.02,0.01,0.0,0.01,0.03
4,ПАО «Россети Ленэнерго»,Энергетика,2021,ИО,/download/2034b8c8e84e4bba78049bdd976fc122/,0.02,0.01,0.0,0.0,0.02,...,0.02,0.04,0.01,0.01,0.02,0.02,0.02,0.0,0.02,0.04
5,ПАО «Россети Кубань»,Энергетика,2021,ИО,/download/9462e171afc761c3f098fce4f993e2a8/,0.02,0.01,0.0,0.0,0.02,...,0.06,0.05,0.02,0.01,0.02,0.03,0.02,0.0,0.02,0.04


In [31]:
def get_g_topic_value(report_id, topic_name):
    return g_rating[(topic_name, int(report_id))]

for topic_name, _ in g_df.iterrows():
    reports_info[f'G_{topic_name}'] = reports_info.apply(lambda x: get_g_topic_value(x.name, topic_name), axis=1)
    
reports_info.head()

Unnamed: 0,компания,сектор,год,тип отчета,ссылка на отчет,E_Общие,E_Биоразнообразие,E_Вода,E_Газ,E_Климат,...,S_Охрана здоровья,S_Профсоюзы и коллективные договоры,S_Трудовые отношения,G_Антикоррупция,G_Дивиденды и акционеры,G_Инновации,G_Лидерство,G_Отчетность и прозрачность,G_Управление рисками,G_Эффективность и производительность
1,"ОАО ""МРСК Урала""",Энергетика,2021,ИО,/download/af503e07dd6b861d1ed3048c36868cc9/,0.02,0.01,0.0,0.0,0.02,...,0.0,0.02,0.05,0.02,0.06,0.01,0.08,0.05,0.03,0.05
2,"ПАО ""Россети Сибирь""",Энергетика,2021,ИО,/download/57becde4be827f45bedf2a46f58d793a/,0.01,0.01,0.0,0.0,0.02,...,0.0,0.02,0.03,0.01,0.05,0.01,0.07,0.04,0.02,0.03
3,"ПАО ""Россети Юг""",Энергетика,2021,ИО,/download/851fbaea09387885cefa38bd3b6838f3/,0.01,0.01,0.0,0.0,0.01,...,0.0,0.01,0.03,0.01,0.05,0.01,0.06,0.03,0.02,0.03
4,ПАО «Россети Ленэнерго»,Энергетика,2021,ИО,/download/2034b8c8e84e4bba78049bdd976fc122/,0.02,0.01,0.0,0.0,0.02,...,0.0,0.02,0.04,0.02,0.05,0.01,0.06,0.04,0.02,0.04
5,ПАО «Россети Кубань»,Энергетика,2021,ИО,/download/9462e171afc761c3f098fce4f993e2a8/,0.02,0.01,0.0,0.0,0.02,...,0.0,0.02,0.04,0.02,0.06,0.01,0.07,0.05,0.03,0.05


In [32]:
reports_info.to_csv('reports_info_esg2.csv')