### Pulling in All the Data Across Notebooks (version 1: aggregated, all, relevant sentences.csv)

In [56]:
import pandas as pd
import os 

In [55]:
# Restore the per-sector sentence collections that were persisted with IPython's
# %store magic (presumably by the individual sector notebooks — confirm they have
# been run first, otherwise the names below will not exist in the store).
# Each sector contributes two variables: <SECTOR>_reltext and <SECTOR>_alltext.
%store -r HC_reltext
%store -r HC_alltext

%store -r IND_reltext
%store -r IND_alltext

%store -r Energy_reltext
%store -r Energy_alltext

%store -r CONSTA_reltext
%store -r CONSTA_alltext

%store -r CONDIS_reltext
%store -r CONDIS_alltext

%store -r IT_reltext
%store -r IT_alltext

%store -r Real_Estate_reltext
%store -r Real_Estate_alltext

%store -r Materials_reltext
%store -r Materials_alltext

%store -r Utilities_reltext
%store -r Utilities_alltext

In [57]:
# Join the per-sector "relevant" collections into one sequence.
# The sector order here (HC ... Utilities) is the order later cells rely on
# when they pair documents with company labels by position.
all_relevant = (
    HC_reltext
    + IND_reltext
    + Energy_reltext
    + CONSTA_reltext
    + CONDIS_reltext
    + IT_reltext
    + Real_Estate_reltext
    + Materials_reltext
    + Utilities_reltext
)

In [58]:
# Join the per-sector "all text" collections into one sequence, using the same
# sector order (HC ... Utilities) as the relevant-text concatenation.
all_text = (
    HC_alltext
    + IND_alltext
    + Energy_alltext
    + CONSTA_alltext
    + CONDIS_alltext
    + IT_alltext
    + Real_Estate_alltext
    + Materials_alltext
    + Utilities_alltext
)

In [96]:
# Put the two aggregated collections side by side (one column each) and
# export them together as a single CSV.
text_dict = {
    "relevant text": all_relevant,
    "all text": all_text,
}
text_df = pd.DataFrame(data=text_dict)
text_df.to_csv(path_or_buf="aggregated_sentences.csv", encoding='utf-8-sig')


In [98]:
# Export only the aggregated relevant-text collection.
relevant_sentences_df = pd.DataFrame(all_relevant)
relevant_sentences_df.to_csv(path_or_buf="relevant_sentences.csv", encoding='utf-8-sig')

In [99]:
# Export only the aggregated all-text collection.
all_sentences_df = pd.DataFrame(all_text)
all_sentences_df.to_csv(path_or_buf="all_sentences.csv", encoding='utf-8-sig')

### Statistics of Aggregated Data

In [45]:
# One entry of all_relevant is counted as one document.
document_count = len(all_relevant)
print("Total number of documents:", document_count)

Total number of documents: 72


In [61]:
# Company labels for each sector, Healthcare through Utilities.
# They are assumed to be listed in the same order as the documents inside the
# matching *_reltext / *_alltext collections, because a later cell pairs labels
# with documents purely by position.
# NOTE(review): some labels look misspelled (e.g. "Capterpillar",
# "ThermosFisherSc"). They are kept verbatim because they become the row labels
# written to sentences_stat.csv — confirm before renaming.

Healthcare = [
    "EliLilly", "UnitedHealthGroup", "Merck", "BristolMyersSquibb", "Danaher",
    "Johnson&Johnson", "Pfizer", "Abbott", "ThermosFisherSc", "Amgen",
]

IND = [
    "Capterpillar", "Lockheed", "Boeing", "UPS", "Raytheon",
    "Delta", "Deere", "Honeywell", "3M", "UnionPacific",
]

Energy = ["Total", "BP", "Shell"]

CONSTA = [
    "Mondelez_Intl", "Hershey", "Philip_Morris_Intl", "PepsiCo",
    "Altria_Environmental", "PandG", "Altria_TCFD", "Costco",
    "CocaCola", "Altria_2021", "Walmart", "EsteeLauder",
]

CONDIS = [
    "McDonalds", "TJX", "Home Depot", "Lowes", "Target",
    "BookingHoldings", "Tesla", "Amazon", "Nike", "Starbucks",
]

IT = [
    "Accenture", "Broadcom", "Mastercard", "Nvidia", "Oracle",
    "Cisco", "Microsoft", "Visa", "Adobe", "Apple",
]

RealEstate = ["AmericanTowerCorp", "Prologis", "CrownCastle"]

Materials = [
    "Linde", "Ecolab", "SherwinWilliams", "IntrntlFlavorsandFragrances",
    "FreeportMcmoran", "Nucor", "AirProducts", "Corteva", "NewmontMining", "Dow",
]

Utilities = ["Dominion_Energy", "Duke_Energy", "AEP", "NextEra"]

In [62]:
# Flatten the per-sector label lists using the same sector order in which
# all_relevant / all_text were concatenated, so that position i in all three
# sequences refers to the same report.
all_companies = Healthcare + IND + Energy + CONSTA + CONDIS + IT + RealEstate + Materials + Utilities

# Labels and documents are matched purely by position, so fail loudly if the
# hand-maintained label lists drift out of step with the restored data.
# Without this check a mismatch would either raise a bare IndexError or
# silently leave trailing documents uncounted.
if not (len(all_companies) == len(all_relevant) == len(all_text)):
    raise ValueError(
        "Company labels and documents are misaligned: "
        f"{len(all_companies)} labels, {len(all_relevant)} relevant-text documents, "
        f"{len(all_text)} all-text documents"
    )

# Labels are used as dict keys below; a repeated label would silently
# overwrite the counts of an earlier report.
if len(set(all_companies)) != len(all_companies):
    duplicate_labels = sorted({name for name in all_companies if all_companies.count(name) > 1})
    raise ValueError(f"Duplicate company labels: {duplicate_labels}")

# label -> number of entries in that report's relevant / full collection
report_rel_dict = {}
report_all_dict = {}

for all_comp, rel_doc, all_doc in zip(all_companies, all_relevant, all_text):
    report_rel_dict[all_comp] = len(rel_doc)
    report_all_dict[all_comp] = len(all_doc)

In [63]:
# Pair up the two per-company count mappings: relevant counts first,
# all-text counts second (the positions become column labels 0 and 1 later).
ds = [
    report_rel_dict,
    report_all_dict,
]

In [64]:
# Each dict in ds becomes one row, so transpose to get one row per company
# label, then replace the positional column labels with descriptive names.
column_names = {0: "relevant_text", 1: 'all_text'}
sentences_stat = pd.DataFrame(ds).T.rename(columns=column_names)

In [65]:
# Display the per-company counts (relevant_text and all_text columns).
sentences_stat

Unnamed: 0,relevant_text,all_text
EliLilly,12,94
UnitedHealthGroup,0,721
Merck,8,1874
BristolMyersSquibb,4,1051
Danaher,2,720
...,...,...
Dow,4,2196
Dominion_Energy,1,587
Duke_Energy,20,792
AEP,15,1501


In [66]:
# relevant_text count as a percentage of the all_text count, rounded to two decimals.
relevant_share = sentences_stat['relevant_text'] / sentences_stat['all_text']
sentences_stat['percentage'] = round(relevant_share * 100, 2)

In [67]:
# Turn the numeric percentage into display text with a trailing "%".
# Re-running this cell on its own appends another "%", so re-run the
# percentage computation cell first.
percentage_text = sentences_stat['percentage'].astype(str)
sentences_stat['percentage'] = percentage_text + "%"

In [68]:
# Display the table again, now including the formatted percentage column.
sentences_stat

Unnamed: 0,relevant_text,all_text,percentage
EliLilly,12,94,12.77%
UnitedHealthGroup,0,721,0.0%
Merck,8,1874,0.43%
BristolMyersSquibb,4,1051,0.38%
Danaher,2,720,0.28%
...,...,...,...
Dow,4,2196,0.18%
Dominion_Energy,1,587,0.17%
Duke_Energy,20,792,2.53%
AEP,15,1501,1.0%


In [69]:
# Persist the per-company statistics table; the company labels are written as the index column.
sentences_stat.to_csv(path_or_buf="sentences_stat.csv")

### Aggregating all the csv files into total_relevant_texts and total_all_texts

In [70]:
# Sector prefixes used in the per-sector CSV file names.
industry_types = [
    'CONSTA', 'CONDIS', 'Energy', 'IND', 'IT',
    'HC', 'Materials', 'Real_Estate', 'Utilities',
]

# Expected relevant-text file name for every sector: "<prefix>_reltext.csv".
file_lists = [prefix + "_reltext.csv" for prefix in industry_types]

In [71]:
# Show the generated relevant-text file names.
file_lists

['CONSTA_reltext.csv',
 'CONDIS_reltext.csv',
 'Energy_reltext.csv',
 'IND_reltext.csv',
 'IT_reltext.csv',
 'HC_reltext.csv',
 'Materials_reltext.csv',
 'Real_Estate_reltext.csv',
 'Utilities_reltext.csv']

In [72]:
# Directory that holds the per-sector CSV exports (files ending in
# "reltext.csv" / "alltext.csv"). The default is the original author's local
# path; set the SENTENCES_CSV_DIR environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get("SENTENCES_CSV_DIR", "/Users/tylerryoo/t3/sentences_csv")

# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of the concatenated output reproducible across runs and machines.
files = [os.path.join(path, file) for file in sorted(os.listdir(path))]

# Stack every "...reltext.csv" file into one frame with a fresh 0..n-1 index.
total_rel_text = pd.concat((pd.read_csv(f) for f in files if f.endswith('reltext.csv')), ignore_index=True).reset_index()

# Keep only the column labelled '0' (presumably the sentence text written by the
# per-sector notebooks — confirm against those exports).
total_rel_text = total_rel_text['0']
total_rel_text.to_csv("total_relevant_texts.csv", encoding = 'utf-8-sig')

In [73]:
# Read every "...alltext.csv" file found in `files`, stack them into one frame
# with a fresh index, keep only the column labelled '0', and export it.
alltext_frames = (pd.read_csv(csv_path) for csv_path in files if csv_path.endswith('alltext.csv'))
stacked_all_text = pd.concat(alltext_frames, ignore_index=True).reset_index()
total_all_text = stacked_all_text['0']
total_all_text.to_csv(path_or_buf="total_all_texts.csv", encoding='utf-8-sig')

### Save the extracted files to Aggregated_sentences_csv Folder 

- total_relevant_texts.csv: final version of relevant texts
- total_all_texts.csv: final version of all_texts
- aggregated_sentences.csv: both relevant and all sentences 
- all_sentences.csv: all sentences only
- relevant_sentences.csv: relevant sentences only
- sentences_stat.csv: extracted data statistics

