# Parse topic keyword spreadsheets

- https://docs.google.com/spreadsheets/d/1XsEU61oSTsPvj_dfqg2_70h5hYBVkKfCxWZjfiV9CZk/edit
- https://docs.google.com/spreadsheets/d/1AlYOSc6dBr5XWa6EAAyLTIN1UhFCI9RCntKJzy7S-js/edit
- https://docs.google.com/spreadsheets/d/1QoNNw7JZGCbtL24pwDfBk57R1QIT1aoSU-kGjZM5ppk/edit

In [1]:
import pandas as pd
from pymorphy2 import MorphAnalyzer

# Shared analyzer instance; process_xlsx calls morph.normal_forms() on every
# keyword and stores the first normal form it returns.
# NOTE(review): pymorphy2 presumably targets Russian morphology, while the
# workbooks loaded in this file are named *_en.xlsx — confirm that this
# normalization is meaningful for English keywords.
morph = MorphAnalyzer()

In [2]:
def process_xlsx(path: str, meta: str):
    """Collect topic keywords from every sheet of an Excel workbook.

    Each sheet is treated as one topic. Non-empty cells of the
    "Base level" and "Advanced level" columns are read as keywords. A
    leading "!" marks a keyword as high priority (weight 5 instead of 1)
    and is removed before the keyword is normalized.

    Problems are reported with ``print`` and handled at the smallest
    possible scope, so a single bad cell or a missing column never
    discards the rest of the sheet.

    Args:
        path: Path to the .xlsx workbook.
        meta: Label copied into the "meta" field of every record.

    Returns:
        A list of dicts with the keys "meta", "topic", "weight", "type"
        and "word", where "word" is the first normal form returned by
        ``morph.normal_forms``.
    """
    words = []
    # The context manager closes the workbook handle even if a sheet fails
    # to load.
    with pd.ExcelFile(path) as excel:
        for sheet_name in excel.sheet_names:
            sheet_df = pd.read_excel(excel, sheet_name=sheet_name)
            for col in ("Base level", "Advanced level"):
                if col not in sheet_df.columns:
                    # A missing level column must not drop the other one.
                    print(sheet_name, f"missing column {col!r}")
                    continue
                for word in sheet_df[col].dropna():
                    if not isinstance(word, str):
                        # Experts sometimes put a number in a keyword
                        # column; skip only that cell.
                        print(sheet_name, word, "skipped non-text cell")
                        continue
                    weight = 1
                    if word.startswith("!"):
                        weight = 5
                        word = word[1:]
                    try:
                        normal_form = morph.normal_forms(word)[0]
                    except Exception as e:
                        # Best effort: report and skip a keyword the
                        # analyzer cannot handle instead of losing the sheet.
                        print(sheet_name, word, e)
                        continue
                    words.append(
                        {
                            "meta": meta,
                            "topic": sheet_name,
                            "weight": weight,
                            "type": col,
                            "word": normal_form,
                        }
                    )
    return words

In [3]:
# Gather keyword records from the three workbooks, printing a one-letter
# progress marker before each file, then turn the records into a DataFrame.
df = []
for marker, workbook, tag in (
    ("E", "data/topics/Environmental_en.xlsx", "env"),
    ("S", "data/topics/Social_en.xlsx", "social"),
    ("G", "data/topics/Governance_en.xlsx", "gov"),
):
    print(marker)
    df.extend(process_xlsx(workbook, tag))
df = pd.DataFrame(df)
df.head()

E
S
G


Unnamed: 0,meta,topic,weight,type,word
0,env,Environmental topic in general,5,Base level,eco-friendly
1,env,Environmental topic in general,5,Base level,environment
2,env,Environmental topic in general,5,Base level,impact
3,env,Environmental topic in general,5,Base level,nature
4,env,Environmental topic in general,5,Base level,nature protection


In [4]:
# Keep only rows whose word is non-empty after trimming whitespace.
df = df.loc[df["word"].str.strip().ne("")]

In [5]:
# Rewrite every word containing "24", any single character, then "7" as the
# canonical "24/7". The pattern is a regular expression (the str.contains
# default, made explicit here), so "." is a wildcard, not a literal dot.
df.loc[
    df["word"].str.contains("24.7", regex=True),
    "word",
] = "24/7"

In [6]:
# Write the table twice without the index column: once under the ".csv.zip"
# name and once as a plain ".csv".
for out_name in ("topic_words_eng.csv.zip", "topic_words_eng.csv"):
    df.to_csv(out_name, index=False)