Download the spreadsheets from Google Drive with the Google Sheets API, following the official [Python quickstart tutorial](https://developers.google.com/sheets/api/quickstart/python).

# Download spreadsheets

In [1]:
from os import path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [2]:
# Local JSON file the authorization cell below reads its OAuth data from.
TOKEN_FILE = "google_token.json"

# Only read-only access to spreadsheets is requested. After changing this list,
# remove any previously saved credentials so that consent is requested again.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]

# Spreadsheet to read and the A1-notation range inside it.
SAMPLE_SPREADSHEET_ID = "1xXZAZ4lm1Jg4458QOUxxYt7ZBYtegm4ECiYEkvTJBIE"
SAMPLE_RANGE_NAME = "Class Data!A2:E"

In [3]:
# TOKEN_FILE is what gets passed to `InstalledAppFlow.from_client_secrets_file`, i.e.
# it holds the OAuth *client secrets*. The authorized-user credentials produced by the
# consent flow are cached in a separate file: writing them back into TOKEN_FILE would
# overwrite the client secrets and break the next login.
AUTHORIZED_USER_FILE = "google_authorized_user.json"

creds = None

# Reuse the credentials saved by a previous run, if there are any.
if path.exists(AUTHORIZED_USER_FILE):
    creds = Credentials.from_authorized_user_file(AUTHORIZED_USER_FILE, SCOPES)

# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(TOKEN_FILE, SCOPES)
        creds = flow.run_local_server()
    # Save the credentials for the next run
    with open(AUTHORIZED_USER_FILE, "w") as token:
        token.write(creds.to_json())

try:
    service = build("sheets", "v4", credentials=creds)

    # Call the Sheets API. `range` is a required parameter of `values().get`.
    sheet = service.spreadsheets()
    result = (
        sheet.values()
        .get(spreadsheetId=SAMPLE_SPREADSHEET_ID, range=SAMPLE_RANGE_NAME)
        .execute()
    )
    values = result.get("values", [])

    # `return` is not allowed at the top level of a cell, so branch instead.
    if not values:
        print("No data found.")
    else:
        print("Name, Major:")
        for row in values:
            if not row:
                continue  # completely empty spreadsheet row
            # Print columns A and E, which correspond to indices 0 and 4.
            # Trailing empty cells are omitted by the API, so a row may be shorter
            # than five items; fall back to an empty string in that case.
            major = row[4] if len(row) > 4 else ""
            print("%s, %s" % (row[0], major))
except HttpError as err:
    print(err)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=743681804304-aio5hltal3jh1tq5f2edd6eo1ev7d8le.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fspreadsheets.readonly&state=Ek6upuelgCmWnAvIoxxb2ll0LkyqJx&access_type=offline


KeyboardInterrupt: 

# Parse

In [1]:
import pandas as pd
from pymorphy2 import MorphAnalyzer

# Shared analyzer instance; process_xlsx below calls morph.normal_forms() on it.
morph = MorphAnalyzer()

In [5]:
def process_xlsx(path: str, meta: str, priority_weight: int = 5, max_columns: int = 2) -> list:
    """Collect normalized topic words from every sheet of an Excel workbook.

    Every sheet is treated as one topic (the sheet name becomes ``"topic"``). The
    first row of each sheet is skipped and only the first ``max_columns`` columns
    are read. A cell whose text starts with ``"!"`` is a priority word: the marker
    is removed and the word gets ``priority_weight`` instead of the default 1.

    Cells that are not text (e.g. stray numbers) are reported and skipped, because
    the analyzer only accepts strings. Cells that are blank after trimming are
    skipped as well.

    Args:
        path: Path of the ``.xlsx`` workbook.
        meta: Value stored unchanged in the ``"meta"`` field of every record.
        priority_weight: Weight assigned to words marked with a leading ``"!"``.
        max_columns: How many leading columns of each sheet are scanned.

    Returns:
        A list of dicts with the keys ``meta``, ``topic``, ``weight``, ``type``
        (the column label the word came from) and ``word`` (first normal form
        returned by ``morph.normal_forms``).
    """
    words = []
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(path) as excel:
        for sheet_name in excel.sheet_names:
            sheet_df = pd.read_excel(excel, sheet_name=sheet_name, skiprows=1, header=None)
            print(sheet_name, sheet_df.shape)
            # sometimes experts can put number in random column, so only the
            # leading columns are trusted: 0 - base, 1 - advanced
            for col in sheet_df.columns[:max_columns]:
                for cell in sheet_df[col].dropna():
                    if not isinstance(cell, str):
                        print(sheet_name, repr(cell), "skipped: cell is not text")
                        continue
                    # Trim first so that " !word" is still recognized as priority
                    # and the analyzer does not receive surrounding whitespace.
                    word = cell.strip()
                    weight = 1
                    if word.startswith("!"):
                        weight = priority_weight
                        word = word[1:].strip()
                    if not word:
                        continue
                    words.append(
                        {
                            "meta": meta,
                            "topic": sheet_name,
                            "weight": weight,
                            "type": col,
                            "word": morph.normal_forms(word)[0],
                        }
                    )
    return words

In [6]:
# Workbooks to parse: (progress label, file, value stored in the "meta" column).
topic_sources = [
    ("E", "data/topics/Environmental.xlsx", "env"),
    ("S", "data/topics/Social.xlsx", "social"),
    ("G", "data/topics/Governance.xlsx", "gov"),
]

topic_rows = []
for label, xlsx_file, meta_tag in topic_sources:
    print(label)
    topic_rows.extend(process_xlsx(xlsx_file, meta_tag))

# One row per collected word.
df = pd.DataFrame(topic_rows)
df.head()

E
Экология в целом (19, 2)
Климат (160, 2)
Энергия (170, 2)
Воздух (93, 2)


  warn(msg)


Вода (106, 2)
Отходы и циклическая экономика (92, 2)
Биоразнообразие (118, 2)
Рекультивация земель (77, 2)
Экологичность продукта (66, 2)
S
!Персонал в целом (12, 3)
Обучение и развитие (146, 6)
Сотрудники. Вовлеченность и мот (28, 4)
Оплата труда (91, 5)
Сотрудники. Здоровье и благопол (51, 3)
Сотрудники. Наем и увольнение (37, 5)
Сотрудники. Корпоративная культ (41, 4)
Сотрудники. Безопасность и охра (127, 4)
Сотрудники. Профсоюз и Коллекти (64, 4)
Потребители. Доступность (65, 1)
Потребители. Сервис и коммуника (83, 3)
Потребители. Персональные данны (56, 4)
Потребители. Здоровье и благопо (39, 2)
Потребители. Маркетинг, продажи (104, 4)
Потребители. Удовлетворенность (73, 4)
Потребители. Качество и безопас (124, 4)
Потребители. Ценовая политика (79, 4)
Поставщики в целом (30, 1)
Малый и локальный бизнес (30, 4)
Поставщики. Работники (22, 4)
Поставщики. Экология (14, 4)
Закупки и антикоррупция (80, 4)
Заинтересованные стороны (29, 2)
Коренные народы и местные сообщ (129, 2)
Сотрудни

Unnamed: 0,meta,topic,weight,type,word
0,env,Экология в целом,5,0,экологичный
1,env,Экология в целом,5,0,природоохранный
2,env,Экология в целом,1,0,экология
3,env,Экология в целом,5,0,воздействие
4,env,Экология в целом,5,0,окружающая среда


In [7]:
# Keep only the rows whose word is not empty after trimming whitespace.
has_text = df["word"].str.strip() != ""
df = df[has_text]

In [8]:
# Write the table twice: as a zip archive (compression is inferred from the
# ".zip" suffix) and as a plain CSV file, both without the index column.
for out_file in ("topic_words.csv.zip", "topic_words.csv"):
    df.to_csv(out_file, index=False)