# In [None]:  (cell marker left over from the notebook export)
# v 1.5.1

import os
from contextlib import closing
from datetime import datetime, timedelta

import boto3

import psycopg2
import pandas as pd
import glob

# Reporting day as 'YYYY-MM-DD', taken from the local clock shifted back by
# three hours.
date = (datetime.now() - timedelta(hours=3)).date().isoformat()

# Root folder with the audio sets and its sub-folder for the reporting day.
MTS_DIR = 'MTS audio sets'
MTS_PATH = os.path.join(MTS_DIR, date)

# Connection string handed to psycopg2.connect(). Replace every YOUR_*
# placeholder with a real value before running.
# The login was previously written under a second `port` key, which left the
# string without any `user` entry.
DB_data = '''
    host = YOUR_HOST
    port = YOUR_PORT
    sslmode=require
    dbname = YOUR_DBNAME
    user = YOUR_USER
    password = YOUR_PASSWORD
    target_session_attrs=read-write
'''

# S3 client pointed at the storage.yandexcloud.net endpoint. The two
# credential names are placeholders and must be replaced before running.
# NOTE(review): the second placeholder is spelled ..._ACCESS_ACCESS_KEY;
# presumably the secret access key is meant - confirm when filling it in.
s3_client = boto3.client(
    's3',
    endpoint_url='https://storage.yandexcloud.net',
    aws_access_key_id=YOUR_AWS_ACCESS_KEY_ID,
    aws_secret_access_key=YOUR_AWS_ACCESS_ACCESS_KEY,
)

# Read the identifying columns of every row in public.sets into a DataFrame.
sets_columns = ['assignment_id', 'worker_id', 'gender', 'age']

with closing(psycopg2.connect(DB_data)) as conn, closing(conn.cursor()) as cursor:
    cursor.execute('''SELECT assignment_id, worker_id,
                                              gender, age
                                              FROM public.sets''')
    fetched_rows = cursor.fetchall()

all_sets_in_db_df = pd.DataFrame(fetched_rows, columns=sets_columns)


def error_writer(request: str) -> None:
    """Append one error record to ``errors.tsv`` in the working directory.

    Args:
        request: Text of the record; callers pass tab-separated fields.

    A line terminator is added when ``request`` does not already end with
    one, so consecutive records land on separate lines instead of being
    glued together.
    """
    record = request if request.endswith('\n') else request + '\n'
    with open('errors.tsv', 'a', encoding='utf-8') as file:
        file.write(record)


def upload_file_to_s3(path: str) -> str:
    """Upload a local file to the ``sbs-toloka`` bucket and return its URL.

    Args:
        path: Local path of the file. It doubles as the object key once
            backslashes are replaced with forward slashes.

    Returns:
        The ``https://storage.yandexcloud.net/sbs-toloka/<key>`` URL of the
        uploaded object.
    """
    key = path.replace('\\', '/')
    # The message used to start with the digit "3" instead of the letter "З".
    print('Загружаем файл...')
    # Pass the open file itself as the body instead of read()-ing it first,
    # so the whole file is not loaded into one bytes object before upload.
    with open(path, 'rb') as file:
        s3_client.put_object(
            Body=file,
            Bucket='sbs-toloka',
            Key=key,
        )

    url = f'https://storage.yandexcloud.net/sbs-toloka/{key}'
    print(url)
    return url


def db_update(assignment_id: str) -> None:
    """Mark one set as sent in ``public.sets``.

    Sets ``status`` to ``'INWORK'`` and ``send_date`` to the module-level
    ``date`` for the row whose ``assignment_id`` matches.

    Args:
        assignment_id: Identifier of the set to update.

    The call is best-effort: any failure is printed and appended to the
    error file through ``error_writer`` instead of being raised.
    """
    try:
        with closing(psycopg2.connect(DB_data)) as conn, closing(conn.cursor()) as cursor:
            # Values are passed as query parameters so that quotes or other
            # special characters in an id cannot alter the statement.
            cursor.execute(
                "UPDATE public.sets SET status = 'INWORK', send_date = %s "
                "WHERE assignment_id = %s",
                (date, assignment_id),
            )
            conn.commit()
            print('Обновили данные в базе')

    except Exception as e:
        print('Не удалось сохранить в базу данных')
        # Collapse the exception text to a single line so that one failure
        # is exactly one row of the TSV file.
        reason = ' '.join(str(e).split())
        error_writer(f'{assignment_id}\t{reason}\n')

# Load the reporting day's spreadsheets when they already exist on disk;
# otherwise start from empty frames.
sets_data_file = f'{date}_sets_data.xlsx'
if os.path.exists(sets_data_file):
    today_df = pd.read_excel(sets_data_file, sheet_name='Sheet1')
else:
    today_df = pd.DataFrame()

s3_links_file = f'{date}_sets_s3_links.xlsx'
if os.path.exists(s3_links_file):
    today_s3_links_df = pd.read_excel(s3_links_file, sheet_name='Sheet1')
else:
    today_s3_links_df = pd.DataFrame()

# Split the entries of today's folder into complete sets (norm_sets) and
# sets whose file counts do not match the expected layout (problems_sets).
problems_sets = list()
norm_sets = list()

# A complete set holds exactly this many entries per file pattern.
expected_counts = {'*.mp4': 1, '*.wav': 9, '*.txt': 6, '*.csv': 1}

for folder in os.listdir(MTS_PATH):
    mts_set = os.path.join(MTS_PATH, folder)
    # glob.glob() replaces the undocumented glob.glob1(), which is deprecated
    # since Python 3.13. escape() keeps characters such as "[" in the folder
    # path literal, matching how glob1() treated its directory argument.
    escaped_set_dir = glob.escape(mts_set)
    is_complete = all(
        len(glob.glob(os.path.join(escaped_set_dir, pattern))) == expected
        for pattern, expected in expected_counts.items()
    )
    if is_complete:
        norm_sets.append(folder)
    else:
        problems_sets.append(folder)

# exit()

# Upload every file whose containing folder is one of the complete sets.
# NOTE(review): the walk starts at MTS_DIR, not MTS_PATH, so a folder with the
# same name under another date would match as well - confirm this is intended.
for root, subdirectories, files in os.walk(MTS_DIR):
    # Name of the folder that directly contains `files`. Splitting the path on
    # '\\' only worked on Windows and raised IndexError on other systems.
    containing_folder = os.path.basename(root)
    if files:
        for file in files:
            path = os.path.join(root, file)
            if containing_folder in norm_sets:
                # print(path)
                s3_link = upload_file_to_s3(path)
            # if '.csv' in file:
            #     assignment_id = file.replace('.csv', '')
            #     print('Айди сета: ', assignment_id)
            #     assignment_df = all_sets_in_db_df.loc[all_sets_in_db_df['assignment_id']==assignment_id]
            #     age = assignment_df.loc[assignment_df['assignment_id']==assignment_id, 'age'].values[0]
            #     if 4 <= int(age) <= 6:
            #         new_age = '4-6'
            #     elif 7 <= int(age) <= 9:
            #         new_age = '7-9'
            #     elif 10 <= int(age) <= 12:
            #         new_age = '10-12'
            #     assignment_df.loc[assignment_df['assignment_id']==assignment_id, 'age'] = new_age
            #     today_df = pd.concat([today_df, assignment_df])
            # s3_links_df = pd.DataFrame(data={'assignment_id':[assignment_id], 's3_link':[s3_link]})
            # today_s3_links_df = pd.concat([today_s3_links_df, s3_links_df])
        # print('-' * 50)

# Mark every complete set as sent in the database.
for assignment_id in norm_sets:
    db_update(assignment_id)

# Write the day's complete sets to a spreadsheet, one row per assignment id.
sets = pd.DataFrame({'date': date, 'assignment_id': norm_sets})
unique_sets = sets.drop_duplicates(subset='assignment_id')
unique_sets.to_excel(f'{date}_sets_data.xlsx', index=False)
# today_s3_links_df.drop_duplicates(subset='s3_link').to_excel(f'{date}_sets_s3_links.xlsx', index=False)

# Final report: either everything went out, or list the sets that were
# skipped because their file counts did not match.
if not problems_sets:
    print('Все сеты выгружены :)')
else:
    skipped_names = ", ".join(str(name) for name in problems_sets)
    print(f'В сет(-е/-ах) {skipped_names} не хватает файлов, остальные выгружены.')