In [None]:
import requests
import pandas as pd
from io import StringIO
import datetime
import json
from urllib.parse import urlencode
import time

# Получение данных через `Logs API`
## Logs API

`Logs API` позволяет выгрузить сырые данные со счетчика.

Документация по `Logs API` - https://yandex.ru/dev/metrika/doc/api2/logs/intro.html

Данные для этого кейса также доступны на Яндекс.Диске - https://disk.yandex.ru/d/sUmQmh_MnQWL4g?w=1

### Шаг 1: получаем токен
Для работы с API необходимо получить свой токен - https://yandex.ru/dev/oauth/doc/dg/tasks/get-oauth-token.html

Создаем приложение тут (указываем права для чтения в Яндекс.Метрике) - https://oauth.yandex.ru/client/new

Переходим по ссылке вида - `https://oauth.yandex.ru/authorize?response_type=token&client_id=<идентификатор приложения>`

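Полученный токен можно сохранить в домашнюю директорию в файл `.yatoken.txt` (ячейка ниже читает его по относительному пути `../.yatoken.txt`, то есть из директории на уровень выше ноутбука).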

In [None]:
# Read the OAuth token from the file one directory above the notebook.
# The context manager closes the file handle as soon as the token is read.
with open('../.yatoken.txt') as token_file:
    TOKEN = token_file.read().strip()

### Шаг 2: проверяем, можно ли создать запрос в Logs API

In [None]:
# Parameters of the export: API host, counter id, date range,
# data source and the tuple of fields to request.
API_HOST = 'https://api-metrika.yandex.ru'
COUNTER_ID = 73226638
START_DATE, END_DATE = '2020-07-01', '2020-09-30'
SOURCE = 'hits'
API_FIELDS = (
    'ym:pv:date',
    'ym:pv:dateTime',
    'ym:pv:URL',
    'ym:pv:deviceCategory',
    'ym:pv:operatingSystemRoot',
    'ym:pv:clientID',
    'ym:pv:browser',
    'ym:pv:lastTrafficSource',
)


In [None]:
# HTTP headers passed to every request below: OAuth token and content type.
header_dict = dict([
    ('Authorization', 'OAuth {}'.format(TOKEN)),
    ('Content-Type', 'application/x-yametrika+json'),
])

In [None]:
# Build the query string and call the `logrequests/evaluate` endpoint
# with the configured date range, source and fields.
url_params = urlencode({
    'date1': START_DATE,
    'date2': END_DATE,
    'source': SOURCE,
    'fields': ','.join(API_FIELDS),
})

url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests/evaluate?{url_params}'

r = requests.get(url, headers=header_dict)

In [None]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

In [None]:
# Display the 'log_request_evaluation' object from the JSON response body.
json.loads(r.text)['log_request_evaluation']

### Шаг 3: создаем запрос

In [None]:
# POST to the `logrequests` endpoint to create the log request.
# Field names are joined in case-insensitive alphabetical order.
url_params = urlencode({
    'date1': START_DATE,
    'date2': END_DATE,
    'source': SOURCE,
    'fields': ','.join(sorted(API_FIELDS, key=str.lower)),
})
url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests?{url_params}'

r = requests.post(url, headers=header_dict)

In [None]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

In [None]:
# Display the 'log_request' object from the JSON response body.
json.loads(r.text)['log_request']

In [None]:
# Keep the request id from the response; the polling and download URLs use it.
request_id = json.loads(r.text)['log_request']['request_id']

In [None]:
# Display the stored request id.
request_id

### Шаг 4: ждем окончания обработки

In [None]:
# Poll the log request once a minute until it leaves the 'created' state.
# The status URL does not change between iterations, so it is built once.
url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                host=API_HOST)

status = 'created'
while status == 'created':
    time.sleep(60)
    print('trying')

    r = requests.get(url, headers=header_dict)
    if r.status_code != 200:
        # RuntimeError (an Exception subclass) rather than a bare
        # BaseException, so ordinary `except Exception` handlers see it.
        raise RuntimeError(r.text)

    log_request = json.loads(r.text)['log_request']
    status = log_request['status']
    print(json.dumps(log_request, indent=4))

# NOTE(review): per the Logs API documentation 'processed' is the status of a
# request that is ready for download - confirm. Any other final status is
# reported here instead of failing later when the parts are read.
if status != 'processed':
    raise RuntimeError(
        'Log request {} finished with status {!r}'.format(request_id, status)
    )

In [None]:
# Display the 'log_request' object from the most recent response in `r`.
json.loads(r.text)['log_request']

In [None]:
# Keep the 'parts' list of the log request; the download loop reads
# 'part_number' from each entry. The trailing expression displays the list.
parts = json.loads(r.text)['log_request']['parts']
parts

### Шаг 5: выгружаем данные

In [None]:
# Download every part of the log request and combine them into one DataFrame.
tmp_dfs = []
for part in parts:
    part_num = part['part_number']
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
            .format(
                host=API_HOST,
                counter_id=COUNTER_ID,
                request_id=request_id,
                part=part_num
            )

    r = requests.get(url, headers=header_dict)
    if r.status_code != 200:
        # Raise a defined exception type so the API error text is reported
        # (an undefined name here would surface as a NameError instead).
        raise RuntimeError(r.text)

    # The response body is parsed as tab-separated text.
    tmp_df = pd.read_csv(StringIO(r.text), sep='\t')
    tmp_dfs.append(tmp_df)

hits_df = pd.concat(tmp_dfs)

In [None]:
# Display the (rows, columns) shape of the combined hits table.
hits_df.shape

In [None]:
# Save the hits table as a tab-separated file without the index column.
hits_df.to_csv('metrika_cloud_case_data_hits.csv', sep = '\t', index = False)

### Шаг 6: то же самое, но для визитов

In [None]:
# Switch the export to the 'visits' source and the ym:s:* field list.
SOURCE = 'visits'
API_FIELDS = (
    'ym:s:date',
    'ym:s:dateTime',
    'ym:s:startURL',
    'ym:s:deviceCategory',
    'ym:s:operatingSystemRoot',
    'ym:s:clientID',
    'ym:s:browser',
    'ym:s:lastTrafficSource',
    'ym:s:purchaseRevenue',
    'ym:s:purchaseID',
)


In [None]:
# POST to the `logrequests` endpoint to create the log request.
# Field names are joined in case-insensitive alphabetical order.
url_params = urlencode({
    'date1': START_DATE,
    'date2': END_DATE,
    'source': SOURCE,
    'fields': ','.join(sorted(API_FIELDS, key=str.lower)),
})
url = f'{API_HOST}/management/v1/counter/{COUNTER_ID}/logrequests?{url_params}'

r = requests.post(url, headers=header_dict)

In [None]:
# Display the HTTP status code of the response stored in `r`.
r.status_code

In [None]:
# Display the 'log_request' object from the JSON response body.
json.loads(r.text)['log_request']

In [None]:
# Keep the request id from the response; the polling and download URLs use it.
request_id = json.loads(r.text)['log_request']['request_id']

In [None]:
# Display the stored request id.
request_id

In [None]:
# Poll the log request once a minute until it leaves the 'created' state.
# The status URL does not change between iterations, so it is built once.
url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                host=API_HOST)

status = 'created'
while status == 'created':
    time.sleep(60)
    print('trying')

    r = requests.get(url, headers=header_dict)
    if r.status_code != 200:
        # RuntimeError (an Exception subclass) rather than a bare
        # BaseException, so ordinary `except Exception` handlers see it.
        raise RuntimeError(r.text)

    log_request = json.loads(r.text)['log_request']
    status = log_request['status']
    print(json.dumps(log_request, indent=4))

# NOTE(review): per the Logs API documentation 'processed' is the status of a
# request that is ready for download - confirm. Any other final status is
# reported here instead of failing later when the parts are read.
if status != 'processed':
    raise RuntimeError(
        'Log request {} finished with status {!r}'.format(request_id, status)
    )

In [None]:
# Display the 'log_request' object from the most recent response in `r`.
json.loads(r.text)['log_request']

In [None]:
# Keep the 'parts' list of the log request; the download loop reads
# 'part_number' from each entry. The trailing expression displays the list.
parts = json.loads(r.text)['log_request']['parts']
parts

In [None]:
# Download every part of the log request and combine them into one DataFrame.
tmp_dfs = []
for part in parts:
    part_num = part['part_number']
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download' \
            .format(
                host=API_HOST,
                counter_id=COUNTER_ID,
                request_id=request_id,
                part=part_num
            )

    r = requests.get(url, headers=header_dict)
    if r.status_code != 200:
        # Raise a defined exception type so the API error text is reported
        # (an undefined name here would surface as a NameError instead).
        raise RuntimeError(r.text)

    # The response body is parsed as tab-separated text.
    tmp_df = pd.read_csv(StringIO(r.text), sep='\t')
    tmp_dfs.append(tmp_df)

visits_df = pd.concat(tmp_dfs)

In [None]:
# Display the (rows, columns) shape of the combined visits table.
visits_df.shape

In [None]:
# Save the visits table as a tab-separated file without the index column.
visits_df.to_csv('metrika_cloud_case_data_visits.csv', sep = '\t', index = False)