In [None]:
#!pip install requests
#!pip install beautifulsoup4
#!pip uninstall -y requests-html
#!pip install git+https://github.com/psf/requests-html.git
#!pip install lxml_html_clean 

In [None]:
import datetime
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
import re

In [None]:
import os

# Credentials and the export path are taken from environment variables so that
# nothing secret is stored in the notebook itself. A variable that is not set
# yields None for the credential fields.
account = dict(
  username=os.environ.get('DOIT_IM_USERNAME'),
  password=os.environ.get('DOIT_IM_PASSWORD'),
)

# An unset (or empty) DOIT_IM_EXPORT_FILE falls back to a file in the working directory.
output_file = os.environ.get('DOIT_IM_EXPORT_FILE') or "doit-im-export.json"

In [None]:
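# Google Colab alternative: uncomment this cell to read the same settings from Colab's `userdata` instead of environment variables.
# from google.colab import userdata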

# account = {
#   'username': userdata.get('DOIT_IM_USERNAME'),
#   'password': userdata.get('DOIT_IM_PASSWORD'),
# }
#
# output_file = userdata.get('DOIT_IM_EXPORT_FILE') or "doit-im-export.json"

In [None]:
# Sign-in endpoint and the base URL of the single-page app; view and task ids
# (which start with "#/") are appended to url_home.
url_login = 'https://i.doit.im/signin'
url_home = 'https://i.doit.im/home/'

# Values handed to `arender(sleep=...)` by get_html (requests-html documents
# this as a number of seconds to wait after the initial render).
sleep_default = 2   # default for get_html
sleep_task = 0.5    # shorter wait used when opening individual task pages

In [None]:
# Single requests-html session shared by every request in this notebook.
asession = AsyncHTMLSession()
# Most recent response. login() and get_html() rebind it and pass its cookies
# to the following request, so it must be populated by login() first.
res = None

In [None]:
async def login():
  """Sign in to Doit.im and leave the sign-in response in the global `res`.

  The sign-in page is requested first; the `account` mapping is then POSTed to
  the same URL together with the cookies received from that first request.

  Raises:
    AssertionError: if the POST is not answered with HTTP 200.
  """
  global res
  signin_page = await asession.get(url_login)
  res = signin_page
  res = await asession.post(url=url_login, data=account, cookies=signin_page.cookies)
  assert res.status_code == 200

def output_html(text):
  """Dump `text` (plus a trailing newline) to `response.html` for debugging.

  The file is written in the current working directory and overwritten on
  every call. It is always encoded as UTF-8, like the JSON export, so the
  result does not depend on the platform's locale encoding and non-ASCII page
  content (e.g. Japanese labels) can always be written.

  Args:
    text: the value to write; it is converted with `str()` by `print`.
  """
  with open("response.html", "w", encoding="utf-8") as f:
    print(text, file=f)

async def get_html(url, sleep=sleep_default):
  """Fetch `url`, render it with requests-html and return the resulting HTML.

  The cookies of the previous response (global `res`) are sent with the
  request, and `res` is rebound to the new response, so login() must have run
  before the first call.

  Args:
    url: address to load.
    sleep: forwarded to `arender(sleep=...)`. The default is the value that
      `sleep_default` had when this function was defined.

  Returns:
    The `html` text of the response's rendered `html` object.

  Raises:
    AssertionError: if the response status is not 200.
  """
  global res
  previous_cookies = res.cookies
  res = await asession.get(url, cookies=previous_cookies)
  assert res.status_code == 200
  page = res.html
  await page.arender(cookies=res.cookies, send_cookies_session=True, sleep=sleep)
  #output_html(page.html)  # for debug
  return page.html

In [None]:
def get_group_title(g):
  """Return the text of the group's `span.group-title` element."""
  return g.find('span', class_='group-title').get_text()


def get_group_size(g):
  """Return the number of `a.link-title` anchors found in the group."""
  return len(g.find_all('a', class_='link-title'))


def _task_anchors(g):
  """Return the group's `a.link-title` anchors; several fields are located through them."""
  return g.find_all('a', class_='link-title')


def _task_ids(g):
  """`ng-href` attribute of every task anchor."""
  return [anchor['ng-href'] for anchor in _task_anchors(g)]


def _task_titles(g):
  """Text of every task anchor."""
  return [anchor.get_text() for anchor in g.find_all('a', class_="link-title")]


def _task_contexts(g):
  """Text of every `a.context` element with its first character removed."""
  return [ele.get_text()[1:] for ele in g.find_all('a', class_="context")]


def _task_projects(g):
  """Text of every `a.project` element with its first character removed."""
  return [ele.get_text()[1:] for ele in g.find_all('a', class_="project")]


def _task_priorities(g):
  """Last character of the last CSS class of the `div.pri` found under each anchor's grandparent."""
  return [
    anchor.parent.parent.select_one('div.pri').attrs['class'][-1][-1]
    for anchor in _task_anchors(g)
  ]


def _task_notes(g):
  """`html-title` attribute of every `div.comment` element."""
  return [ele['html-title'] for ele in g.find_all('div', class_="comment")]


def _task_repeaters(g):
  """True for each anchor whose parent contains a `div.repeat` without the `ng-hide` class."""
  return [
    bool(anchor.parent.select("div.repeat:not(.ng-hide)"))
    for anchor in _task_anchors(g)
  ]


# Field name -> extractor. Each extractor takes a group element and returns one
# list for the whole group; get_group_contents() indexes these lists by task.
get_tasks_func = {
  'id': _task_ids,
  'title': _task_titles,
  'context': _task_contexts,
  'project': _task_projects,
  #'startat': lambda g: list(map(lambda e: e.get_text(), g.find_all('div', class_="start-at"))),  # available only in #/schedule
  'priority': _task_priorities,
  'notes': _task_notes,
  'repeater': _task_repeaters,
}
keys = get_tasks_func.keys()

def get_group_contents(g):
  """Turn one task-group element into a plain dict.

  Every extractor in `get_tasks_func` is run once on the group, giving one
  list per field; the i-th task is then assembled from the i-th entry of each
  list.

  NOTE(review): this assumes each field list has one entry per task, in task
  order. A shorter list raises IndexError; a longer or shifted one would pair
  values with the wrong task - confirm against the rendered DOM.

  Args:
    g: a group element (a `div.group` node in the callers shown here).

  Returns:
    dict with 'group-title', 'group-size' and 'tasks' (a list of per-task
    dicts keyed by the names in `keys`).
  """
  per_field = {}
  for key in keys:
    per_field[key] = get_tasks_func[key](g)

  size = get_group_size(g)
  tasks = []
  for index in range(size):
    tasks.append({key: per_field[key][index] for key in keys})

  return {
    'group-title': get_group_title(g),
    'group-size': size,
    'tasks': tasks
  }

In [None]:
# Sign in, then load and render the home page. The parsed page (`soup`) is what
# the following cells search for the sidebar's categories, projects and contexts.
await login()
html = await get_html(url_home)
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Category views: `a.side-item` anchors whose ng-href starts with "#/", labelled
# by the `span.tit.ng-binding` elements.
eles_category_id = soup.find_all('a', class_='side-item', attrs={'ng-href': re.compile('^#/')})
eles_category_name = soup.find_all('span', class_='tit ng-binding')

category_ids = [ele['ng-href'] for ele in eles_category_id]
category_names = [ele.get_text() for ele in eles_category_name]

# Ids and names come from two independent searches and are paired by position.
categories = dict(zip(category_ids, category_names))
categories

In [None]:
# Sidebar links to individual projects: every "#/project/<id>" href except the
# "all" entry. The earlier pattern used the character class `[^(all)]`, which
# does not mean "not the word all": it rejected any id whose FIRST character
# was "(", "a", "l" or ")". Such projects were dropped, and because names are
# paired with ids by position below, the remaining names shifted as well.
# The negative lookahead excludes only an id that is exactly the word "all".
eles_project_id = soup.find_all('a', class_='side-item', attrs={'href': re.compile(r'^#/project/(?!all\b).')})
project_ids = [ele['href'] for ele in eles_project_id]
eles_project_name = soup.find_all('span', attrs={'ng-bind-html': 'project.name'})
project_names = [ele.get_text() for ele in eles_project_name]
# Ids and names come from two independent searches and are paired by position.
projects = dict(zip(project_ids, project_names))
projects

In [None]:
# Sidebar links to individual contexts: every "#/context/<id>" href except the
# "all" entry. The earlier pattern used the character class `[^(all)]`, which
# does not mean "not the word all": it rejected any id whose FIRST character
# was "(", "a", "l" or ")". Such contexts were dropped, and because names are
# paired with ids by position below, the remaining names shifted as well.
# The negative lookahead excludes only an id that is exactly the word "all".
eles_context_id = soup.find_all('a', class_='side-item', attrs={'href': re.compile(r'^#/context/(?!all\b).')})
context_ids = [ele['href'] for ele in eles_context_id]
eles_context_name = soup.find_all('span', attrs={'ng-bind-html': 'context.name'})
context_names = [ele.get_text() for ele in eles_context_name]
# Ids and names come from two independent searches and are paired by position.
contexts = dict(zip(context_ids, context_names))
contexts

In [None]:
# The inbox view is declared by hand as {view id: display name}; the display
# name is the Japanese label that ends up in the export.
inbox = {'#/inbox': '収拾箱'}
inbox

In [None]:
# Optional extra views, as {view id: display name}. Empty by default; swap in
# the commented line below to list the completed and trash views as well.
others = {}
#others = { '#/completed': '完了', '#/trash': 'ゴミ箱' }   # include if you want
others

In [None]:
# Every view that gets an entry in the export, in this order. If the same view
# id appears in more than one mapping, the later mapping's name wins.
all_view_groups = {**inbox, **categories, **projects, **contexts, **others}
all_view_groups

In [None]:
# Views whose task lists are actually scraped; all other views are exported by
# name only.
target_views_extracting_tasks = {**inbox, **categories}
target_views_extracting_tasks

In [None]:
now = datetime.datetime.now()

output = {}
output['unixtime'] = int(datetime.datetime.timestamp(now))
output['localtime'] = f'{now}'
output['views'] = {}

for (view_id, view_name) in all_view_groups.items():
  print(f'processing {view_id}: {view_name}')
  html = await get_html(url_home + view_id)
  soup = BeautifulSoup(html, 'html.parser')
  eles_group = soup.find_all('div', class_='group')
  if view_id in target_views_extracting_tasks:
    contents = list(map(lambda g: get_group_contents(g), eles_group))
    output['views'][view_id] = {'name': view_name, 'contents': contents}
  else: # get only names of context and projects
    output['views'][view_id] = {'name': view_name }

In [None]:
for contents in output['views']['#/scheduled']['contents']:
  for task in contents['tasks']:
    print(f"processing {task['id']}: {task['title']}")
    if task['repeater'] == True:
      html = await get_html(url_home + task['id'], sleep=sleep_task)
      soup = BeautifulSoup(html, 'html.parser')
      repeat = soup.select_one('div.item.repeat').get_text().strip()
      assert(repeat and repeat != '')  # try set sleep longer of get_html()
      task['repeat'] = repeat

In [None]:
for (view_id, view_name) in target_views_extracting_tasks.items():
  print(f'processing {view_id}: {view_name}')
  for contents in output['views'][view_id]['contents']:
    for task in contents['tasks']:
      print(f"  processing {task['id']}: {task['title']}")
      html = await get_html(url_home + task['id'], sleep=sleep_task)
      soup = BeautifulSoup(html, 'html.parser')
      s = soup.select_one('div.item.time').get_text()
      task['time'] = s
      r = soup.select_one('div.item.reminder').get_text()
      m1 = re.findall(r'(\d+ [分時日]).*前', r)
      m2 = re.findall(r'\d+\-\d+\-\d+ \d+:\d+', r)
      #display(f"'{s}','{r}','{m1}','{m2}'")
      if (m1 or m2):
        task['reminder'] = []
        task['reminder'] += [ m1[0] ] if m1 else []
        task['reminder'] += [ m2[0] ] if m2 else []

In [None]:
import json

# Serialise once; ensure_ascii=False keeps non-ASCII text (such as the Japanese
# view names) as-is instead of \u escapes, hence the explicit UTF-8 file.
output_json = json.dumps(output, ensure_ascii=False)
with open(output_file, mode="w", encoding='utf-8') as export_fh:
  export_fh.write(output_json)
  export_fh.write("\n")

# SandBox

In [None]:
# Show the assembled export structure for manual inspection.
output

In [None]:
# Show the JSON text that was written to the export file.
output_json