In [1]:
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseDownload
import os.path
import pickle
from docx import Document
import io
import csv
import re

In [2]:
# OAuth scopes requested during login: the general Drive scope and the
# read-only Docs scope. NOTE(review): the Drive scope is presumably needed
# because recurse_folders() creates converted copies -- confirm before narrowing.
SCOPES = ['https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/documents.readonly']
# ID of the Drive folder passed to recurse_folders() as the root of the listing.
FOLDER_ID = '1HJG2GJ_flNop6bb8n5JORhb0KKlCvuHz'

In [4]:
def authenticate():
    """Return Google OAuth credentials, cached on disk in ``token.pickle``.

    A cached credential object is loaded when the file exists. If it is
    missing or not valid, it is refreshed (when expired and a refresh token
    is present) or a new installed-app flow is run from ``credentials.json``
    with the module-level ``SCOPES``. The resulting credentials are written
    back to ``token.pickle`` before being returned.

    Returns:
        The credentials object (loaded, refreshed or newly obtained).
    """
    token_file = 'token.pickle'

    creds = None
    if os.path.exists(token_file):
        # NOTE(review): pickle.load runs arbitrary code embedded in the file;
        # this is only safe while token.pickle is written by this notebook alone.
        with open(token_file, 'rb') as fh:
            creds = pickle.load(fh)

    # Fast path: the cached credentials are still usable, nothing to persist.
    if creds and creds.valid:
        return creds

    refreshable = creds and creds.expired and creds.refresh_token
    if refreshable:
        creds.refresh(Request())
    else:
        # No usable cached credentials: run the interactive local-server flow.
        creds = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES).run_local_server(port=0)

    with open(token_file, 'wb') as fh:
        pickle.dump(creds, fh)
    return creds


# Obtain credentials once for the whole notebook; this may trigger the
# interactive OAuth flow when no valid cached token is available.
creds = authenticate()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=523587390725-5b4deohlkhi0grlevmrci0sko9e76l8b.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A65198%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocuments.readonly&state=G69mJ5J1LkpVDdfcN3K9HcgYRrXrU8&access_type=offline


In [7]:
def list_files(service, folder_id):
    """Return every non-trashed item that is a direct child of a Drive folder.

    The listing is paginated: each response may carry a ``nextPageToken``,
    which is fed back as ``pageToken`` until the API stops returning one, so
    folders with more children than fit in one page are listed completely.

    Args:
        service: Drive v3 client exposing ``files().list(...).execute()``.
        folder_id: ID of the parent folder.

    Returns:
        list[dict]: the ``files`` entries of all pages, in the order received.
        Each entry carries the requested fields ``id``, ``name``,
        ``mimeType`` and ``webViewLink``.
    """
    items = []
    page_token = None
    while True:
        results = service.files().list(
            q=f"'{folder_id}' in parents and trashed=false",
            pageSize=1000,
            fields="nextPageToken, files(id, name, mimeType, webViewLink)",
            pageToken=page_token).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            return items

def recurse_folders(service, folder_id, path=""):
    """Walk a Drive folder tree and yield one record per Google Doc.

    Children of ``folder_id`` are grouped by name. For anything that is not
    a folder or a native Google Doc, a trailing extension (a ``.`` within the
    last five characters) is removed first, so ``report.doc`` and a Google
    Doc called ``report`` land in the same group. Names containing ``~$`` are
    skipped (presumably Office lock/temp files -- confirm).

    For every group that contains a Word file but no Google Doc, a converted
    copy is requested through ``files().copy``. If at least one copy
    succeeded the folder is listed again so the new documents are picked up;
    otherwise the groups are emitted.

    Args:
        service: Drive v3 client.
        folder_id: ID of the folder to scan.
        path: Slash-terminated path prefix of this folder relative to the
            root (``""`` for the root itself).

    Yields:
        dict: ``id``, ``name``, ``link`` (the doc's ``webViewLink``) and
        ``path`` for the first Google Doc of each group. Groups without a
        Google Doc but with a folder are recursed into (first folder only).
    """
    folder_mime = 'application/vnd.google-apps.folder'
    gdoc_mime = 'application/vnd.google-apps.document'
    word_mimes = (
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/msword'
    )

    # Group siblings by their extension-less, stripped name.
    names = dict()
    for item in list_files(service, folder_id):
        name = item['name']
        if item['mimeType'] not in (folder_mime, gdoc_mime):
            if '.' in name[-5:]:
                name = name[:name.rfind('.')].strip()
        if '~$' in name:
            continue
        names.setdefault(name.strip(), []).append(item)

    copied = False
    for name, group in names.items():
        has_gdoc = any(entry['mimeType'] == gdoc_mime for entry in group)
        words = [entry for entry in group if entry['mimeType'] in word_mimes]
        if words and not has_gdoc:
            source = words[0]
            # `path` already ends with "/" (or is empty), so no extra separator.
            print(f"Converting {path}{source['name']} to Google Docs.... ({name})")
            try:
                service.files().copy(
                    fileId=source['id'],
                    body={'name': name, 'mimeType': gdoc_mime}).execute()
                copied = True
            except Exception as exc:
                # Best effort: report the reason and carry on with the other
                # files. KeyboardInterrupt/SystemExit are deliberately not caught.
                print(f"Failed to convert {path}{source['name']} [{source['mimeType']}] to Google Docs: {exc}")

    if copied:
        # NOTE(review): this re-lists the same folder and relies on the new
        # copies showing up as Google Docs; if a successful copy never does,
        # the recursion would not terminate.
        yield from recurse_folders(service, folder_id, path)
        return

    for name, group in names.items():
        gdocs = [entry for entry in group if entry['mimeType'] == gdoc_mime]
        folders = [entry for entry in group if entry['mimeType'] == folder_mime]
        if gdocs:
            yield dict(id=gdocs[0]['id'], name=name, link=gdocs[0]['webViewLink'], path=path)
        elif folders:
            yield from recurse_folders(service, folders[0]['id'], path + name + "/")

# Drive v3 client used by list_files() / recurse_folders().
service = build('drive', 'v3', credentials=creds)
# Materialise the generator so the documents can be counted here and
# iterated again by a later cell.
docs = list(recurse_folders(service, FOLDER_ID))
print(f"Found {len(docs)} documents")

Converting /מדיניות הטיפול בנוצץ.msg to Google Docs.... (מדיניות הטיפול בנוצץ)
Failed to converte/מדיניות הטיפול בנוצץ.msg [application/msword] to Google Docs
Converting /נוהללטיפולבפניותהציבור (2).doc to Google Docs.... (נוהללטיפולבפניותהציבור (2))
Failed to converte/נוהללטיפולבפניותהציבור (2).doc [application/msword] to Google Docs
Converting 5.27//תעס מדיניות השירות לעיוור - 6.8הערות נוספות של אסא.msg to Google Docs.... (תעס מדיניות השירות לעיוור - 6.8הערות נוספות של אסא)
Failed to converte5.27//תעס מדיניות השירות לעיוור - 6.8הערות נוספות של אסא.msg [application/msword] to Google Docs
Converting 5.27//הערות אסא להוראה.msg to Google Docs.... (הערות אסא להוראה)
Failed to converte5.27//הערות אסא להוראה.msg [application/msword] to Google Docs
Converting 1.42//RE טיוטת נוהל  טיפול בילודים ומשפחותיהם של משרד הבריאות.msg to Google Docs.... (RE טיוטת נוהל  טיפול בילודים ומשפחותיהם של משרד הבריאות)
Failed to converte1.42//RE טיוטת נוהל  טיפול בילודים ומשפחותיהם של משרד הבריאות.msg [applicati

In [86]:
# Matches the Hebrew heading word for "definitions", written either normally
# or with a space between each letter, as a whole word.
DEFS = re.compile(r'\bהגדרות\b|\bה ג ד ר ו ת\b', re.IGNORECASE)
# Matches a run of one or more spaces; used to collapse runs to a single space.
SPACES = re.compile(r' +')

def get_document_contents(service, document_id):
    """Fetch a Google Doc and return the structural elements of its body.

    Args:
        service: Docs v1 client exposing ``documents().get(...).execute()``.
        document_id: ID of the document to fetch.

    Returns:
        list: the ``body.content`` list of the returned document, or an empty
        list when the response has no body or no content (so callers can
        always iterate the result).
    """
    doc = service.documents().get(documentId=document_id).execute()
    body = doc.get('body') or {}
    return body.get('content') or []

def find_definitions_section(contents, debug=False):
    """Extract ``[term, definition]`` pairs from a document's definitions section.

    ``contents`` is walked element by element. The text runs of each
    paragraph are joined, vertical tabs become line breaks, carriage returns
    and non-breaking spaces are dropped, space runs are collapsed, and the
    result is processed line by line:

    * A line shorter than 15 characters that matches ``DEFS`` opens the
      definitions section; its first character is remembered as the section
      marker when the line starts with a digit or has ``.`` as second character.
    * Inside the section, a line whose leading marker differs from the
      remembered one ends the scan.
    * Leading digits/dots (and a single character followed by ``.``) are
      stripped; the line is then split on the first of ``-``, ``:``, ``–``
      that occurs in characters 2..49 of the stripped line.
    * A line without such a delimiter is appended to the previous
      definition; after more than two consecutive such lines (or when there
      is no previous definition) the section is considered closed.
    * If the heading has not been found within the first 10000 characters of
      non-empty text, the scan stops.

    Args:
        contents: iterable of structural elements (dicts); only elements with
            a ``paragraph`` key contribute text.
        debug: when true, print a trace of every processed line.

    Returns:
        list[list[str]]: ``[term, definition]`` pairs in document order.
    """
    definitions = []
    in_definitions = False
    in_definitions_digit = None
    in_definitions_errors = 0
    # Character budget for locating the heading; only enforced while we are
    # outside a definitions section.
    count = 10000
    for element in contents:
        text = ''
        if 'paragraph' in element:
            for elem in element['paragraph'].get('elements') or []:
                if 'textRun' in elem:
                    text += elem['textRun']['content'].replace('\xa0', '')
        text = text.replace('\x0b', '\n').replace('\x0d', '')
        text = SPACES.sub(' ', text)
        for ptext in text.split('\n'):
            ptext = ptext.strip()
            if not ptext:
                continue
            count -= len(ptext)
            if count < 0 and not in_definitions:
                return definitions
            if debug:
                print(f"ZZZ: {ptext!r}, {DEFS.search(ptext)}, {len(ptext)}")
            if DEFS.search(ptext) and len(ptext) < 15:
                in_definitions = True
                # ptext[1:2] instead of ptext[1]: safe for any line length.
                if ptext[0].isdigit() or ptext[1:2] == '.':
                    in_definitions_digit = ptext[0]
                if debug:
                    print(f'DDDDD {ptext!r}\n--- {in_definitions_digit!r}')
                continue
            if ' הגדרות' in ptext:
                print(f'ERROR: Found "הגדרות" in {ptext!r}')
            if ' ה ג ד ר ו ת' in ptext:
                print(f'ERROR: Found "ה ג ד ר ו ת" in {ptext!r}')

            if not in_definitions:
                continue

            if debug:
                print(f'DDDDD1 {ptext[:5]!r}\n --- {in_definitions_digit!r}')
            # A numbered line whose marker differs from the section's marker
            # belongs to the next section. A one-character line used to raise
            # IndexError here; slicing keeps it a plain continuation line.
            if ptext[0].isdigit() or ptext[1:2] == '.':
                digit = ptext[0]
                if not in_definitions_digit:
                    in_definitions_digit = digit
                elif digit != in_definitions_digit:
                    return definitions

            # Strip the numbering prefix ("1.2.", "a." ...).
            while ptext and (ptext[0].isdigit() or ptext[0] == '.' or ptext[1:2] == '.'):
                ptext = ptext[1:].strip()
            if not ptext:
                continue

            # Delimiter must appear early in the line, but not in the first
            # two characters.
            definition = ptext[2:50]
            for delimiter in ['-', ':', '–']:
                if delimiter in definition:
                    definitions.append([x.strip() for x in ptext.split(delimiter, 1)])
                    in_definitions_errors = 0
                    break
            else:
                if definitions:
                    # Treat as a continuation of the previous definition.
                    definitions[-1][1] += ' ' + ptext
                    in_definitions_errors += 1
                    if in_definitions_errors > 2:
                        print(f'ERROR: too many errors in definitions\n({ptext})')
                        in_definitions = False
                        in_definitions_digit = None
                        in_definitions_errors = 0
                else:
                    print(f'ERROR: no delimiter found in {definition!r}\n({ptext})')
                    in_definitions = False
                    in_definitions_digit = None
                    in_definitions_errors = 0

    return definitions

In [None]:
def get_document_contents(service, file_id):
    """Download a Drive file and open it as a ``docx`` ``Document``.

    NOTE: this shares its name with the Docs-API ``get_document_contents``
    defined in an earlier cell; running this cell rebinds the name to this
    download-based variant.

    Args:
        service: Drive client exposing ``files().get_media``.
        file_id: ID of the file to download.

    Returns:
        The ``Document`` built from the downloaded bytes.
    """
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, service.files().get_media(fileId=file_id))
    # Pull chunks until the downloader stops reporting "not done".
    while True:
        _, finished = downloader.next_chunk()
        if finished is not False:
            break
    buffer.seek(0)
    return Document(buffer)

def find_definitions_section(document):
    """Extract ``[term, definition]`` pairs from a ``docx``-style document.

    NOTE: this shares its name with the Docs-API ``find_definitions_section``
    defined in an earlier cell; running this cell rebinds the name to this
    paragraph/style-based variant.

    The section starts at a paragraph whose first line (first 15 characters,
    colons ignored) ends with the Hebrew word for "definitions". It ends at
    the first ``Heading*`` / ``List Paragraph`` styled paragraph seen after at
    least one definition was collected, at a paragraph whose leading digit
    differs from the section's digit, or at a paragraph with no ``-``, ``:``
    or ``–`` in its first 50 characters.

    Args:
        document: object with a ``paragraphs`` iterable; each paragraph
            exposes ``text`` and ``style.name``.

    Returns:
        list[list[str]]: ``[term, definition]`` pairs in document order.

    Raises:
        AssertionError: if the heading word appears in the first line of a
            paragraph that was not recognised as the section heading.
    """
    definitions = []
    in_definitions = False
    in_definitions_digit = None
    for paragraph in document.paragraphs:
        ptext = paragraph.text.strip()
        if not ptext:
            continue
        ptextline = ptext.split('\n')[0][:15]
        if ptextline.replace(':', '').endswith('הגדרות'):
            in_definitions = True
            if ptext[0].isdigit():
                in_definitions_digit = ptext[0]
            continue

        assert 'הגדרות' not in ptextline, f'ERROR: spurious definitions {ptext}, {repr(ptextline)}'
        if not in_definitions:
            continue

        style_name = paragraph.style.name
        if style_name.startswith('Heading') or style_name == 'List Paragraph':
            # Was `if text:` -- an undefined name in this function. The
            # section ends at a heading only once something was collected.
            if definitions:
                break
        if ptext[0].isdigit():
            digit = ptext[0]
            if not in_definitions_digit:
                in_definitions_digit = digit
            elif digit != in_definitions_digit:
                break

        # Strip the numbering prefix ("1.2." ...).
        while ptext and (ptext[0].isdigit() or ptext[0] == '.'):
            ptext = ptext[1:].strip()
        if not ptext:
            continue

        definition = ptext[:50]
        for delimiter in ['-', ':', '–']:
            if delimiter in definition:
                break
        else:
            print(f'ERROR: no delimiter found in {definition!r}\n({paragraph.style.name}, {ptext})')
            break
        definitions.append([x.strip() for x in ptext.split(delimiter, 1)])
    return definitions


In [88]:
import time

# Docs v1 client; the loop below expects the Docs-API variants of
# get_document_contents() / find_definitions_section() to be the ones bound.
docs_service = build('docs', 'v1', credentials=creds)

# Rows of (document name, path, term, definition).
collected = []
print(len(docs))
for i, doc in enumerate(docs):
    print(f"{i:03d}:Document: {doc['name']} - {doc['link']}")
    for retry in range(3):
        try:
            text = get_document_contents(docs_service, doc['id'])
            # Pass debug=True as second argument to trace a specific document.
            definitions = find_definitions_section(text)
            for definition in definitions:
                collected.append((doc['name'], doc['path'], definition[0], definition[1]))
                print(f" - {definition[0]}::")
                print(f"   {definition[1]}")
            break
        except TimeoutError:
            # Transient: wait and try the same document again.
            print(f'RETRYING... {retry+1}/3')
            time.sleep(60)
        except Exception as e:
            # Any other failure is reported once and the document is skipped.
            # Retrying here could append the same rows twice, since rows are
            # added to `collected` before the failing statement.
            # Double-quoted f-strings keep this valid before Python 3.12.
            print(f"{i:03d}:ERROR: {doc['name']} :", e.__class__.__name__, e)
            print(f"  - {doc['link']}")
            break
    # Small pause between documents to stay gentle on the API.
    time.sleep(1)


with open('definitions.pickle', 'wb') as f:
    pickle.dump(collected, f)

# newline='' is what the csv module expects for writer targets; the explicit
# encoding keeps the Hebrew text writable regardless of the platform default.
with open('definitions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Document', 'Path', 'Term', 'Definition'])
    writer.writerows(collected)


679
000:Document: תוכנית יחדיו - https://docs.google.com/document/d/147gGBT6cciuKPYgiBf5SWvkmO_msT8JXkPWXVulEiiY/edit?usp=drivesdk
001:Document: מרכזיאיבחוןמסוגלות - https://docs.google.com/document/d/18XQFQ6ZCTgeKYLqnvLncm05oDDiTIdteTi8D7b83-DA/edit?usp=drivesdk
002:Document: טיפולישיניים - https://docs.google.com/document/d/1zoFQhq7Ke4gd0-hMpTdMobDTO9zR-5h0kkIxCMnkRNU/edit?usp=drivesdk
 - הפוליסה::
   הסכם על כל התנאים והתנאים הכלולים עפ"י מכרז – ביטוח שיניים והנספחים המצורפים אליו.
 - בעל הפוליסה::
   משרד העבודה והרווחה.
 - פנימיה::
   מוסד ילדים, משפחה אומנת, הוסטל או כל מסגרת אשר בה נמצאים חניכים כולל לינה שהופנו ע"י משרד העבודה והרווחה או ע"י שירותי הרווחה של הרשות המקומית או חניכים בפנימיות אשר משרד העבודה והרווחה משתתף במימון אחזקתם.
 - רופא מוסכם::
   רופא שיניים הקשור עם המבטח בהסכם למתן שירותים לפי הפוליסה למבוטחיו של המבטח.
003:Document: תעוד ורישום - https://docs.google.com/document/d/16awKBRSd8uwZNmWQl8eJnu0OIJthdMfo8vMwhvivzaI/edit?usp=drivesdk
004:Document: סיועמתקציבנ