In [1]:
from string import Template
# Minimum share of Cyrillic letters that is_eligible_to_keep() compares against.
RATIO_THRESHOLD = 0.9

# Each prompt field is wrapped in its own pseudo-XML tag pair.
TOPIC_INSTRUCTION_TEMPLATE = Template("<TOPIC>$content</TOPIC>")
KEYWORDS_INSTRUCTION_TEMPLATE = Template("<KEYWORDS>$content</KEYWORDS>")
CHAPTER_NAME_INSTRUCTION_TEMPLATE = Template("<CHAPTER_NAME>$content</CHAPTER_NAME>")

# The wrapped fields are joined with '|' to form the final model input.
INPUT_TEMPLATE = Template("$topic|$keywords|$chapterName")

# Usage sketch (intentionally left commented out):
#   topic = TOPIC_INSTRUCTION_TEMPLATE.substitute(content='test topic')
#   keywords = KEYWORDS_INSTRUCTION_TEMPLATE.substitute(content='test keywords')
#   chapterName = CHAPTER_NAME_INSTRUCTION_TEMPLATE.substitute(content='test chapterName')
#   prompt = INPUT_TEMPLATE.substitute(topic=topic, keywords=keywords, chapterName=chapterName)
#   print(prompt)

In [2]:
!pip install firebase-admin



In [3]:
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem at this location.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from google.colab import userdata

# Use a service account; the key file path is read from the Colab secret
# named 'firebaseCredentialsPath'.
cred = credentials.Certificate(userdata.get('firebaseCredentialsPath'))

# initialize_app() raises ValueError if the default app already exists, which
# happens whenever this cell is re-run in a live kernel. Reuse the existing
# app in that case so the cell stays idempotent.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

db = firestore.client()

In [7]:
import re
def is_eligible_to_keep(text, threshold=None):
    """Return True when the letters of `text` are predominantly Cyrillic.

    Only Latin and Cyrillic letters are counted; digits, punctuation and
    whitespace are ignored. The Cyrillic set covers the basic а-я range plus
    the letters outside it that Ukrainian (і, ї, є, ґ) and Russian (ё) use.

    Args:
        text: The text to inspect. Empty or None yields False.
        threshold: Minimum Cyrillic share (0..1) required to keep the text.
            Defaults to the module-level RATIO_THRESHOLD.

    Returns:
        bool: True if cyrillic_letters / all_letters >= threshold. False when
        the text contains no letters at all.
    """
    if not text:
        return False

    cyrillic_count = len(re.findall(r'[а-яА-ЯёЁіІїЇєЄґҐ]', text))
    latin_count = len(re.findall(r'[a-zA-Z]', text))
    letter_count = cyrillic_count + latin_count

    # Without any letters there is no ratio to compute (and nothing to keep).
    if letter_count == 0:
        return False

    if threshold is None:
        threshold = RATIO_THRESHOLD

    return (cyrillic_count / letter_count) >= threshold

In [8]:
from google.cloud.firestore_v1.field_path import FieldPath
from google.cloud.firestore_v1.base_query import FieldFilter
import json

PAGE_SIZE = 100
# Only documents belonging to at least one of these collections are exported.
TARGET_COLLECTIONS = ["Бакалаврські роботи", "Магістерські роботи"]
# Chapters whose title contains any of these markers are not used as samples.
SKIPPED_CHAPTER_MARKERS = ("ЗМІСТ", "РЕФЕРАТ", "ABSTRACT")
DATASET_PATH = "/content/drive/MyDrive/diploma-llm/data/fine-tuning/dataset.json"

docs_ref = db.collection("raw-documents")

# Everything except the cursor is identical for every page, so build it once.
base_query = docs_ref.order_by(FieldPath.document_id()).where(
    filter=FieldFilter("collections", "array_contains_any", TARGET_COLLECTIONS)
)
query = base_query.limit(PAGE_SIZE)

i = 0  # zero-based page counter, used only for the progress output

result = []

while True:

  docs = list(query.stream())

  # An empty page means the previous page was the last one.
  if not docs:
    break

  last_doc = docs[-1]
  last_id = last_doc.id

  for doc in docs:
    docDict = doc.to_dict()
    chapters = docDict.get("chapters")
    if not chapters:
      continue

    topic = TOPIC_INSTRUCTION_TEMPLATE.substitute(content=docDict.get("title"))
    # A document without keywords would make str.join() raise TypeError on
    # None; treat it as having an empty keyword list instead.
    keywords = KEYWORDS_INSTRUCTION_TEMPLATE.substitute(content=", ".join(docDict.get("keywords") or []))

    for chapter_title, chapter_text in chapters.items():
      if any(marker in chapter_title for marker in SKIPPED_CHAPTER_MARKERS):
        continue
      if not is_eligible_to_keep(chapter_text):
        continue

      chapterName = CHAPTER_NAME_INSTRUCTION_TEMPLATE.substitute(content=chapter_title)

      # Named `prompt` rather than `input` so the builtin input() is not
      # shadowed for the rest of the notebook session.
      prompt = INPUT_TEMPLATE.substitute(topic=topic, keywords=keywords, chapterName=chapterName)

      result.append({
          'input': prompt,
          'output': chapter_text
      })

  # Continue right after the last document id seen on this page.
  query = base_query.start_after({FieldPath.document_id(): last_id}).limit(PAGE_SIZE)

  print(f"{i} - {last_id}")
  i = i + 1

# Write real UTF-8 characters instead of \uXXXX escapes: the decoded JSON is
# identical, but the file is smaller and readable by humans.
with open(DATASET_PATH, "w", encoding="utf-8") as outfile:
  json.dump(result, outfile, indent=4, ensure_ascii=False)

0 - 0192ff5c-411c-4828-ac7c-26dde2d313e3
1 - 033c7de0-42da-4af9-b47a-c6f1db680b77
2 - 04e1a6ae-eeed-4406-916f-6fca353c5b55
3 - 06aa15ba-4ef8-4ac6-b69c-916c150157d0
4 - 083f8290-0503-4382-83de-7227339db47b
5 - 09e26ced-c22b-479f-8cdb-a715e5c06da7
6 - 0b6116d5-9b8b-43f4-8431-b4ee37ec6577
7 - 0cf5ba33-7fa6-426c-ad5f-f37fb94b3993
8 - 0e9eff5d-1f9e-47f1-b996-51fd99991c7b
9 - 1070c1e2-f952-4272-bdd4-ed6941bf1487
10 - 12087028-7bf0-45de-bfe2-f16db339d854
11 - 138c1c83-4bbe-4b91-9866-15e67c22195f
12 - 15177e67-c1f9-4661-871a-f83368665be8
13 - 16c75759-9765-4e54-9226-b4117a7fdacc
14 - 180c647e-aab0-403e-91b8-f1d1ab32f5e9
15 - 19b7efe0-af93-4609-8c9e-2579921079f2
16 - 1b82a8ae-5e50-4ba2-9b67-9d39124c5061
17 - 1d0b5ef9-404e-4f68-8969-b99f39f9d2ed
18 - 1e61aba1-dd24-47d4-87d6-11e966844778
19 - 202c5975-5815-4e1a-bac6-f7a8249d3a6b
20 - 21ead3ad-ec8f-4066-b603-e12a17acacab
21 - 2375f648-79d9-49b9-bf1d-c9718b15dbbb
22 - 253888ce-b9d8-40ea-93a4-97aa222d7c5a
23 - 26ef49e8-fdde-470b-8711-f88b828c5ac0
24