<a href="https://colab.research.google.com/github/urielmun/AI_lab/blob/main/TT_dataLoad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import re
import json
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import RobertaConfig, RobertaModel



## 테스트 데이터셋 만들기

In [2]:
# Clone the project-dialogism-novel-corpus (PDNC) repository into the current working directory.
!git clone https://github.com/Priya22/project-dialogism-novel-corpus.git

Cloning into 'project-dialogism-novel-corpus'...
remote: Enumerating objects: 180, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (148/148), done.[K
remote: Total 180 (delta 32), reused 180 (delta 32), pack-reused 0 (from 0)[K
Receiving objects: 100% (180/180), 8.92 MiB | 18.38 MiB/s, done.
Resolving deltas: 100% (32/32), done.


In [11]:
# Extract quoted utterances and count the remaining (narrative) sentences
def count_sentences_with_dialogue(text):
    """
    Return the ratio of quoted utterances to narrative sentences in `text`.

    An utterance is any span enclosed in straight ASCII double quotes ("...");
    curly quotes are not recognised. The utterances are blanked out and what
    is left is split after every '.', '!' or '?' that is followed by
    whitespace, an uppercase ASCII letter, or the end of the text. Each
    non-blank piece counts as one narrative sentence.

    Args:
        text (str): Text to analyse. None or an empty string is accepted.

    Returns:
        float: len(utterances) / len(narrative sentences), rounded to two
        decimals. 0.0 when the text is empty or has neither utterances nor
        narrative sentences; float('inf') when there are utterances but no
        narrative sentence at all (the ratio is unbounded).
    """
    if not text:
        return 0.0

    quote_pattern = r'"[^"]*"'
    dialogues = re.findall(quote_pattern, text)
    # Replace each utterance with a space so the words around it stay separated.
    remaining_text = re.sub(quote_pattern, ' ', text)

    # Zero-width split point: just after sentence-ending punctuation.
    sentence_endings = r'(?<=[.!?])(?=\s|[A-Z]|$)'
    other_sentences = re.split(sentence_endings, remaining_text)
    other_sentences_count = sum(1 for s in other_sentences if s.strip())

    if other_sentences_count == 0:
        # Avoid ZeroDivisionError on texts with no narrative sentences.
        return float('inf') if dialogues else 0.0
    return round(len(dialogues) / other_sentences_count, 2)

# --- Smoke test ---
# The sample holds two quoted utterances and three narrative sentences.
# NOTE: the printed label reads "final sentence count", but the value shown is
# the utterance / narrative-sentence ratio returned by the function.
sample_text = """
He said, "Hello! My name is John. What is yours?" and then he walked away.
It was a sunny day. "Wait for me!" someone shouted.
"""
dialogue_ratio = count_sentences_with_dialogue(sample_text)
print(f"최종 문장 수: {dialogue_ratio}개")

최종 문장 수: 0.67개


In [6]:
# PDNC book list, used later to keep novels that are already in PDNC
# out of the training data.
PDNC_DATA_FILEPATH = 'project-dialogism-novel-corpus/data'
PDNC_index_file_path = '/content/project-dialogism-novel-corpus/PDNC-Novel-Index.csv'

# Load the novel index and pull out the title column.
PDNC_index = pd.read_csv(PDNC_index_file_path)
pdnc_book_title = PDNC_index['Novel Title']

# A set gives constant-time membership checks on titles.
pdnc_booktitle_set = set(pdnc_book_title)

print(pdnc_book_title)
print(pdnc_booktitle_set)

0                    A Handful Of Dust
1     Alice's Adventures in Wonderland
2                 Anne Of Green Gables
3                   A Passage to India
4                   A Room With A View
5                         Daisy Miller
6                                 Emma
7                           Hard Times
8                          Howards End
9                       Mansfield Park
10                       Night and Day
11                    Northanger Abbey
12                        Oliver Twist
13                          Persuasion
14                 Pride and Prejudice
15               Sense and Sensibility
16                The Age of Innocence
17                       The Awakening
18                         The Gambler
19                   The Invisible Man
20            The Man Who Was Thursday
21     The Mysterious Affair At Styles
22          The Picture Of Dorian Gray
23                The Sign of the Four
24               The Sport of the Gods
25                  The S

In [12]:
# Mean utterance / narrative-sentence ratio over the PDNC novels.
# For every entry under the data folder, read its novel_text.txt, compute the
# ratio with count_sentences_with_dialogue, then average the per-novel ratios.
data_folder_dir = '/content/project-dialogism-novel-corpus/data'
data_list=os.listdir(data_folder_dir)
print(data_list)
speech_rate_list=[]
for book in data_list:
    # Build the path from data_folder_dir so the location is defined in one place.
    file_path = os.path.join(data_folder_dir, book, 'novel_text.txt')
    if not os.path.isfile(file_path):
        # os.listdir returns every entry; skip those without a novel text.
        print(f"Skip (no novel_text.txt): {book}")
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    speech_rate = count_sentences_with_dialogue(text)
    speech_rate_list.append(speech_rate)
print(speech_rate_list)

if not speech_rate_list:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise FileNotFoundError(f"No novel_text.txt files found under {data_folder_dir}")

speech_rate_mean=sum(speech_rate_list)/len(speech_rate_list)
print(speech_rate_mean)


['NorthangerAbbey', 'APassageToIndia', 'TheInvisibleMan', 'AHandfulOfDust', 'AlicesAdventuresInWonderland', 'TheSportOfTheGods', 'TheSunAlsoRises', 'ThePictureOfDorianGray', 'DaisyMiller', 'PrideAndPrejudice', 'TheSignOfTheFour', 'OliverTwist', 'TheAgeOfInnocence', 'HardTimes', 'SenseAndSensibility', 'TheManWhoWasThursday', 'MansfieldPark', 'TheAwakening', 'TheGambler', 'Emma', 'HowardsEnd', 'NightAndDay', 'WhereAngelsFearToTread', 'ARoomWithAView', 'WinnieThePooh', 'AnneOfGreenGables', 'Persuasion', 'TheMysteriousAffairAtStyles']
[0.62, 0.63, 0.51, 1.21, 1.61, 0.54, 0.8, 0.55, 0.76, 0.57, 0.77, 0.73, 0.54, 0.76, 0.52, 0.73, 0.44, 0.31, 0.57, 0.58, 0.7, 0.44, 0.56, 0.71, 1.26, 0.69, 0.44, 0.71]
0.687857142857143


In [None]:
#PDNC 데이터 장르, 시대 구하기
#장르
#시대


## 학습 데이터셋 만들기

In [14]:


class Gutenberg_English_Preprocessor:
    """
    A text preprocessor designed to clean Project Gutenberg text data.

    The cleaning steps mutate `self.text` in place. `preprocess()` runs them
    in a fixed order and returns the result. The steps remove:
    - Everything outside the "*** START OF ..." / "*** END OF ..." markers
    - "Small Print" sections from Project Gutenberg files
    - Blocks enclosed in '*' patterns
    - Blocks enclosed in '=' lines
    - Lines containing the word "Gutenberg" (case insensitive)
    """

    def __init__(self, text: str):
        """
        Initializes the Gutenberg_English_Preprocessor with the provided text.

        Args:
            text (str): The text content to be processed. None is treated as
                an empty string so that the regex-based steps do not fail.
        """
        self.text = text if text is not None else ""

    def remove_equal_sign_blocks(self):
        """
        Removes blocks of text enclosed by lines containing only '=' symbols.

        Example:
        ========================
        This content will be removed.
        ========================
        """
        equal_block_pattern = r'^\s*=+\s*\n(?:.*?\n)*?\s*=+\s*$'
        self.text = re.sub(equal_block_pattern, '', self.text, flags=re.MULTILINE)
        self.text = self.text.strip()

    def remove_gutenberg_sentences(self):
        """
        Removes every line that contains the word "Gutenberg" in any case.

        The whole line is dropped, together with its trailing newline, not
        just the sentence that mentions the word.

        Example:
        "This is a Project Gutenberg text."   -> Removed
        "Thanks to the GUTENBERG volunteers." -> Removed
        "This is a normal sentence."          -> Retained
        """
        gutenberg_pattern = r'^[^\n]*\bgutenberg\b[^\n]*\n?'
        self.text = re.sub(gutenberg_pattern, '', self.text, flags=re.IGNORECASE | re.MULTILINE)
        self.text = self.text.strip()

    def remove_small_print(self):
        """
        Removes Project Gutenberg's "Small Print" sections.

        A section runs from "***START**THE SMALL PRINT" to either
        "*END*THE SMALL PRINT!" or "*END THE SMALL PRINT", across lines.
        """
        pattern1 = r'\*\*\*START\*\*THE SMALL PRINT.*?\*END\*THE SMALL PRINT!'
        pattern2 = r'\*\*\*START\*\*THE SMALL PRINT.*?\*END THE SMALL PRINT'

        # DOTALL lets '.' cross newlines so a multi-line section is matched.
        self.text = re.sub(pattern1, '', self.text, flags=re.DOTALL)
        self.text = re.sub(pattern2, '', self.text, flags=re.DOTALL)
        self.text = self.text.strip()

    def start_end(self):
        """
        Trims the text to the span delimited by:
        - "*** START OF THE PROJECT GUTENBERG..."
        - "*** END OF THE PROJECT GUTENBERG..."

        The slice begins at the START marker (the marker text itself is kept
        here) and stops just before the END marker. If only one marker is
        present, the text is trimmed on that side only. If neither is
        present, the text is left unchanged. The END marker is searched for
        after the START marker.
        """
        str_str = "*** START OF THE PROJECT GUTENBERG"
        end_str = "*** END OF THE PROJECT GUTENBERG"

        start_idx = self.text.find(str_str)
        begin = start_idx if start_idx != -1 else 0
        end_idx = self.text.find(end_str, begin)

        if start_idx == -1 and end_idx == -1:
            return

        finish = end_idx if end_idx != -1 else len(self.text)
        self.text = self.text[begin:finish]

    def remove_patterns(self):
        """
        Removes patterns enclosed by '*' characters, such as:
        - Inline patterns like "* text *", "** text **", etc.
        - Standalone patterns and multi-line blocks enclosed in '*'

        Because DOTALL is set, a line that starts with '*' is matched up to
        the next run of '*' that ends a line, which may be several lines
        further down.
        """
        star_pattern = r'^\s*\*{1,4}.*?\*{1,4}\s*$'
        self.text = re.sub(star_pattern, '', self.text, flags=re.MULTILINE | re.DOTALL)
        self.text = self.text.strip()

    def preprocess(self):
        """
        Executes the full text preprocessing pipeline by calling all individual
        cleaning functions in the desired sequence.

        Returns:
            str: The cleaned and processed text content.
        """
        self.start_end()
        self.remove_small_print()
        self.remove_patterns()
        self.remove_equal_sign_blocks()
        self.remove_gutenberg_sentences()
        return self.text


In [15]:
MAX_BOOKS = 10  # maximum number of books to save


def _normalize_title(title):
    """Return a comparison key for a title: first line only, whitespace collapsed, case-folded."""
    lines = (title or "").strip().splitlines()
    first_line = lines[0] if lines else ""
    return " ".join(first_line.split()).casefold()


# Normalised PDNC titles. Exact string comparison misses titles that differ
# only in capitalisation or carry a subtitle on a second line, which would
# let PDNC (test) novels leak into the training data.
pdnc_title_keys = {_normalize_title(t) for t in pdnc_booktitle_set if isinstance(t, str)}

datasets = load_dataset(
    "incredible45/Gutenberg-BookCorpus-Cleaned-Data-English",
    split="train",
    streaming=True
)

count = 0
save_dir="Train_Data"
os.makedirs(save_dir,exist_ok=True)

filename=os.path.join(save_dir,"train.jsonl")

with open(filename,"w",encoding="utf-8") as f:
    for example in tqdm(datasets, desc="saving books"):
        # `or` also covers fields that are present but None.
        book_title = example.get("book_title") or "untitled"

        # Do not save a book that is already part of PDNC.
        if _normalize_title(book_title) in pdnc_title_keys:
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(f"Skip (Already in PDNC): {book_title}")
            continue

        author = example.get("author") or ""
        context = example.get("context") or ""

        # Clean the raw text.
        context = Gutenberg_English_Preprocessor(context).preprocess()

        # Keep only books whose dialogue ratio reaches the PDNC mean.
        try:
            speech_rate = count_sentences_with_dialogue(context)
        except ZeroDivisionError:
            # No narrative sentences left after cleaning (e.g. empty text).
            continue
        if speech_rate<speech_rate_mean:
            continue

        record={
            "book_title":book_title,
            "author":author,
            "context":context
        }
        f.write(json.dumps(record,ensure_ascii=False)+"\n")
        tqdm.write(f"Save_success:  {book_title}")

        count +=1

        if count>=MAX_BOOKS:
            break


Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]

saving books: 15it [00:16,  1.47it/s]

Skip (Already in PDNC): Alice's Adventures in Wonderland


saving books: 83it [00:19, 25.42it/s]

Save_success:  North American Free Trade Agreement, 1992 Oct. 7 Tariff Phasing Descriptions


saving books: 93it [00:20, 17.52it/s]

Skip (Already in PDNC): Persuasion


saving books: 103it [00:21, 20.74it/s]

Skip (Already in PDNC): Northanger Abbey


saving books: 122it [00:21, 21.51it/s]

Skip (Already in PDNC): Mansfield Park


saving books: 131it [00:22, 21.69it/s]

Save_success:  A Little Princess
Being the whole story of Sara Crewe now told for the first time


saving books: 134it [00:22, 22.02it/s]

Save_success:  The Rise of Silas Lapham
Skip (Already in PDNC): Emma
Skip (Already in PDNC): Sense and Sensibility


saving books: 151it [00:23, 27.83it/s]

Save_success:  The Well at the World's End: A Tale


saving books: 157it [00:23, 19.49it/s]

Save_success:  The Project Gutenberg RST Manual


saving books: 189it [00:24, 20.52it/s]

Save_success:  Child Christopher and Goldilind the Fair


saving books: 245it [00:27, 28.65it/s]

Save_success:  The Flirt


saving books: 271it [00:28, 36.18it/s]

Save_success:  The Princess Aline


saving books: 284it [00:28, 40.10it/s]

Save_success:  Fables


saving books: 292it [00:28, 10.09it/s]

Save_success:  The Story of a Pioneer





In [16]:
import pandas as pd

# Location of the saved training data
file_path = "/content/Train_Data/train.jsonl"

# Parse the JSONL file: one JSON record per line
df = pd.read_json(path_or_buf=file_path, lines=True)

# Display the first five rows as a table
df.head(5)

Unnamed: 0,book_title,author,context
0,"North American Free Trade Agreement, 1992 Oct....",Canada,"</div></section><h6 id=""id00000"">DRAFT NAFTA T..."
1,A Little Princess\r\nBeing the whole story of ...,"Burnett, Frances Hodgson",A Little Princess\n\n\nby\n\nFrances Hodgson B...
2,The Rise of Silas Lapham,"Howells, William Dean",This eBook is for the use of anyone anywhere a...
3,The Well at the World's End: A Tale,"Morris, William",This eBook is for the use of anyone anywhere a...
4,The Project Gutenberg RST Manual,"Perathoner, Marcello","&gt; install-tl</span>\r\n</pre>\r\n<p class=""..."
