In [586]:
import pandas as pd 
import numpy as np 
import configparser
import os
import re

config = configparser.ConfigParser()
config.read("env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
base_path = config['DEFAULT']['base_path']
code_dir = config['DEFAULT']['code_dir']


os.chdir(code_dir)
import preprocess
os.chdir(base_path)

OHCO = ['book_id','chap_num','sec_num','para_num', 'sent_num', 'token_num']
k = 18
all_dir_list = [f for f in os.listdir(data_home) if f.endswith('.txt')]
text_file = f"{data_home}/{all_dir_list[k-1]}"

In [587]:
text_file

'D:\\MSDS\\Spring Term\\DS5001 - Text Mining\\Mahabharata_NLP/data/maha18.txt'

In [588]:
# Reading the TextFile Line by Line and saving as Dataframe
LINES = pd.DataFrame(open(text_file, 'r', 
                            encoding='utf-8-sig').readlines(), columns=['line_str'])
LINES.index.name = 'line_num'
LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

book_id_pat = LINES.line_str.str.match(r"BOOK [\d]")
title = LINES.iloc[LINES.loc[book_id_pat].index[0]+2,0]
title = re.sub("[-]"," ",title.title())
title


'Svargarohanika Parva'

In [589]:
### Clipping the Cruft

# Using RegEx to find the placeholders for Start & End of Text
clip_pats = [
    r"(?i)^om\b",
    r"(?i)(?=.*\bend\b)(?=.*\bparv\w*)"
]

# Getting the Pattern matches for both
pat_a = LINES.line_str.str.match(clip_pats[0])
pat_b = LINES.line_str.str.match(clip_pats[1])

# Getting the line number
line_a = LINES.loc[pat_a].index[0] - 2
line_b = LINES.loc[pat_b].index[-1] - 2
LINES = LINES.loc[line_a : line_b]
LINES


Unnamed: 0_level_0,line_str
line_num,Unnamed: 1_level_1
31,(Svargarohanika Parva)
32,
33,"Om! Having bowed down into Narayana, and to Na..."
34,"also to the goddess Sarasvati, should the word..."
35,
...,...
879,"preceptors bed, or even if he be a drinker of ..."
880,"other peoples wares, or even if he be born in ..."
881,Destroying all his sins like the maker of day ...
882,"man, without doubt, sports in felicity in the ..."


In [590]:
# Pattern to detect SECTION headers or numeric lines as chapter delimiters
sec_pat = r"^\s*(?:SECTION)+"
num_pat = r"^\s*\d+\s*$"
sec_lines = LINES.line_str.str.match(sec_pat, case=True) | LINES.line_str.str.match(num_pat, case=True)

# Assign sec_num starting from 1, adding 2 to each index for unique numbering
LINES.loc[sec_lines, 'sec_num'] = [int(i + 2) for i in range(LINES.loc[sec_lines].shape[0])]

# Forward fill sec_num to apply it to all lines within that chapter
LINES.sec_num = LINES.sec_num.ffill()

# Set lines before the first SECTION as Chapter 1
LINES.loc[:LINES.loc[sec_lines].index[0], "sec_num"] = 1

# Remove the SECTION headers and numeric-only lines from the content
LINES = LINES.loc[~sec_lines]

# Make sure sec_num is integer type
LINES.sec_num = LINES.sec_num.astype(int)

# Extract Parva name from lines like "(Sabhakriya Parva)"
# chap_pat = r"\(([^()]*?Parva)\)"
chap_pat = r"\s*([^()]*?\s+parva)\)$"
LINES['chap_name'] = LINES.line_str.str.extract(chap_pat, flags=re.IGNORECASE, expand=False)

# Forward-fill Parva name across all lines
LINES['chap_name'] = LINES['chap_name'].ffill()
# LINES['chap_num'] = (LINES['chap_name'].notna()).cumsum()

# Group by chapter and concatenate lines into a single string
CHAPS = LINES.groupby(['sec_num'])\
             .line_str.apply(lambda x: '\n'.join(x))\
             .to_frame('sec_str')

# Clean trailing/leading whitespace
CHAPS['sec_str'] = CHAPS['sec_str'].str.strip()

# Add chap_name to each chapter (first Parva seen in the chapter)
CHAPS['chap_name'] = LINES.groupby('sec_num')['chap_name'].first().values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LINES.sec_num = LINES.sec_num.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LINES['chap_name'] = LINES.line_str.str.extract(chap_pat, flags=re.IGNORECASE, expand=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LINES['chap_name'] = LINES['chap_name'].ffill()


In [591]:
LINES

Unnamed: 0_level_0,line_str,sec_num,chap_name
line_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,(Svargarohanika Parva),1,Svargarohanika Parva
32,,1,Svargarohanika Parva
33,"Om! Having bowed down into Narayana, and to Na...",1,Svargarohanika Parva
34,"also to the goddess Sarasvati, should the word...",1,Svargarohanika Parva
35,,1,Svargarohanika Parva
...,...,...,...
879,"preceptors bed, or even if he be a drinker of ...",6,Svargarohanika Parva
880,"other peoples wares, or even if he be born in ...",6,Svargarohanika Parva
881,Destroying all his sins like the maker of day ...,6,Svargarohanika Parva
882,"man, without doubt, sports in felicity in the ...",6,Svargarohanika Parva


In [592]:
CHAPS['chap_name'].unique()

array(['Svargarohanika Parva'], dtype=object)