In [None]:
import re
from datasets import load_dataset
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Abbreviations (English and Gujarati) whose trailing period must not be
# treated as the end of a sentence.
ABBREVIATIONS = {
    "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.", "St.", "vs.", "etc.", "e.g.", "i.e.", "U.S.", "U.K.", "Ph.D.", "B.Sc.", "M.Sc.",
    'ડો.', 'ડૉ.', 'પ્રો.', 'શ્રી.', 'સુશ્રી.', 'શ્રીમતી.', 'તા.', 'વગેરે.', 'ઇ.સ.', 'એલ.ટી.', 'વિ.સ.', 'મા.', 'સા.', 'પ્રા.', 'મુ.', 'ના.',
    "શ્રીએ.", "મી.", "પ્રોફ.", "એમ.એ.", "બી.એ.", "એમ.બી.બી.એસ.", "પી.એચ.ડી.", "પી.એમ.", "એ.કે.", "એમ.કે."
}

# A terminator (., ?, ! or the danda) that is NOT directly followed by a digit,
# another period, a Latin letter or a Gujarati character. The capture group
# makes re.split keep the terminator as its own list element.
_SENTENCE_END_RE = re.compile(r'([\.\?\!\।](?!\d|\.|[a-zA-Z]+|[\u0A80-\u0AFF]))')

# Opening punctuation that may be attached to the front of an abbreviation,
# e.g. '(Dr.' or '"Mr.'.
_LEADING_PUNCT = '"\'([{“‘'


def _ends_with_abbreviation(chunk):
    """Return True if the last whitespace-delimited token of ``chunk`` is a known abbreviation."""
    tokens = chunk.split()
    if not tokens:
        return False
    # Compare the whole token rather than a suffix: a plain ``endswith`` check
    # would also fire for ordinary words that merely end in e.g. "તા.".
    return tokens[-1].lstrip(_LEADING_PUNCT) in ABBREVIATIONS


def gujarati_sentence_tokenizer(text):
    """Split ``text`` into sentences, keeping abbreviations attached to what follows.

    The text is split after each sentence terminator (``.``, ``?``, ``!`` or
    the danda) that is not immediately followed by a digit, a period, a Latin
    letter or a Gujarati character. A chunk whose final token is listed in
    ``ABBREVIATIONS`` is joined (with a single space) to the following
    chunk(s) until the result no longer ends in an abbreviation.

    Args:
        text: The string to split.

    Yields:
        str: One sentence at a time, stripped of leading whitespace. Text that
        follows the last terminator is yielded as a final sentence without a
        terminator instead of being discarded.
    """
    # With one capture group the result alternates text, terminator, text, ...
    # and always ends with the (possibly empty) text after the last terminator.
    parts = _SENTENCE_END_RE.split(text)
    chunks = [parts[i].strip() + parts[i + 1] for i in range(0, len(parts) - 1, 2)]

    # Keep trailing text that has no terminator of its own.
    trailing = parts[-1].strip()
    if trailing:
        chunks.append(trailing)

    i = 0
    while i < len(chunks):
        current = chunks[i]
        # Keep absorbing following chunks while the sentence still ends in an
        # abbreviation (handles chains such as "Dr. Mr. ...").
        while _ends_with_abbreviation(current) and i + 1 < len(chunks):
            i += 1
            current += " " + chunks[i]
        yield current
        i += 1

In [5]:
# Token alternatives, tried left to right at each position, so the more
# specific patterns (URL, e-mail, date, number) win over plain words.
_URL_PATTERN = r'https?://\S+|www\.\S+'
_EMAIL_PATTERN = r'\b[\w\.-]+@[\w\.-]+\.\w+\b'
_DATE_PATTERN = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4}\b'
_NUMBER_PATTERN = r'\b\d+(?:[\.,]\d+)?\b'

# Raw f-string: regex escapes such as \w and \s are not valid *string*
# escapes, so a non-raw literal triggers invalid-escape warnings. Compiled
# once at import time instead of on every call.
_WORD_TOKEN_RE = re.compile(
    rf'{_URL_PATTERN}|{_EMAIL_PATTERN}|{_DATE_PATTERN}|{_NUMBER_PATTERN}|[a-zA-Z]+|[\u0A80-\u0AFF]+|[^\w\s]',
    re.UNICODE
)


def gujarati_word_tokenizer(sentence):
    """Split ``sentence`` into word, number, date, URL, e-mail and punctuation tokens.

    Whitespace is stripped at both ends and internal runs are collapsed to one
    space before matching. Each token is one of: a URL, an e-mail address, a
    date, a number (optionally with one ``.``/``,`` group), a run of ASCII
    letters, a run of characters in the Gujarati block (U+0A80-U+0AFF), or a
    single non-word, non-space character.

    Args:
        sentence: The string to tokenize.

    Returns:
        list[str]: Tokens in order of appearance. Text that matches none of
        the alternatives (for example an underscore outside a URL or e-mail)
        produces no token.
    """
    sentence = re.sub(r'\s+', ' ', sentence.strip())
    return _WORD_TOKEN_RE.findall(sentence)


In [6]:
# Stream the Gujarati split of IndicCorpV2 and persist its "text" field to a
# Parquet file in fixed-size batches, so the corpus never has to fit in memory.
dataset = load_dataset(
    "ai4bharat/IndicCorpV2",
    split="guj_Gujr",
    streaming=True
)

parquet_file = "gujarati_rawtext.parquet"
batch_size = 100000
max_batches = 101  # same cap as before: stop once 101 full batches are written
raw_schema = pa.schema([("text", pa.string())])
buffer = []
cnt = 0  # number of full batches written so far

# The context manager closes the writer (and writes the Parquet footer) even
# when the stream raises mid-download, e.g. on a network error, so the rows
# fetched so far stay readable instead of leaving a truncated file.
with pq.ParquetWriter(parquet_file, raw_schema, compression="snappy") as writer:
    for example in dataset:
        if cnt >= max_batches:
            break
        if 'text' in example:
            buffer.append(example["text"])

        if len(buffer) >= batch_size:
            writer.write_table(pa.Table.from_pydict({"text": buffer}, schema=raw_schema))
            buffer.clear()
            cnt += 1

    # Flush the final, partially filled batch.
    if buffer:
        writer.write_table(pa.Table.from_pydict({"text": buffer}, schema=raw_schema))
        buffer.clear()

'HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.' thrown while requesting GET https://huggingface.co/datasets/ai4bharat/IndicCorpV2/resolve/2d7285e6ce14fdb3fb2449c9f89427b9f582ac3f/data/gu.txt
Retrying in 1s [Retry 1/5].


ChunkedEncodingError: ('Connection broken: IncompleteRead(1538889 bytes read, 25850803 more expected)', IncompleteRead(1538889 bytes read, 25850803 more expected))

In [None]:
# Split every raw document into sentences and write them, one sentence per
# row, to a Parquet file in fixed-size batches.
df_raw = pd.read_parquet("gujarati_rawtext.parquet", engine="pyarrow")
parquet_file = "gujarati_sentence_tokenized.parquet"
batch_size = 100000
sentence_schema = pa.schema([("sentence", pa.string())])
buffer = []

# The context manager guarantees the writer is closed (footer written) even if
# tokenization raises part-way through.
with pq.ParquetWriter(parquet_file, sentence_schema, compression="snappy") as writer:
    # Iterate the column directly; iterrows() builds a Series per row and is
    # far slower for a single-column scan.
    for text in df_raw["text"]:
        if not isinstance(text, str):
            # Null/missing rows cannot be tokenized; skip them.
            continue
        for sent in gujarati_sentence_tokenizer(text):
            buffer.append(sent)
            if len(buffer) >= batch_size:
                writer.write_table(
                    pa.Table.from_pydict({"sentence": buffer}, schema=sentence_schema)
                )
                buffer.clear()

    # Flush the final, partially filled batch.
    if buffer:
        writer.write_table(
            pa.Table.from_pydict({"sentence": buffer}, schema=sentence_schema)
        )
        buffer.clear()

In [None]:
# Tokenize every raw document into words and write them, one token per row,
# to a Parquet file in fixed-size batches.
df_raw = pd.read_parquet("gujarati_rawtext.parquet", engine="pyarrow")
parquet_file = "gujarati_word_tokenized.parquet"
batch_size = 100000
# NOTE: the column keeps its original name "sentence" although each row holds
# a single token, because the statistics cell reads df_word['sentence'].
token_schema = pa.schema([("sentence", pa.string())])
buffer = []

# The context manager guarantees the writer is closed (footer written) even if
# tokenization raises part-way through.
with pq.ParquetWriter(parquet_file, token_schema, compression="snappy") as writer:
    # Iterate the column directly; iterrows() builds a Series per row and is
    # far slower for a single-column scan.
    for text in df_raw["text"]:
        if not isinstance(text, str):
            # Null/missing rows cannot be tokenized; skip them.
            continue
        for token in gujarati_word_tokenizer(text):
            buffer.append(token)
            if len(buffer) >= batch_size:
                writer.write_table(
                    pa.Table.from_pydict({"sentence": buffer}, schema=token_schema)
                )
                buffer.clear()

    # Flush the final, partially filled batch.
    if buffer:
        writer.write_table(
            pa.Table.from_pydict({"sentence": buffer}, schema=token_schema)
        )
        buffer.clear()

In [None]:
# Corpus-level statistics computed from the sentence- and word-level files.
df_sentence = pd.read_parquet("gujarati_sentence_tokenized.parquet", engine="pyarrow")
df_word = pd.read_parquet("gujarati_word_tokenized.parquet", engine="pyarrow")

# The word-level file stores one token per row in a column named "sentence".
tokens = df_word['sentence']

total_sentences = len(df_sentence)
total_words = len(df_word)
# Vectorised replacements for the per-row Python loops; int() keeps the
# printed values plain Python integers rather than numpy scalars.
total_chars = int(tokens.str.len().sum())
unique_tokens = int(tokens.nunique())

# Guard the ratios so an empty table reports 0.0 instead of raising
# ZeroDivisionError.
avg_sentence_len = total_words / total_sentences if total_sentences else 0.0
avg_word_len = total_chars / total_words if total_words else 0.0
ttr = unique_tokens / total_words if total_words else 0.0

stats = {
    "Total Sentences": total_sentences,
    "Total Words": total_words,
    "Total Characters": total_chars,
    "Average Sentence Length": avg_sentence_len,
    "Average Word Length": avg_word_len,
    "Type/Token Ratio": ttr
}
print(stats)

NameError: name 'pd' is not defined

In [None]:
# Preview the sentence-level table. `df_sentence` is created by the statistics
# cell above, so that cell must have run in the current kernel first.
df_sentence

NameError: name 'df_sentence' is not defined