In [1]:
import pandas as pd

# Load the parquet file
df = pd.read_parquet('datasets/train-00000-of-00001.parquet')

# Display the first 10 rows
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 10 rows:")
display(df.head(10))

Dataset shape: (12338, 4)

Columns: ['id', 'url', 'title', 'text']

First 10 rows:


Unnamed: 0,id,url,title,text
0,1030,https://as.wikipedia.org/wiki/%E0%A6%AE%E0%A7%...,মুম্বাই,"মুম্বাই (পূৰ্বতে বম্বে বা বোম্বাই, . ) ভাৰতৰ অ..."
1,1525,https://as.wikipedia.org/wiki/%E0%A6%85%E0%A6%...,অসম,অসম () ভাৰতৰ উত্তৰ-পূবত অৱস্থিত সাংস্কৃতিক আৰু...
2,1650,https://as.wikipedia.org/wiki/%E0%A6%A4%E0%A7%...,তেজপুৰ,তেজপুৰ () শোণিতপুৰ জিলাৰ এখন প্ৰধান নগৰ আৰু সদ...
3,1653,https://as.wikipedia.org/wiki/%E0%A6%AA%E0%A7%...,প্ৰকৃত যীশু গীৰ্জা,প্ৰকৃত যীশু গীৰ্জা বা সত্য যীশু গীৰ্জা (The Tr...
4,1886,https://as.wikipedia.org/wiki/%E0%A6%B8%E0%A6%...,সাহিত্য,"লিখন-শিল্পক এক কথাত সাহিত্য () বোলা হয়। গদ্য,..."
5,1887,https://as.wikipedia.org/wiki/%E0%A6%AC%E0%A7%...,বেটুপাত,{| border=0
6,1996,https://as.wikipedia.org/wiki/%E0%A6%86%E0%A6%...,আফ্ৰিকা,আফ্ৰিকা () মাটিকালি আৰু জনসংখ্যাৰ দিশৰ পৰা বিশ...
7,1997,https://as.wikipedia.org/wiki/%E0%A6%86%E0%A6%...,আমেৰিকা (মহাদেশ),"আমেৰিকা মহাদেশ, উত্তৰ আমেৰিকা আৰু দক্ষিণ আমেৰি..."
8,1998,https://as.wikipedia.org/wiki/%E0%A6%8F%E0%A6%...,এণ্টাৰ্কটিকা,এণ্টাৰ্কটিকা পৃথিৱী এখন মহাদেশ। কুমেৰু অৰ্থাৎ ...
9,1999,https://as.wikipedia.org/wiki/%E0%A6%8F%E0%A6%...,এছিয়া,এছিয়া পৃথিৱীৰ বৃহত্তম তথা সৱাতোকৈ জনবহুল মহাদ...


In [2]:
import os, re

OUTPUT_DIR = 'datasets/training'
FILE1 = OUTPUT_DIR + '/train-01.txt'

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Function to clean text (should be defined before its use in cell below)
def clean_text(text):
    if not isinstance(text, str):
        return ""
        
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove control characters
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
    
    # Remove special characters (keep basic punctuation)
    text = re.sub(r'[^\w\s.,!?"\'-:;()\[\]{}]', '', text)
    
    return text.strip()

In [5]:
import numpy as np

sentence_lengths = []
max_len_words = 0
min_len_words = float('inf')
max_len_chars = 0
min_len_chars = float('inf')
total_sentences = 0

# Read and process the file line by line
for line in df['text']:
    line = line.strip()
    if line:
        total_sentences += 1

        # Calculate lengths
        word_count = len(line.split())
        char_count = len(line)

        # Store sentence lengths
        sentence_lengths.append(word_count)

        # Update max and min lengths
        max_len_words = max(max_len_words, word_count)
        min_len_words = min(min_len_words, word_count)
        max_len_chars = max(max_len_chars, char_count)
        min_len_chars = min(min_len_chars, char_count)

# Calculate statistics
average_length = np.mean(sentence_lengths)
std_dev = np.std(sentence_lengths)

# Print the results
print("\nDataset Analysis Results:")
print(f"Total Sentences: {total_sentences}")
print(f"Max Sentence Length: {max_len_words} words ({max_len_chars} characters)")
print(f"Min Sentence Length: {min_len_words} words ({min_len_chars} characters)")
print(f"Average Sentence Length: {average_length:.2f} words")
print(f"Standard Deviation: {std_dev:.2f} words")



Dataset Analysis Results:
Total Sentences: 12338
Max Sentence Length: 11664 words (80075 characters)
Min Sentence Length: 1 words (10 characters)
Average Sentence Length: 409.40 words
Standard Deviation: 524.77 words


In [5]:
# First, clean all texts
print("Cleaning texts...")
cleaned_texts = [clean_text(text) for text in df["text"]]

# Filter out empty or very short texts
cleaned_texts = [text for text in cleaned_texts if len(text) > 30]
print(f"After cleaning, {len(cleaned_texts)} texts remain (removed {len(df) - len(cleaned_texts)} short/empty texts)")

# Save all cleaned texts with a newline separator
with open(FILE1, 'w', encoding='utf-8') as f:
    for text in cleaned_texts:
        f.write(text + '\n')

print(f"Cleaned text data saved to {FILE1}")
print(f"File size: {os.path.getsize(FILE1) / (1024 * 1024):.2f} MB")

Cleaning texts...
After cleaning, 12320 texts remain (removed 18 short/empty texts)
Cleaned text data saved to datasets/training/train-01.txt
File size: 55.75 MB
