# Script to preprocess both the available datasets
- train-01.txt  (high-quality, much longer sentences)
- train-02.txt  (moderate-quality, much shorter sentences)

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset, DatasetDict
import numpy as np
import os

In [None]:
# Upper bound, in tokens, for one sequence; flatten_and_split() chunks on this value.
MAX_SEQ_LENGTH = 1024
# Tokenizer used for every tokenization, decoding and length count in this notebook.
tokenizer = AutoTokenizer.from_pretrained("unsloth/tinyllama-bnb-4bit")
# Keep the tokenizer's own limit in step with MAX_SEQ_LENGTH.
tokenizer.model_max_length = MAX_SEQ_LENGTH

In [None]:
# Load the dataset
# Both raw training files are read with the "text" builder; the dict keys
# ("train1", "train2") are the split names used to index `dataset` later on.
data_files = {
    "train1": "datasets/training/train-01.txt",
    "train2": "datasets/training/train-02.txt",
}
dataset = load_dataset("text", data_files=data_files)

## Two Methods of splitting the dataset
1. Simple way: split each text into consecutive, non-overlapping chunks of at most max-seq-len tokens
2. Sliding window: split into overlapping chunks, so context is preserved across chunk boundaries

In [None]:
def flatten_and_split(examples):
    """Split every text in a batch into consecutive, non-overlapping token chunks.

    Each text is tokenized without truncation or padding, cut into runs of at
    most ``MAX_SEQ_LENGTH`` token ids, and each run is decoded back to a
    string with special tokens skipped.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings.

    Returns:
        A dict with a single "text" key holding the decoded chunks of all
        texts in the batch, in input order.
    """
    pieces = []
    for raw_text in examples["text"]:
        encoding = tokenizer(raw_text, truncation=False, padding=False, return_tensors="pt")
        token_ids = encoding["input_ids"][0].tolist()
        for start in range(0, len(token_ids), MAX_SEQ_LENGTH):
            window = token_ids[start:start + MAX_SEQ_LENGTH]
            pieces.append(tokenizer.decode(window, skip_special_tokens=True))
    return {"text": pieces}

In [None]:
def sliding_window_chunking_no_padding(examples, max_seq_length=1024, overlap=512):
    """
    Tokenizes texts and splits them into overlapping chunks without padding.

    Each text is tokenized without truncation, a window of ``max_seq_length``
    token ids is moved forward by ``max_seq_length - overlap`` ids at a time,
    and every window is decoded back to a string with special tokens skipped.
    The window stops as soon as it reaches the end of the text, so no chunk is
    emitted that only repeats tokens already contained in the previous chunk.

    Args:
        examples: Dictionary containing "text" field with batch of texts.
        max_seq_length: The maximum length, in tokens, of each chunk.
        overlap: Number of tokens shared by two consecutive chunks.

    Returns:
        A dictionary with a single "text" key holding the decoded chunks of
        all texts in the batch, in input order.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_seq_length``.
    """
    # A step <= 0 would either crash range()/loop forever or silently produce
    # no chunks at all; a negative overlap would skip tokens between chunks.
    if not 0 <= overlap < max_seq_length:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < max_seq_length, "
            f"got overlap={overlap}, max_seq_length={max_seq_length}"
        )

    step = max_seq_length - overlap
    flat_texts = []

    for text in examples["text"]:
        # Tokenize without truncation. Plain Python lists are enough here, so
        # no tensor round-trip (return_tensors="pt") is needed.
        input_ids = tokenizer(text, truncation=False, padding=False)["input_ids"]
        total = len(input_ids)

        start = 0
        while start < total:
            chunk = input_ids[start:start + max_seq_length]
            flat_texts.append(tokenizer.decode(chunk, skip_special_tokens=True))
            # Once the window covers the last token, any further window would
            # be a pure suffix of this one.
            if start + max_seq_length >= total:
                break
            start += step

    return {"text": flat_texts}

In [None]:
# Process each split
# The original "text" column is removed because the chunking function can return
# more rows than it receives (one per chunk), replacing "text" with the chunks.
print("Processing train1...")
processed_train1 = dataset["train1"].map(sliding_window_chunking_no_padding, batched=True, remove_columns=["text"])

In [None]:
# Same sliding-window chunking as for train1, applied to the second split.
print("Processing train2...")
processed_train2 = dataset["train2"].map(sliding_window_chunking_no_padding, batched=True, remove_columns=["text"])

In [None]:
# Bundle both chunked splits so later steps can map over them in one call.
dataset_processed = DatasetDict({
    "processed_train1": processed_train1,
    "processed_train2": processed_train2,
})

## Tokenize processed dataset and check stats

In [None]:
# Function to tokenize and get lengths
def get_sequence_lengths(examples):
    """Return the token count of every text in a batch.

    The texts are tokenized without truncation and the tokenizer is asked to
    report lengths directly.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings.

    Returns:
        A dict with a single "length" key holding one length per input text.
    """
    encoded = tokenizer(examples["text"], truncation=False, return_length=True)
    lengths = encoded["length"]
    return {"length": lengths}

# Apply the function to the dataset
# Every split gains a "length" column, which the statistics cell reads back.
length_dataset = dataset_processed.map(get_sequence_lengths, batched=True)

In [None]:
def _length_stats(lengths):
    """Return min, max, mean, median and the 90th/95th/99th percentiles of *lengths*."""
    return {
        "min": np.min(lengths),
        "max": np.max(lengths),
        "mean": np.mean(lengths),
        "median": np.median(lengths),
        "percentile_90": np.percentile(lengths, 90),
        "percentile_95": np.percentile(lengths, 95),
        "percentile_99": np.percentile(lengths, 99),
    }


# Extract lengths for each split. list() materializes the column so that the
# "+" concatenation below works even if column access hands back a lazy
# sequence object rather than a plain Python list.
train1_lengths = list(length_dataset["processed_train1"]["length"])
train2_lengths = list(length_dataset["processed_train2"]["length"])

# Combine lengths for overall analysis
all_lengths = train1_lengths + train2_lengths

# Print the number of samples per split
print("Train1 lengths:", len(train1_lengths))
print("Train2 lengths:", len(train2_lengths))
print("All lengths:", len(all_lengths))

# Arrays and stats are kept as separate names because the plotting cells
# further down read them directly.
train1_array = np.array(train1_lengths)
train2_array = np.array(train2_lengths)
all_array = np.array(all_lengths)

train1_stats = _length_stats(train1_array)
train2_stats = _length_stats(train2_array)
all_stats = _length_stats(all_array)

# Print results
for heading, stats in (
    ("train-01", train1_stats),
    ("train-02", train2_stats),
    ("all", all_stats),
):
    print(f"\nSequence Length Statistics of {heading}:")
    for key, value in stats.items():
        print(f"{key}: {value:.2f}")

### Some cool plots, yaay!

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm

# # Simulated sequence length arrays (replace these with your actual data)
# train1_lengths = np.random.normal(867, 150, 78495)  # Replace with actual lengths
# train2_lengths = np.random.normal(140, 50, 177647)  # Replace with actual lengths
# all_lengths = np.concatenate([train1_lengths, train2_lengths])

# --- Plot Settings ---
# 3x3 grid: the plotting loop below fills one row per dataset and one column
# per plot type (histogram with normal fit, boxplot, KDE).
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
fig.suptitle('Dataset Sequence Length Analysis', fontsize=20)

# --- Histogram + Normal Distribution ---
def plot_histogram_with_normal(ax, data, title):
    """Draw a density histogram of *data* on *ax* with a fitted normal curve on top.

    Args:
        ax: Matplotlib axes to draw on.
        data: Sequence of sequence lengths.
        title: Title to set on the axes.
    """
    sns.histplot(
        data,
        bins=50,
        kde=False,
        ax=ax,
        color='skyblue',
        stat='density',
        label='Actual Distribution',
    )

    # Fit a normal distribution to the data and evaluate it over the data's range.
    mu, std = norm.fit(data)
    grid = np.linspace(min(data), max(data), 100)
    fit_label = f'Normal Fit (μ={mu:.2f}, σ={std:.2f})'
    ax.plot(grid, norm.pdf(grid, mu, std), 'r-', label=fit_label)

    ax.set_title(title)
    ax.set_xlabel('Sequence Length')
    ax.set_ylabel('Density')
    ax.legend()

# --- Boxplot ---
def plot_boxplot(ax, data, title):
    """Draw a horizontal boxplot of *data* on *ax* and label it.

    Args:
        ax: Matplotlib axes to draw on.
        data: Sequence of sequence lengths.
        title: Title to set on the axes.
    """
    sns.boxplot(
        x=data,
        ax=ax,
        color='lightblue',
    )
    ax.set_xlabel('Sequence Length')
    ax.set_title(title)

# --- KDE Plot ---
def plot_kde(ax, data, title):
    """Draw a filled kernel-density estimate of *data* on *ax* and label it.

    Args:
        ax: Matplotlib axes to draw on.
        data: Sequence of sequence lengths.
        title: Title to set on the axes.
    """
    sns.kdeplot(
        data,
        fill=True,
        ax=ax,
        color='lightgreen',
    )
    ax.set_xlabel('Sequence Length')
    ax.set_title(title)

# --- Combined Plotting ---
# One (lengths, display name) pair per row of the figure grid.
datasets = [
    (train1_lengths, "Train-01"),
    (train2_lengths, "Train-02"),
    (all_lengths, "Combined")
]

for row_axes, (data, name) in zip(axes, datasets):
    hist_ax, box_ax, kde_ax = row_axes
    plot_histogram_with_normal(hist_ax, data, f"{name} - Histogram with Normal Fit")
    plot_boxplot(box_ax, data, f"{name} - Boxplot")
    plot_kde(kde_ax, data, f"{name} - KDE Plot")

# Leave a little room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm


# --- Normal Distributions Config ---
# Reference normal curves are centered on the maximum sequence length; the
# labels below are derived from this value so they cannot drift from it.
mean_center = MAX_SEQ_LENGTH
std_devs = [100, 300, 500]  # Different spreads
x = np.linspace(0, 1500, 1000)  # X-axis range

# --- Plotting ---
plt.figure(figsize=(14, 8))

# Actual Dataset Distribution
sns.kdeplot(train1_lengths, label='Train-01', fill=True, color='skyblue', alpha=0.6)
sns.kdeplot(train2_lengths, label='Train-02', fill=True, color='lightgreen', alpha=0.6)
sns.kdeplot(all_lengths, label='Combined', fill=True, color='coral', alpha=0.6)

# Multiple Normal Distributions
# The KDE curves above are densities (area 1), and so is norm.pdf. The PDFs
# are therefore plotted unscaled: multiplying them by a sample-count-based
# factor would put them on a different scale than the KDEs sharing this axis.
for std_dev in std_devs:
    normal_dist = norm.pdf(x, mean_center, std_dev)
    plt.plot(x, normal_dist, label=f'Normal Dist. (μ={mean_center}, σ={std_dev})', linewidth=2)

# Labels and Legends
plt.title(
    f'Dataset Distribution vs. Multiple Normal Distributions (MAX_SEQ_LENGTH = {mean_center})',
    fontsize=18,
)
plt.xlabel('Sequence Length', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.axvline(mean_center, color='black', linestyle='--', label=f'Max Seq Length ({mean_center})')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Defined here so the cell does not depend on `x` / `mean_center` left behind
# by another cell (`x` is rebound with a different range later in the notebook).
mean_center = MAX_SEQ_LENGTH
x = np.linspace(0, 1500, 1000)

plt.figure(figsize=(14, 8))

# Plotting your actual data
sns.kdeplot(train1_lengths, label='Train-01', fill=True, color='skyblue', alpha=0.6)
sns.kdeplot(train2_lengths, label='Train-02', fill=True, color='lightgreen', alpha=0.6)
sns.kdeplot(all_lengths, label='Combined', fill=True, color='coral', alpha=0.6)

# Multiple Normal Distributions
# Both the KDE curves and norm.pdf are densities (area 1), so the PDFs are
# drawn unscaled to stay comparable on the shared "Density" axis.
for std_dev in [100, 300, 500]:
    normal_dist = norm.pdf(x, mean_center, std_dev)
    plt.plot(x, normal_dist, label=f'Normal Dist. (μ={mean_center}, σ={std_dev})', linewidth=2)

# Labels and Legends
plt.title(
    f'Dataset Distribution vs. Multiple Normal Distributions (MAX_SEQ_LENGTH = {mean_center})',
    fontsize=18,
)
plt.xlabel('Sequence Length', fontsize=14)
plt.ylabel('Density', fontsize=14)
plt.axvline(mean_center, color='black', linestyle='--', label=f'Max Seq Length ({mean_center})')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Optional: Histogram for visualization (requires matplotlib)
import matplotlib.pyplot as plt

# One histogram per dataset, each with its own x-range.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (lengths, stats, panel title, bar color) for each panel, left to right.
panels = [
    (train1_array, train1_stats, "Train1 Sequence Lengths", 'blue'),
    (train2_array, train2_stats, "Train2 Sequence Lengths", 'green'),
    (all_array, all_stats, "All Array Sequence Lengths", 'orange'),
]

for ax, (lengths, stats, title, color) in zip(axes, panels):
    # The range always reaches at least 2048, or further if the data does.
    upper = max(2048, stats["max"])
    ax.hist(lengths, bins=50, range=(0, upper), color=color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel("Length (tokens)")
    ax.set_ylabel("Frequency")

# Adjust layout
plt.tight_layout()
plt.show()

## Save the Datasets

In [None]:
# processed_train1, processed_train2
# Save the processed datasets to text formats

OUTPUT_DIR = "datasets/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Materialize each text column once. list() also guarantees that the "+"
# concatenation below works even if column access hands back a lazy sequence
# object rather than a plain Python list.
train1_texts = list(processed_train1["text"])
train2_texts = list(processed_train2["text"])

# Combine the processed train splits
combined_train = train1_texts + train2_texts


# Save as plain text
def save_text(filename, texts):
    """Write *texts* to *filename* (UTF-8), each sample followed by a blank line.

    Args:
        filename: Path of the file to create or overwrite.
        texts: Iterable of strings, one per sample.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # Separate samples with double newline
        f.writelines(text + "\n\n" for text in texts)


# Saving individual sets
save_text(f"{OUTPUT_DIR}/processed_train1.txt", train1_texts)
save_text(f"{OUTPUT_DIR}/processed_train2.txt", train2_texts)
save_text(f"{OUTPUT_DIR}/combined_train.txt", combined_train)


### OOPS!! Forgot to create validation set

- Take the last 5% of each processed set and combine them to form the validation set; combine the rest to form the training set.

In [None]:
val_ratio = 0.05  # 5% of the data from each for validation


def _tail_split(texts, ratio):
    """Split *texts* into (head, tail), where tail is the last int(len * ratio) items.

    The cut position is computed as an explicit index. Slicing with
    ``[:-size]`` / ``[-size:]`` would return an empty head and the whole list
    as tail whenever ``size`` is 0.
    """
    texts = list(texts)
    cut = len(texts) - int(len(texts) * ratio)
    return texts[:cut], texts[cut:]


# Split without shuffling → Keep natural order
# NOTE(review): the chunks were produced with overlapping windows, so the first
# validation chunk of each set may share tokens with the last training chunk —
# confirm whether that is acceptable.
train1_train, train1_val = _tail_split(processed_train1["text"], val_ratio)  # 95% / 5%
val_size1 = len(train1_val)

train2_train, train2_val = _tail_split(processed_train2["text"], val_ratio)  # 95% / 5%
val_size2 = len(train2_val)

# Combine datasets (no shuffling, context preserved)
combined_train = train1_train + train2_train
combined_val = train1_val + train2_val


# Save the datasets as plain text
def save_text(filename, texts):
    """Saves list of texts into a plain text file, one blank line after each sample."""
    with open(filename, "w", encoding="utf-8") as f:
        for text in texts:
            f.write(text + "\n\n")  # Separate samples with double newline


# Save the final datasets
save_text(f"{OUTPUT_DIR}/train-F.txt", combined_train)
save_text(f"{OUTPUT_DIR}/val-F.txt", combined_val)

print(f"Train size: {len(combined_train)} samples")
print(f"Validation size: {len(combined_val)} samples")

In [None]:
# Check for duplicate lines
# save_text() writes a blank line after every sample. Those separator lines are
# skipped here; otherwise they would double the total count and show up as one
# heavily repeated "duplicate".
with open(f"{OUTPUT_DIR}/train-F.txt", "r", encoding="utf-8") as f:
    lines = [line for line in f if line.strip()]

# (number of non-blank lines, number of distinct non-blank lines)
len(lines), len(set(lines))

In [None]:
# Load the dataset
# NOTE: the split keys are "train1"/"train2", but they hold the final training
# and validation files respectively.
data_comnined = {
    "train1": "datasets/processed/train-F.txt",
    "train2": "datasets/processed/val-F.txt",
}
# The files were written with a blank line between samples. Reading them
# paragraph-wise splits on that separator; the default line-wise mode would
# turn every blank separator line into an extra empty sample and skew the
# length statistics computed from this dataset.
dataset_combined = load_dataset("text", data_files=data_comnined, sample_by="paragraph")

In [None]:
# Recompute token lengths on the files that were written to disk and reloaded.
length_dataset_combined = dataset_combined.map(get_sequence_lengths, batched=True)
# Extract lengths for each split
# ("train1" holds train-F.txt and "train2" holds val-F.txt, per the load cell.)
train_lengths = length_dataset_combined["train1"]["length"]
val_lengths = length_dataset_combined["train2"]["length"]

In [None]:
# Number of reloaded training and validation samples (shown as the cell output).
len(train_lengths), len(val_lengths)

#### More cool vis

In [None]:
# Normal distribution parameters estimated from the train / validation lengths
train_mu, train_sigma = np.mean(train_lengths), np.std(train_lengths)
val_mu, val_sigma = np.mean(val_lengths), np.std(val_lengths)

# One panel per dataset, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# X values for the normal distribution
x = np.linspace(0, 1024, 1000)

# (display name, mean, standard deviation, line color) for each panel
curves = (
    ("Train", train_mu, train_sigma, "skyblue"),
    ("Validation", val_mu, val_sigma, "orange"),
)

for ax, (name, mu, sigma, color) in zip(axes, curves):
    ax.plot(x, norm.pdf(x, mu, sigma), color=color, label=f"{name} (μ={mu:.2f}, σ={sigma:.2f})")
    # Mark the maximum sequence length for reference
    ax.axvline(1024, color='red', linestyle='--', label='Max Seq Length = 1024')
    ax.set_title(f"Normal Distribution: {name} Dataset")
    ax.set_xlabel("Sequence Length (tokens)")
    ax.set_ylabel("Probability Density")
    ax.legend()

# Display the plots
plt.tight_layout()
plt.show()