In [2]:
from collections import Counter
import os
import json
import pandas as pd

# Paths setup
base_path = 'data'
wsj_path = os.path.join(base_path, 'lm_data')
wer_path = os.path.join(base_path, 'wer_data')

wsj_train_path = os.path.join(wsj_path, 'treebank-sentences-train.txt')
wsj_dev_path = os.path.join(wsj_path, 'treebank-sentences-dev.txt')
wsj_test_path = os.path.join(wsj_path, 'treebank-sentences-test.txt')
wer_dev_path = os.path.join(wer_path, 'dev_sentences.json')
wer_test_path = os.path.join(wer_path, 'test_sentences.json')

# Load WSJ data
def load_wsj_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read().splitlines()
    return data

# Load WER data
def load_wer_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    sentences = []
    for item in data.values():
        sentences.extend(item['sentences'])
    return sentences

# Function to calculate statistics
def calculate_statistics(data):
    words = ' '.join(data).split()
    vocab = Counter(words)
    vocab_size = len(vocab)
    avg_sentence_length = sum(map(len, data)) / len(data)
    avg_word_length = sum(map(len, words)) / len(words) if words else 0
    return vocab_size, avg_sentence_length, avg_word_length, len(data), len(words)

# Load data
datasets = {
    "Train": load_wsj_data(wsj_train_path),
    "Dev": load_wsj_data(wsj_dev_path) + load_wer_data(wer_dev_path),
    "Test": load_wsj_data(wsj_test_path) + load_wer_data(wer_test_path)
}

# Calculate and store statistics for each dataset
stats = {}
for name, data in datasets.items():
    stats[name] = calculate_statistics(data)

# Convert statistics to DataFrame for display
df_stats = pd.DataFrame(stats, index=["Vocabulary Size", "Avg Sentence Length", "Avg Word Length", "Total Sentences", "Total Words"]).T
print(df_stats)


       Vocabulary Size  Avg Sentence Length  Avg Word Length  Total Sentences  \
Train          32215.0           122.694603         4.947067          36261.0   
Dev            11084.0           115.984051         4.892171           5204.0   
Test           10555.0           117.934505         4.946610           5176.0   

       Total Words  
Train     754202.0  
Dev       103321.0  
Test      103522.0  


In [4]:
import os
import json
from collections import Counter
import pandas as pd
import numpy as np

# Define paths
base_path = 'data/lm_data'
wsj_train_path = os.path.join(base_path, 'treebank-sentences-train.txt')
wsj_dev_path = os.path.join(base_path, 'treebank-sentences-dev.txt')
wsj_test_path = os.path.join(base_path, 'treebank-sentences-test.txt')

# Define special tokens
UNK_TOKEN = "<UNK>"

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().splitlines()
    # Lowercase and tokenize
    data = [line.lower().split() for line in lines]
    return data

# Function to build vocabulary and handle unknowns
def build_vocab(data):
    vocab_counter = Counter([token for sentence in data for token in sentence])
    vocab = {word: i for i, (word, _) in enumerate(vocab_counter.items(), start=1)}
    vocab[UNK_TOKEN] = 0  # Reserve index 0 for UNK
    return vocab, vocab_counter

# Function to replace unknown tokens
def replace_unknowns(data, vocab):
    return [[token if token in vocab else UNK_TOKEN for token in sentence] for sentence in data]

# Load and preprocess data
train_data = load_and_preprocess_data(wsj_train_path)
dev_data = load_and_preprocess_data(wsj_dev_path)
test_data = load_and_preprocess_data(wsj_test_path)

# Build vocab from train data only and get the word count
vocab, word_count = build_vocab(train_data)

# Replace unknowns in all datasets
train_data_processed = replace_unknowns(train_data, vocab)
dev_data_processed = replace_unknowns(dev_data, vocab)
test_data_processed = replace_unknowns(test_data, vocab)

# Calculate statistics
def calculate_statistics(data):
    total_words = sum(len(sentence) for sentence in data)
    total_sentences = len(data)
    avg_sentence_length = total_words / total_sentences
    word_lengths = [len(word) for sentence in data for word in sentence]
    avg_word_length = np.mean(word_lengths)
    return total_sentences, total_words, avg_sentence_length, avg_word_length

stats = {
    "Dataset": ["Train", "Dev", "Test"],
    "Total Sentences": [],
    "Total Words": [],
    "Average Sentence Length": [],
    "Average Word Length": [],
    "Vocabulary Size": len(vocab) - 1  # Excluding <UNK> token
}

for dataset in [train_data_processed, dev_data_processed, test_data_processed]:
    total_sentences, total_words, avg_sentence_length, avg_word_length = calculate_statistics(dataset)
    stats["Total Sentences"].append(total_sentences)
    stats["Total Words"].append(total_words)
    stats["Average Sentence Length"].append(avg_sentence_length)
    stats["Average Word Length"].append(avg_word_length)

# Convert stats to DataFrame for display
df_stats = pd.DataFrame(stats)
print(df_stats)


  Dataset  Total Sentences  Total Words  Average Sentence Length  \
0   Train            36261       754202                20.799261   
1     Dev             4529        93775                20.705454   
2    Test             4554        94230                20.691700   

   Average Word Length  Vocabulary Size  
0             4.947067            27774  
1             4.861829            27774  
2             4.875348            27774  
