In [1]:
import json
import re
from collections import Counter

def extract_text(data):
    """
    Flatten every string found in decoded JSON into one space-separated string.

    Strings are returned unchanged. For a dict only the values are visited
    (keys are ignored); for a list every item is visited. The pieces are
    joined with a single space. Any other value (numbers, booleans, None)
    contributes an empty string, so it still occupies a slot in the join
    and can leave consecutive spaces in the result.
    """
    # Leaf case: text is passed through untouched.
    if isinstance(data, str):
        return data

    # Pick the children to descend into; anything that is not a container
    # carries no text.
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return ""

    return " ".join(map(extract_text, children))

def count_words_in_json(file_path):
    """
    Count how often each word occurs in the text of a JSON file.

    The file at ``file_path`` is read as UTF-8 and parsed with ``json.load``.
    All of its text is gathered with ``extract_text`` and lowercased, then
    every run of ``\\w`` characters is treated as one word. Digits and
    underscores therefore count as word characters, while apostrophes and
    other punctuation split tokens.

    Returns a ``Counter`` mapping each lowercased word to its number of
    occurrences.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)

    # Lowercase once up front so the tally is case-insensitive.
    lowered = extract_text(parsed).lower()

    # Feed the matches into the counter in document order.
    tally = Counter()
    tally.update(match.group(0) for match in re.finditer(r'\w+', lowered))
    return tally

# Script entry point: tally the words in one hard-coded JSON file and print
# every word with its count, one "word: count" pair per line.
if __name__ == '__main__':
    file_path = 'train_as_test.json'  # The JSON file to be processed
    word_counts = count_words_in_json(file_path)
    
    # most_common() with no argument returns every (word, count) pair,
    # ordered from the highest count to the lowest.
    ranked_words = word_counts.most_common()
    
    # Emit the full ranking to stdout in the form "word: count".
    for word, count in ranked_words:
        print(f'{word}: {count}')


the: 21135
of: 9421
a: 6609
to: 6493
that: 3451
in: 3183
is: 2851
if: 2721
or: 2304
not: 2299
for: 2130
article: 2029
and: 1952
may: 1884
person: 1772
by: 1409
an: 1404
with: 1318
obligation: 1262
be: 1202
has: 1189
1: 1139
party: 1132
obligor: 1018
2: 1008
act: 969
claim: 966
contract: 942
on: 884
as: 856
other: 850
paragraph: 818
from: 813
preceding: 795
s: 766
performance: 763
obligee: 760
provisions: 748
principal: 748
b: 737
time: 719
thing: 718
right: 704
y: 686
does: 671
intention: 660
provided: 642
this: 640
apply: 630
any: 625
against: 621
it: 595
n: 594
third: 587
however: 538
subject: 534
demand: 500
same: 497
3: 493
due: 493
at: 491
respect: 482
period: 462
cases: 457
referred: 449
case: 443
manifestation: 419
land: 418
when: 415
which: 410
i: 407
damage: 404
have: 400
perform: 384
must: 374
under: 373
such: 372
without: 366
who: 356
matter: 351
after: 347
mortgage: 342
even: 325
are: 313
statutory: 310
property: 310
within: 307
where: 304
rights: 301
x: 299
made: 298
shall