## Sample the JSON Corpus

In [7]:
import json
import random
import re

# Location of the full corpus (one JSON record per line)
file_path = 'lit-h.json'

# Seed the RNG so every run draws the same line indices
random.seed(42)

# Number of records (lines) the corpus is taken to contain
total_records = 17_000_000

# Number of line indices to draw before any content filtering
sample_size = 500_000

# Patterns for detecting Devanagari and for stripping URLs, hashtags, mentions and emojis
hindi_pattern = re.compile(r'[\u0900-\u097F]')  # any character in the Devanagari block
url_pattern = re.compile(r'http\S+')
hashtag_pattern = re.compile(r'#\w+')
mention_pattern = re.compile(r'@\w+')
emoji_pattern = re.compile(
    "|".join((
        "[\U0001F600-\U0001F64F]",  # Emoticons
        "[\U0001F300-\U0001F5FF]",  # Miscellaneous symbols and pictographs
        "[\U0001F680-\U0001F6FF]",  # Transport and map symbols
        "[\U0001F1E0-\U0001F1FF]",  # Flags (iOS)
    )),
    flags=re.UNICODE,
)

# Step 1: Draw `sample_size` distinct line indices and put them in ascending order
random_indices = random.sample(range(total_records), sample_size)
random_indices.sort()

# Step 2: Function to filter based on Hindi/English content
def contains_hindi(text):
    """Decide whether a tweet should be kept as Hindi content.

    Returns True only when both conditions hold:
      * `text` contains at least one Devanagari character, and
      * after URLs, hashtags, mentions and emojis are removed and surrounding
        whitespace is trimmed, at least one non-ASCII character is still left.

    A tweet whose only Devanagari characters sit inside removed parts (for
    example a hashtag) is kept only if some other non-ASCII character remains.
    """
    # No Devanagari anywhere: reject without cleaning
    if hindi_pattern.search(text) is None:
        return False

    # Strip the noise patterns one after another, in a fixed order
    remainder = text
    for noise in (url_pattern, hashtag_pattern, mention_pattern, emoji_pattern):
        remainder = noise.sub('', remainder)

    # Keep the tweet unless what is left is plain ASCII
    return any(ord(ch) >= 128 for ch in remainder.strip())

# Step 3: Stream the corpus once and keep the filtered tweets found at the sampled line indices.
# `random_indices` is sorted, so a single pointer into it is enough to know which line is wanted next.
# NOTE: `total_records` is assumed to match the file's line count; indices beyond the real
# end of the file are simply never reached.
sampled_records = []
current_index = 0  # 0-based number of the line being read
random_index_pointer = 0  # Position in `random_indices` of the next wanted line

# Decode explicitly as UTF-8 (the conventional JSON encoding); the platform default
# encoding can fail on, or garble, Devanagari text.
with open(file_path, 'r', encoding='utf-8') as f:
    for current_index, line in enumerate(f):
        # Checked before indexing so an empty sample cannot raise IndexError,
        # and so reading stops once every sampled index has been visited
        if random_index_pointer >= len(random_indices):
            break

        # Not a sampled line: skip it without parsing
        if current_index != random_indices[random_index_pointer]:
            continue

        record = json.loads(line)
        # A missing or null 'tweet' field is treated as empty text, which the filter rejects
        tweet = record.get('tweet') or ''

        # Keep only tweets that pass the Hindi-content filter
        if contains_hindi(tweet):
            sampled_records.append(tweet)

        # Move on to the next sampled index
        random_index_pointer += 1

print(f"Total sampled records: {len(sampled_records)}")

# Step 4: Save the kept tweets, one JSON string per line, without escaping non-ASCII characters.
# The file name must match the path the loading cell reads ('tweet_hindi_228k.json').
with open('tweet_hindi_228k.json', 'w', encoding='utf-8') as outfile:
    for record in sampled_records:
        json.dump(record, outfile, ensure_ascii=False)
        outfile.write('\n')


Total sampled records: 288175


## Load the Sampled JSON

In [10]:
import json

# Line-delimited JSON file holding the filtered tweet sample
sample_file_path = 'tweet_hindi_228k.json'

# Decode each line of the file as its own JSON document and collect the results
with open(sample_file_path, 'r', encoding='utf-8') as infile:
    loaded_sampled_records = [json.loads(line) for line in infile]

# Report how many records were read back
print(f"Total loaded records: {len(loaded_sampled_records)}")


Total loaded records: 288175


In [11]:
# Preview the first ten loaded records as the cell's displayed output
loaded_sampled_records[:10]

['Aisa hai to glt baat ho rhi hai ‡§∏‡§æ‡§∞‡•Ä ‡§à‡§µ‡•Ä‡§è‡§Æ ‡§Æ‡§∂‡•Ä‡§®‡•á ‡§è‡§Ç‡§°‡•ç‡§∞‡•â‡§á‡§° ‡§´‡•ã‡§® ‡§∏‡•á ‡§ï‡§®‡•á‡§ï‡•ç‡§ü ‡§π‡•ã ‡§∞‡§π‡•Ä ‡§π‡•à, ‡§á‡§∏‡§ï‡§æ ‡§Æ‡§§‡§≤‡§¨ ‡§¨‡•Ä‡§ú‡•á‡§™‡•Ä ‡§ú‡•Ä‡§§ ‡§ö‡•Å‡§ï‡•Ä ‡§π‡•à !  http://dhunt.in/57FS0?s=a&ss=twt\xa0‚Ä¶ via Dailyhunt',
 'Sir,  ‡§Ü‡§ú‡§ï‡§≤ ‡§ú‡§¨ ‡§≠‡•Ä ‡§Æ‡•á‡§Ç ‡§Ø‡•á ‡§¶‡•á‡§ñ‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§®‡§æ ‡§ú‡§æ‡§®‡•á ‡§ï‡§ø‡§â ‡§Ü‡§™ ‡§Ø‡§æ‡§¶ ‡§Ü‡§§‡•á ‡§π‡•à, ‡§¨‡§æ‡§§‡§æ ‡§∏‡§ï‡§§‡•á ‡§π‡•à ‡§ï‡§ø‡§â??ü§îü§î pic.twitter.com/QBRZANmr4g',
 '‡§ú‡§æ‡§ó‡•ã‡§Ç ‡§∞‡•á ‡§≠‡§æ‡§∞‡§§ ‡§ú‡§æ‡§ó‡•ã‡§Ç:  *‡§¶‡•á‡§∂‡§µ‡§æ‡§∏‡§ø‡§Ø‡•ã‡§Ç ‡§≠‡•Ä‡§°‡§º ‡§ú‡•Å‡§ü‡§æ‡§ï‡§∞ ‡§°‡§∞‡§æ‡§®‡•á ‡§ï‡§æ ‡§¨‡§π‡§æ‡§®‡§æ ‡§¢‡•Ç‡§Ç‡§¢ ‡§≤‡§ø‡§Ø‡§æ ‡§Æ‡•Å‡§¶‡•ç‡§¶‡•ã‡§Ç ‡§∏‡•á ‡§ß‡•ç‡§Ø‡§æ‡§® ‡§π‡§ü‡§æ‡§®‡•á ‡§ï‡§æ ‡§¨‡§π‡§æ‡§®‡§æ ‡§¢‡•Ç‡§Ç‡§¢ ‡§≤‡§ø‡§Ø‡§æ ‡§ï‡§≤ ‡§§‡§ï ‡§ú‡•ã ‡§§‡§∞‡§ï‡•ç‡§ï‡•Ä ‡§ï‡•Ä ‡§¨‡§æ‡§§ ‡§ï‡§∞‡§§‡•á ‡§•‡•á ‡§ö‡•Å‡§®‡§æ‡§µ ‡§Ü‡§§‡•á ‡§π‡•Ä ‡§Æ‡§®‡•ç‡§¶‡§ø‡§∞ ‡§ï‡§æ ‡§¨‡§π‡§æ‡§®‡§æ ‡§¢‡•Ç‡§Ç‡§¢ ‡§≤‡§ø‡§Ø‡§æ ‡§π