## Sample the JSON Corpus

In [None]:
import json
import random
import re

# Location of the large line-delimited JSON corpus
file_path = 'lit-h.json'

# Fixed seed so the same indices are drawn on every run
random.seed(42)

# Number of records (lines) assumed to be in the corpus file
total_records = 17_000_000

# How many line indices to draw before the language filter is applied
sample_size = 500_000

# Patterns used by the language filter
hindi_pattern = re.compile(r'[\u0900-\u097F]')  # any character in the Devanagari block
url_pattern = re.compile(r'http\S+')  # "http" followed by a run of non-whitespace
hashtag_pattern = re.compile(r'#\w+')
mention_pattern = re.compile(r'@\w+')
emoji_pattern = re.compile(
    "|".join((
        "[\U0001F600-\U0001F64F]",  # emoticons
        "[\U0001F300-\U0001F5FF]",  # miscellaneous symbols and pictographs
        "[\U0001F680-\U0001F6FF]",  # transport and map symbols
        "[\U0001F1E0-\U0001F1FF]",  # flags
    )),
    flags=re.UNICODE,
)

# Step 1: Draw `sample_size` distinct line indices and sort them ascending,
# so the corpus can be scanned once from top to bottom
random_indices = random.sample(range(total_records), sample_size)
random_indices.sort()

# Step 2: Language filter applied to each sampled tweet
def contains_hindi(text):
    """Return True when `text` has Devanagari content that survives cleaning.

    Both conditions must hold for the tweet to be accepted:
      1. it contains at least one Devanagari character, and
      2. after URLs, hashtags, mentions and emojis are removed, at least one
         non-ASCII character is still left.
    """
    # No Devanagari at all: reject without doing the cleaning work
    if not hindi_pattern.search(text):
        return False

    # Remove URLs, hashtags, mentions and emojis, in that order
    stripped = text
    for noise in (url_pattern, hashtag_pattern, mention_pattern, emoji_pattern):
        stripped = noise.sub('', stripped)

    # Any leftover character outside ASCII means the text is not purely English
    return any(ord(ch) >= 128 for ch in stripped.strip())

# Step 3: Stream the corpus once and pull out the records at the sampled indices
sampled_records = []
random_index_pointer = 0  # Position of the next wanted index in the sorted `random_indices`
wanted_count = len(random_indices)

# Explicit UTF-8: the corpus contains Devanagari text, so relying on the
# platform's default encoding can raise decode errors or corrupt the tweets.
with open(file_path, 'r', encoding='utf-8') as f:
    for current_index, line in enumerate(f):
        # Stop once every sampled index has been visited (also covers an empty sample)
        if random_index_pointer >= wanted_count:
            break

        # Skip lines that were not sampled
        if current_index != random_indices[random_index_pointer]:
            continue

        # This line was sampled; advance to the next wanted index
        random_index_pointer += 1

        record = json.loads(line)
        tweet = record.get('tweet', '')

        # Keep only the tweets that pass the Hindi filter
        if contains_hindi(tweet):
            sampled_records.append(tweet)

print(f"Total sampled records: {len(sampled_records)}")

# Step 4: Write the kept tweets as JSON Lines (one JSON string per line), UTF-8 encoded.
# The extension must be '.json' (it was misspelled '.josn'), otherwise the cell that
# loads 'tweet_hindi_228k.json' cannot find the file.
with open('tweet_hindi_228k.json', 'w', encoding='utf-8') as outfile:
    for record in sampled_records:
        # ensure_ascii=False writes Devanagari characters as-is instead of \uXXXX escapes
        json.dump(record, outfile, ensure_ascii=False)
        outfile.write('\n')


## Load the Sampled JSON

In [None]:
import json

# Sample file to read back (one JSON value per line)
sample_file_path = 'tweet_hindi_228k.json'

# Collected records, in file order
loaded_sampled_records = []

# Parse each line as its own JSON value and append it to the list
with open(sample_file_path, 'r', encoding='utf-8') as infile:
    loaded_sampled_records.extend(json.loads(raw_line) for raw_line in infile)

# Report how many records were read back
print(f"Total loaded records: {len(loaded_sampled_records)}")


In [None]:
# Preview the first ten loaded records
loaded_sampled_records[:10]

## Nepberta Dataset

In [None]:
# Download the file with this id using the gdown command-line tool
# (presumably the clean_nepberta_data.zip archive that the next cell unzips — confirm)
!gdown 1t8TL86ozNi4YabpnfThjxRhbaYBg8F0P

In [None]:
# Extract the archive into the current working directory
!unzip clean_nepberta_data.zip

In [20]:
import pandas as pd

In [None]:
# Load the CSV into a DataFrame (presumably a file extracted from the archive — confirm)
df = pd.read_csv('clean_date_categories.csv')

In [22]:
# Rows whose category is exactly 'politics'; `politics` is not used again in the visible cells
politics = df[df['clean_categories'] == 'politics']

In [33]:
# Write a reproducible random sample of 1,000,000 rows (random_state=42) to CSV.
# The DataFrame index is written as well, because index=False is not passed.
# NOTE(review): the file name says "50lakh" (5,000,000) but n is 1,000,000 (10 lakh) — confirm which is intended.
df.sample(n=1000000, random_state=42).to_csv('50lakh samples.csv')

In [None]:
# (rows, columns) of the full DataFrame
df.shape

In [None]:
# Number of rows per value of `clean_categories`
df.clean_categories.value_counts()

In [None]:
# Text of 100 randomly chosen rows in the 'society' category
# (no random_state is passed, so the selection changes on every run)
df[df['clean_categories'] == 'society'].sample(n=100).text

In [1]:
!pip install huggingface_hub



In [2]:
# Provide the Hugging Face token at run time instead of hardcoding it in the notebook.
# `!export ...` runs in a throwaway subshell, so it never set the variable for this
# kernel anyway; setting os.environ does. Any token that was previously committed in
# this cell must be treated as leaked and revoked on the Hugging Face settings page.
import os
from getpass import getpass

# Reuse a token already present in the environment; otherwise prompt without echoing it
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [3]:
pip install huggingface_hub datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import load_dataset

# Load the sampled CSV as a Hugging Face dataset.
# A path relative to the working directory is used, matching how '50lakh samples.csv'
# is written by this notebook; an absolute /home/<user>/... path only works on one machine.
dataset = load_dataset("csv", data_files="50lakh samples.csv")

In [5]:
# Upload the dataset to the Hugging Face Hub as a public repository (private=False).
# NOTE(review): the data is loaded from '50lakh samples.csv' while the repo id says
# "tweet_data" — confirm this is the intended target repository.
dataset.push_to_hub("sumanpaudel1997/tweet_data_5gb", private=False)

Uploading the dataset shards:   0%|          | 0/11 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/91 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sumanpaudel1997/tweet_data_5gb/commit/42b101b0c4819ee7b7151f8a75ec97d08e9967e3', commit_message='Upload dataset', commit_description='', oid='42b101b0c4819ee7b7151f8a75ec97d08e9967e3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sumanpaudel1997/tweet_data_5gb', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sumanpaudel1997/tweet_data_5gb'), pr_revision=None, pr_num=None)

In [13]:
import pandas as pd

# Read the line-delimited JSON file into a DataFrame.
# NOTE(review): earlier cells use the name 'tweet_hindi_228k'; this cell reads
# 'tweet_hindi_288k.json' — confirm which file name is intended.
df = pd.read_json('tweet_hindi_288k.json', lines=True)
# Name the single column 'text' (this assignment requires exactly one column)
df.columns = ['text']

In [18]:
import emoji

def encode_emoji(text):
    """Return `text` after passing it through `emoji.demojize`."""
    return emoji.demojize(text)

# Apply the conversion to every value of the 'text' column
df['text'] = df['text'].apply(encode_emoji)

# Display the result (the original `df.` was an incomplete expression and a syntax error)
df

Unnamed: 0,text
0,Aisa hai to glt baat ho rhi hai सारी ईवीएम मशी...
1,"Sir, आजकल जब भी में ये देखता हूं ना जाने किउ ..."
2,जागों रे भारत जागों: *देशवासियों भीड़ जुटाकर ...
3,@myogiadityanath स्वामी जी 68500 2019 भर्ती म...
4,महाराष्ट्र मे अब 68% आरक्षण होने वाला अब परिव...
...,...
288170,Retweeted चाचा अभय (@Roflchacha): पेट्रोंल पं...
288171,इसीलिए कहा कि न्यूज चैनल देखना बंद करें और आमज...
288172,बिधूना बिजली विभाग के सरकारी कार्यलय मैं लाखों...
288173,पूर्व क्रिकेटर जयसूर्या ने लीक किया Ex-Wife के...


In [19]:
# Save the DataFrame as CSV without the index column
df.to_csv('tweet_hindi_288k.csv', index=False)


In [20]:
# Load the CSV of Hindi tweets and publish it to the Hugging Face Hub.
# A path relative to the working directory is used, matching how 'tweet_hindi_288k.csv'
# is written by this notebook; an absolute /home/<user>/... path only works on one machine.
dataset = load_dataset("csv", data_files="tweet_hindi_288k.csv")
# private=False makes the uploaded repository public
dataset.push_to_hub("sumanpaudel1997/hindi_tweet_sampled_dataset", private=False)


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/289 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sumanpaudel1997/hindi_tweet_sampled_dataset/commit/353d7809c62dbcb95b2c1c915b05a993b4d23194', commit_message='Upload dataset', commit_description='', oid='353d7809c62dbcb95b2c1c915b05a993b4d23194', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sumanpaudel1997/hindi_tweet_sampled_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sumanpaudel1997/hindi_tweet_sampled_dataset'), pr_revision=None, pr_num=None)