In [5]:
import pandas as pd
import json

In [6]:
def load_jsonl_files(file_path: str) -> list:
    """
    Load a single JSONL (JSON Lines) file into a list of parsed records.

    Each non-blank line of the file is parsed independently with ``json.loads``.
    Lines that are not valid JSON are reported on stdout and skipped, so one
    corrupt record does not abort the whole load.

    Args:
        file_path: path to the JSONL file to read (decoded as UTF-8)

    Returns:
        A list of the parsed JSON values, in file order. Blank lines and
        lines that fail to parse are omitted.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    data = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank / whitespace-only lines (typically a trailing newline at
            # EOF) carry no record; skip them instead of reporting a bogus
            # JSONDecodeError for each one.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best-effort load: report the bad line and keep going.
                print(f"Error on line: {line}")
                print(f"Error details: {e}")
    return data

In [10]:
# Parse the comments JSONL file with load_jsonl_files; the result is a list
# with one parsed JSON object per successfully decoded line.
deepseek_cmts  = load_jsonl_files('reddit_text_data/r_deepseek_comments.jsonl')
# Show the second parsed record (index 1) as the cell output.
deepseek_cmts[1]

{'_meta': {'retrieved_2nd_on': 1735435077},
 'all_awardings': [],
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'associated_award': None,
 'author': 'No_Seaworthiness9278',
 'author_flair_background_color': None,
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_template_id': None,
 'author_flair_text': None,
 'author_flair_text_color': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_5mzxik23',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'body': 'Same',
 'can_gild': False,
 'can_mod_post': False,
 'collapsed': False,
 'collapsed_because_crowd_control': None,
 'collapsed_reason': None,
 'collapsed_reason_code': None,
 'comment_type': None,
 'controversiality': 0,
 'created': 1735305466,
 'created_utc': 1735305466,
 'distinguished': None,
 'downs': 0,
 'edited': False,
 'gilded': 0,
 'gildings': {},
 'id': 'm410gks',
 'is_

In [11]:
# Read the file line by line through the shared loader rather than repeating
# its loop here. The loader always decodes the file as UTF-8, whereas a bare
# open(path, 'r') falls back to the platform's default encoding.
deepseek_cmts = load_jsonl_files('reddit_text_data/r_deepseek_comments.jsonl')

# Now you can access the data: print the first record as a sanity check,
# guarding against an empty load.
if deepseek_cmts:
    print(deepseek_cmts[0])

{'_meta': {'removal_type': 'removed', 'retrieved_2nd_on': 1732288706, 'was_deleted_later': True}, 'all_awardings': [], 'approved_at_utc': None, 'approved_by': None, 'archived': False, 'associated_award': None, 'author': 'Wooden_Flamingo_4759', 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_1dclpy0fv6', 'author_is_blocked': False, 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'banned_at_utc': None, 'banned_by': None, 'body': "at least you know it won't be passed to Israel?", 'can_gild': False, 'can_mod_post': False, 'collapsed': False, 'collapsed_because_crowd_control': None, 'collapsed_reason': None, 'collapsed_reason_code': None, 'comment_type': None, 'controversiality': 0, 'created': 1732159096, 'created_utc': 1732159096, 'distinguished': None, 'downs': 0, 'edite

In [27]:
# Number of records currently held in deepseek_cmts.
len(deepseek_cmts)

32517