In [2]:
import json
import csv
import re
import datetime

keys = ['created', 'selftext']
content_list = []
data_dict ={}

# Read the JSONL file and parse each line
with open('r_healthinsurance_posts.jsonl', 'r') as jsonl_file:
    for line in jsonl_file:
        parsed_line = json.loads(line)
        content_list.append(parsed_line)

for index, content in enumerate(content_list):
        data_dict[f'Post {index}'] = content['created'], content['selftext']
    
cleaned_dict = {}
for index, content in data_dict.items():
    date, text = content
    converted_date = datetime.datetime.utcfromtimestamp(date)
    cleaned_text = re.sub(r'[0-9]', ' ', text)
    cleaned_text = re.sub(r'[^\w\s]', ' ', cleaned_text)
    cleaned_text = cleaned_text.replace('\n', ' ').strip().split()
    #cleaned_text = re.sub(r'\W', '', text)
    cleaned_dict[converted_date] = cleaned_text

with open('dummy.csv','w',newline='') as f:
    output = csv.writer(f, delimiter=':')
    for line, content in cleaned_dict.items():
        output.writerow([line, content])


  converted_date = datetime.datetime.utcfromtimestamp(date)


In [72]:
def extract_data(jsonfilename, fields):
    ''' 
    Given a json file, produces a dictionary of reddit post data (post date, text)
    Input:
        jsonfilename: name of file containing data to be read in 
        keys: list of attribute to be extracted from json file
    Output:
        dictionary
    '''
    content_list = []
    data_dict = {}

    with open(jsonfilename, 'r') as jsonl_file:
        for line in jsonl_file:
            parsed_line = json.loads(line)
            content_list.append(parsed_line)

    for index, content in enumerate(content_list):
        data_dict[f'Post {index}'] = tuple(content[field] for field in fields if field in content)

    return data_dict


In [73]:
dict = extract_data('r_healthinsurance_posts.jsonl', ['created', 'selftext'])

In [74]:
def clean_data(data_dict):
    ''' 
    Given a dictionary mapping index numbers to unix date 
    and text information, produces a new dictionary mapping 
    translated dates to tokenized strings
    Input:
        data_dict: a dictionary
    Output:
        cleaned_dict: a dictionary
    '''
    cleaned_dict = {}
    for index, content in data_dict.items():
        date, text = content
        converted_date = datetime.datetime.utcfromtimestamp(date).strftime('%Y-%m-%d %H:%M:%S')
        cleaned_text = re.sub(r'[0-9]', ' ', text)
        cleaned_text = re.sub(r'[^\w\s]', ' ', cleaned_text)
        cleaned_text = cleaned_text.replace('\n', ' ').strip()#.split()
        cleaned_dict[converted_date] = cleaned_text
    return {k: v for k, v in cleaned_dict.items() if v and v != "removed"}

In [75]:
clean = clean_data(dict)
len(clean)

  converted_date = datetime.datetime.utcfromtimestamp(date).strftime('%Y-%m-%d %H:%M:%S')


2080

In [76]:
def to_csv(csvfilename, clean_data_dict):
    ''' 
    Given a dictionary mapping dates to tokenized strings,
    writes a CSV file mapping the same pairs
    Input:
        csvfilename: a string representing the name of the
        output csv
        clean_data_dict: a dictionary
    '''
    with open(csvfilename,'w',newline='') as f:
        output = csv.writer(f, delimiter=':')
        for line, content in clean_data_dict.items():
            output.writerow([line, content])

In [77]:
to_csv('tokenized_posts.csv', clean)

In [78]:
d2 = extract_data('r_healthinsurance_comments.jsonl', ['created', 'body'])

In [79]:
cd2= clean_data(d2)

  converted_date = datetime.datetime.utcfromtimestamp(date).strftime('%Y-%m-%d %H:%M:%S')


In [80]:
to_csv('tokenized_comments.csv', cd2)

In [81]:
clean_data(dict)

  converted_date = datetime.datetime.utcfromtimestamp(date).strftime('%Y-%m-%d %H:%M:%S')


{'2024-11-20 00:02:50': 'For a clean insurance claim  is it instantaneous acceptance if nothing is flagged  I feel like all my health insurance claims take about     weeks minimum  for the fast ones  and others pend for months  I m wondering what accounts for the backlog',
 '2024-11-20 00:17:18': 'I just saw that my employer sponsored HMO plan excludes blood  not blood work  the actual blood you need to not die after an accident   I paste the verbiage below  I was wondering if I should be concerned about this  I don t mind spending        on a once in a lifetime accident but I would hate having to spend    k or even more and it is hard to get numbers for what a hospital would charge for out of pocket patients   My other HMO option does cover it but it is        more a year for our personal contribution and the doctor locations aren t as convenient          The following are not covered services under your plan              Blood  blood plasma  synthetic blood  blood derivatives or subs

In [82]:
import random
from collections import defaultdict

def stratified_subsample(data_dict, test_size=0.3):
    '''
    Stratified sampling of data based on a criterion (e.g., date).
    Input:
        data_dict: A dictionary where keys are dates (or other stratification criteria) and values are lists of texts.
        test_size: Fraction of data to sample from each group.
    Output:
        subsample_dict: A subsample of the original dictionary with the same distribution of groups.
    '''
    # Step 1: Group posts by the stratification criterion (e.g., date)
    grouped_data = defaultdict(list)
    for key, content in data_dict.items():
        date = key.split()[0]  # Assuming the date is at the start of the key (YYYY-MM-DD)
        grouped_data[date].append(content)
    
    # Step 2: Initialize the subsample dictionary
    subsample_dict = {}
    
    # Step 3: Stratified sampling within each group
    for date, posts in grouped_data.items():
        # Shuffle the group before sampling
        random.shuffle(posts)
        
        # Calculate the number of samples to take from each group
        num_samples = int(len(posts) * test_size)
        
        # Take the stratified sample from this group
        sampled_posts = posts[:num_samples]
        
        # Add the sampled posts to the subsample dictionary
        for i, post in enumerate(sampled_posts):
            # Create unique keys for each post in the subsample
            subsample_dict[f"{date}_{i+1}"] = post
    
    return subsample_dict

In [97]:
#post_subsample = stratified_subsample(clean)
#len(post_subsample)

615

In [84]:
#to_csv('posts.csv', post_subsample)

In [4]:
import pandas as pd

In [99]:
#post_label = random.sample(list(post_subsample.items()), 180)
#df = pd.DataFrame((post_label), columns=['Date','Text'])
#df
#df.to_excel("posts_to_label.xlsx")

In [94]:
#comment_subsample = stratified_subsample(cd2)

In [95]:
#comment_label = random.sample(list(comment_subsample.items()), 1000)
#f = pd.DataFrame((comment_label), columns=['Date','Text'])
#df
#df.to_excel("comments_to_label.xlsx")

In [88]:
#to_csv('comments.csv', comment_subsample)

In [11]:
# didn't set a random seed, so need to re-import the data
labeled_comms = pd.read_excel('comments_to_label.xlsx')
comment_subset = pd.read_csv('comments.csv', header=None, names=['Raw'])
comment_subset[['ID', 'Text']] = comment_subset['Raw'].str.split(':', n=1, expand=True)
comment_subset.drop(columns=['Raw'], inplace=True)
print(comment_subset.head())

             ID                                               Text
0  2024-11-20_1  If your previous providers used epic software ...
1  2024-11-20_2  So here s the thing  If you claim your fiancee...
2  2024-11-20_3  I really don t have much advice to add as I ve...
3  2024-11-20_4  Well  they made around   k year  but my dad is...
4  2024-11-20_5  I signed up for them too without doing researc...


In [13]:
filtered_comments = comment_subset[~comment_subset['ID'].isin(labeled_comms['Date'])]
filtered_comments.head()

Unnamed: 0,ID,Text
0,2024-11-20_1,If your previous providers used epic software ...
1,2024-11-20_2,So here s the thing If you claim your fiancee...
2,2024-11-20_3,I really don t have much advice to add as I ve...
3,2024-11-20_4,Well they made around k year but my dad is...
4,2024-11-20_5,I signed up for them too without doing researc...


In [17]:
# unlabeled data (labeled data removed)
filtered_comments.to_csv("filtered_comments.csv", index=False)  # Keeping format consistent