# Queries to use for A3c, A3d, A3e, A3f


In [1]:
# Required libraries
import os
import time
import boto3
import logging
import pandas as pd
from typing import Dict

In [2]:
# Root logging setup. Each record shows: timestamp, process id,
# source file and line number, level name, and the message.
_LOG_FORMAT = (
    "[%(asctime)s] p%(process)s "
    "{%(filename)s:%(lineno)d} "
    "%(levelname)s - %(message)s"
)
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger used by the helper function and the query cells.
logger = logging.getLogger(__name__)

In [3]:
# --- Athena / S3 settings shared by every query cell ---

# Passed as the "Database" of each query's execution context.
SCHEMA_NAME = "schema_name"
AWS_REGION = "us-east-1"

# fill in your net id below.
netid = "imb59"

# Query results are staged under s3://<bucket>/<prefix>/ (used as the
# Athena OutputLocation and as the key prefix when downloading results).
S3_BUCKET_NAME = "athena-" + netid
S3_STAGING_PREFIX = "data/a05"
S3_STAGING_DIR = "s3://{}/{}/".format(S3_BUCKET_NAME, S3_STAGING_PREFIX)

# NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed.
S3_OUTPUT_DIRECTORY = "data"

In [4]:
# initialize the Athena client
# One shared boto3 Athena client, bound to AWS_REGION; the query cells below
# submit their SQL through it and hand it to download_and_load_query_results.
athena_client = boto3.client("athena", region_name=AWS_REGION)

In [5]:
def download_and_load_query_results(
    client: boto3.client, query_response: Dict
) -> pd.DataFrame:
    """Wait for an Athena query to finish and load its CSV result from S3.

    Args:
        client: Athena client used to probe the query for completion.
        query_response: Response of ``start_query_execution``; only its
            ``"QueryExecutionId"`` entry is read.

    Returns:
        The query result, read with ``pd.read_csv`` from the CSV file that is
        downloaded from ``S3_BUCKET_NAME`` under ``S3_STAGING_PREFIX``.

    Raises:
        Exception: Any error from ``get_query_results`` whose message does not
            contain "not yet finished" is re-raised unchanged.
    """
    # S3 object keys always use "/", so build the key with posixpath rather
    # than os.path (which would insert "\\" on Windows).
    import posixpath

    logger.info("download_and_load_query_results, enter")
    entered_at = time.time()
    query_execution_id = query_response["QueryExecutionId"]

    while True:
        try:
            # Used purely as a completion probe: the returned page is thrown
            # away, and the full result set is read from the CSV in S3 below.
            client.get_query_results(QueryExecutionId=query_execution_id)
            break
        except Exception as err:
            if "not yet finished" not in str(err):
                # Bare raise keeps the original traceback intact.
                raise
            time.sleep(0.5)

    # The calling cell normally sets a module-level ``start_time`` just before
    # submitting the query; fall back to this function's entry time so the
    # helper does not fail with NameError when that global is absent.
    query_started = globals().get("start_time", entered_at)
    logger.info(f"Time to complete query: {time.time() - query_started}s")

    temp_file_location: str = "athena_query_results.csv"
    s3_client = boto3.client(
        "s3",
        region_name=AWS_REGION,
    )
    s3_path = posixpath.join(S3_STAGING_PREFIX, f"{query_execution_id}.csv")
    logger.info(f"downloading file from S3_BUCKET_NAME={S3_BUCKET_NAME}, s3_path={s3_path}, to local file {temp_file_location}")
    s3_client.download_file(
        S3_BUCKET_NAME,
        s3_path,
        temp_file_location,
    )
    df = pd.read_csv(temp_file_location)
    logger.info(f"results dataframe shape is {df.shape}")
    return df

## Sorting Subreddits Relevant to: 

(A1, B3a, B13), A2b, A3a, A3b

* (“frustrating” or “frustrat”) and “cancer” — the stem “frustrat” already covers “frustrating” (HINTS A2b)
* “cancer” and (“doctors” or “trust”) — a comment does not need to contain “trust”, because trust is also captured by the NRC sentiment analysis (HINTS A3a)
* “cancer” and (“family” or “friends” or “sister” or “brother” or “mother” or “mom” or “father” or “cousin” or “aunt” or “uncle” or “trust”) (HINTS A3b)

HINTS Questions: 

* SeekCancerInfo: A1 | Have you ever looked for information about cancer from any source?
* Electronic2_HealthInfo: B3a | In the past 12 months have you used the Internet to look for health or medical information?
* MisleadingHealthInfo: B13 | How much of the health information that you see on social media do you think is false or misleading?


In [9]:
# Trial run of the A2b filter: comments whose body mentions both "frustrat"
# and "cancer", restricted to the health-related subreddits listed below.
# It also derives a post_type column from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit", 
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%frustrat%' AND LOWER("body") LIKE '%cancer%'
AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query. start_time is a module-level name on purpose: the
# download helper reads it to report how long the query took.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Block until the query is done, then pull the result CSV into a DataFrame.
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist a local copy of the matching comments.
output_file = "Query_test.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check.
preview = df_filtered_comments.head()
print(preview)

[2024-11-20 19:52:38,014] p126 {449603764.py:32} INFO - {'QueryExecutionId': '7973351b-970f-487e-9b78-11203a1aec99', 'ResponseMetadata': {'RequestId': '07c2b434-c568-4b65-a204-c72277244e13', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 20 Nov 2024 19:52:38 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '07c2b434-c568-4b65-a204-c72277244e13'}, 'RetryAttempts': 0}}
[2024-11-20 19:52:38,015] p126 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-20 19:52:45,399] p126 {272223420.py:17} INFO - Time to complete query: 7.476020812988281s
[2024-11-20 19:52:45,405] p126 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/7973351b-970f-487e-9b78-11203a1aec99.csv, to local file athena_query_results.csv
[2024-11-20 19:52:45,481] p126 {272223420.py:31} INFO - results dataframe shape is (9, 5)
[2024-11-20 19:52:45,484] p126 {449603764.py:38} INFO - Resul

                                                body  controversiality  \
0  thank you for pointing out how it’s misunderst...                 0   
1  You are not alone... I am a 22y/o and I had my...                 0   
2  Yeah, could be a stomach bug, could be cancer ...                 0   
3  A mix of sad, frustrated and pissed because it...                 0   
4  I went to one recently after my neurologist sa...                 0   

   created_utc          subreddit   post_type  
0   1719799836      CrohnsDisease  Submission  
1   1719802666      thyroidcancer  Submission  
2   1719803698            AskDocs  Submission  
3   1719810673  UlcerativeColitis  Submission  
4   1719810959         Autoimmune  Submission  


In [15]:
# HINTS A2b: comments whose body mentions both "frustrat" and "cancer",
# restricted to the health-related subreddits listed below.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%frustrat%' AND LOWER("body") LIKE '%cancer%'
AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query. start_time is a module-level name on purpose: the
# download helper reads it to report how long the query took.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Block until the query is done, then pull the result CSV into a DataFrame.
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist a local copy of the matching comments.
output_file = "Query_A2b.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check.
preview = df_filtered_comments.head()
print(preview)


[2024-11-24 19:14:11,646] p112 {2638054558.py:32} INFO - {'QueryExecutionId': 'f969fc11-fcaf-4294-85bb-afc48c4f9595', 'ResponseMetadata': {'RequestId': '6ea466c5-77b8-4b47-9c35-f1b1ca220cb2', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:14:11 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '6ea466c5-77b8-4b47-9c35-f1b1ca220cb2'}, 'RetryAttempts': 0}}
[2024-11-24 19:14:11,647] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:14:15,932] p112 {272223420.py:17} INFO - Time to complete query: 4.374830484390259s
[2024-11-24 19:14:15,938] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/f969fc11-fcaf-4294-85bb-afc48c4f9595.csv, to local file athena_query_results.csv
[2024-11-24 19:14:16,004] p112 {272223420.py:31} INFO - results dataframe shape is (9, 5)
[2024-11-24 19:14:16,009] p112 {2638054558.py:38} INFO - Res

                                                body  controversiality  \
0  thank you for pointing out how it’s misunderst...                 0   
1  That last line about seeing a colourful bird j...                 0   
2  It made me realise exactly how little I had. A...                 0   
3  Absolutely! There aren’t many of us with high ...                 0   
4  I'd like to think I've dealt with cancer head ...                 0   

   created_utc            subreddit   post_type  
0   1719799836        CrohnsDisease  Submission  
1   1719846063        BladderCancer  Submission  
2   1719847236         breastcancer  Submission  
3   1719851099        thyroidcancer     Comment  
4   1719826058  CancerFamilySupport  Submission  


In [16]:
# HINTS A3a: comments whose body mentions "cancer" together with either
# "doctors" or "trust", restricted to the health-related subreddits below.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (LOWER("body") LIKE '%doctors%' OR LOWER("body") LIKE '%trust%') AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query. start_time is a module-level name on purpose: the
# download helper reads it to report how long the query took.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Block until the query is done, then pull the result CSV into a DataFrame.
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist a local copy of the matching comments.
output_file = "Query_A3a.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check.
preview = df_filtered_comments.head()
print(preview)


[2024-11-24 19:15:04,692] p112 {3554975800.py:32} INFO - {'QueryExecutionId': '55ded87a-43cc-44ff-9972-db43b2371064', 'ResponseMetadata': {'RequestId': 'fffc21a3-573d-4315-bc22-1069738f60de', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:15:04 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'fffc21a3-573d-4315-bc22-1069738f60de'}, 'RetryAttempts': 0}}
[2024-11-24 19:15:04,693] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:15:08,970] p112 {272223420.py:17} INFO - Time to complete query: 4.379390478134155s
[2024-11-24 19:15:08,978] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/55ded87a-43cc-44ff-9972-db43b2371064.csv, to local file athena_query_results.csv
[2024-11-24 19:15:09,050] p112 {272223420.py:31} INFO - results dataframe shape is (39, 5)
[2024-11-24 19:15:09,058] p112 {3554975800.py:38} INFO - Re

                                                body  controversiality  \
0  Thanks for your reply, and I agree, I apprecia...                 0   
1  I completely understand what you are saying.\n...                 0   
2  I, 23M, went through one two months ago, and t...                 0   
3  What happened to me is the endometrial was dia...                 0   
4  We're sorry to hear that you need to visit thi...                 0   

   created_utc            subreddit   post_type  
0   1719847712                  cll     Comment  
1   1719847940           Microbiome  Submission  
2   1719848566            predental  Submission  
3   1719849610    endometrialcancer     Comment  
4   1719849824  doihavebreastcancer  Submission  


In [17]:
# HINTS A3b: comments whose body mentions "cancer" together with at least one
# family/friend term or "trust", restricted to the health-related subreddits below.
# The LIKE patterns are plain substring matches on the lower-cased body.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%family%' 
      OR LOWER("body") LIKE '%friends%' 
      OR LOWER("body") LIKE '%sister%' 
      OR LOWER("body") LIKE '%brother%' 
      OR LOWER("body") LIKE '%mother%' 
      OR LOWER("body") LIKE '%mom%' 
      OR LOWER("body") LIKE '%father%' 
      OR LOWER("body") LIKE '%cousin%' 
      OR LOWER("body") LIKE '%aunt%' 
      OR LOWER("body") LIKE '%uncle%' 
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query. start_time is a module-level name on purpose: the
# download helper reads it to report how long the query took.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Block until the query is done, then pull the result CSV into a DataFrame.
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist a local copy of the matching comments.
output_file = "Query_A3b.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check.
preview = df_filtered_comments.head()
print(preview)


[2024-11-24 19:19:54,703] p112 {1633816522.py:44} INFO - {'QueryExecutionId': '8e01001f-bb91-448a-9716-779c7fea4d03', 'ResponseMetadata': {'RequestId': '533bc10b-6737-4743-b6e1-9878a0dfdb7b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:19:54 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '533bc10b-6737-4743-b6e1-9878a0dfdb7b'}, 'RetryAttempts': 0}}
[2024-11-24 19:19:54,705] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:19:59,549] p112 {272223420.py:17} INFO - Time to complete query: 5.023648023605347s
[2024-11-24 19:19:59,556] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/8e01001f-bb91-448a-9716-779c7fea4d03.csv, to local file athena_query_results.csv
[2024-11-24 19:19:59,638] p112 {272223420.py:31} INFO - results dataframe shape is (84, 5)
[2024-11-24 19:19:59,646] p112 {1633816522.py:50} INFO - Re

                                                body  controversiality  \
0  Thank you so much for your comment and support...                 0   
1  Oh yes thank you for asking, I forgot to menti...                 0   
2  This subreddit is heavily centered on patients...                 0   
3  It can be inherited but can also occur as a ra...                 0   
4  I tell all my friends if the question isn’t ab...                 0   

   created_utc          subreddit   post_type  
0   1719792153  endometrialcancer     Comment  
1   1719794160            AskDocs     Comment  
2   1719795253       breastcancer  Submission  
3   1719796295      lynchsyndrome  Submission  
4   1719797344            nursing  Submission  


## Sorting Subreddits Relevant to: 

A3c, A3d, A3e, A3f

* “cancer” and government_healthcare_programs = [ "medicare", "medicaid", "children’s health insurance program", "chip", "veterans health administration", "vha", "indian health service", "ihs", "federal employees health benefits program", "fehbp", "affordable care act", "aca", "health insurance marketplace", "public health depart", "local health depart", "national health service corps", "nhsc", "community health centers", "chcs", "national institutes of health", "nih", "nci", "national cancer institute" ] or “trust” (HINTS A3c)

* “cancer” and cancer_charities = [ "american cancer society", "acs", "cancer research institute", "breast cancer research foundation", "bcrf", "leukemia lymphoma society", "lls", "stand up to cancer", "su2c", "susan g. komen for the cure", "st. jude children’s", "national foundation for cancer research", "nfcr", "livestrong", "mesothelioma research foundation", "prostate cancer foundation", "american brain tumor association", "abta", "colon cancer coalition", "the american institute for cancer research", "aicr" ] or “trust” (HINTS A3d)
* “cancer” and charitable_religious_organizations = [ "catholic relief services", "crs", "world vision", "samaritan", "jewish federations of north america", "islamic relief worldwide", "buddhist global relief", "the salvation army", "christian aid", "lutheran world relief", "tzu chi foundation", "care", "cooperative for assistance and relief everywhere", "habitat for humanity", "church world service", "cws", "heifer international" ] or “trust” (HINTS A3e)
* “cancer” and top_cancer_institutes = [ "researcher", "scientist", "physicians", "md anderson cancer center", "memorial sloan kettering cancer center", "msk", "mayo clinic cancer center", "johns hopkins sidney kimmel comprehensive cancer center", "cleveland clinic", "ucla medical center", "massachusetts general hospital cancer center", "duke cancer institute", "stanford cancer institute", "university of california, san francisco medical center", "ucsf", "northwestern medicine feinberg school of medicine", "university of pennsylvania abramson cancer center", "roswell park comprehensive cancer center", "fred hutchinson cancer research center" ] or “trust” (HINTS A3f)


HINTS Questions: 

* CancerTrustGov: A3c. In general, how much would you trust information about cancer from government health agencies?
* CancerTrustCharities: A3d. In general, how much would you trust information about cancer from charitable organizations?
* CancerTrustReligiousOrgs: A3e. In general, how much would you trust information about cancer from religious organizations and leaders?
* CancerTrustScientists: A3f. In general, how much would you trust information about cancer from scientists?

In [18]:
# HINTS A3c: comments whose body mentions "cancer" together with any
# government healthcare program (name or acronym) or "trust", restricted to
# the health-related subreddits listed below.
#
# Fixes relative to the earlier version of this query:
#   * the children's-health-insurance pattern contained a stray double quote,
#     so it could never match; it now uses the LIKE single-character wildcard
#     "_" in place of the apostrophe so both ’ and ' are accepted;
#   * "chcs" (listed in the A3c term list) was missing and has been added.
#
# The LIKE patterns are plain substring matches on the lower-cased body, so
# short acronyms such as "aca", "nih" or "chip" also match inside longer words.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%medicare%' 
      OR LOWER("body") LIKE '%medicaid%' 
      OR LOWER("body") LIKE '%children_s health insurance program%' 
      OR LOWER("body") LIKE '%chip%' 
      OR LOWER("body") LIKE '%veterans health administration%' 
      OR LOWER("body") LIKE '%vha%' 
      OR LOWER("body") LIKE '%indian health service%' 
      OR LOWER("body") LIKE '%ihs%' 
      OR LOWER("body") LIKE '%federal employees health benefits program%' 
      OR LOWER("body") LIKE '%fehbp%' 
      OR LOWER("body") LIKE '%affordable care act%'
      OR LOWER("body") LIKE '%aca%'
      OR LOWER("body") LIKE '%health insurance marketplace%'
      OR LOWER("body") LIKE '%public health depart%'
      OR LOWER("body") LIKE '%local health depart%'
      OR LOWER("body") LIKE '%national health service corps%'
      OR LOWER("body") LIKE '%nhsc%'
      OR LOWER("body") LIKE '%community health centers%'
      OR LOWER("body") LIKE '%chcs%'
      OR LOWER("body") LIKE '%national institutes of health%'
      OR LOWER("body") LIKE '%nih%'
      OR LOWER("body") LIKE '%nci%'
      OR LOWER("body") LIKE '%national cancer institute%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time is a module-level name on purpose:
# the download helper reads it to report how long the query took.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load the result CSV into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3c.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:20:46,425] p112 {3565611620.py:56} INFO - {'QueryExecutionId': '216ff817-1431-4ae4-a23f-a1ab5bf14413', 'ResponseMetadata': {'RequestId': '6a06c8aa-7ae4-4d13-b383-3ff4cf1dd1c1', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:20:46 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '6a06c8aa-7ae4-4d13-b383-3ff4cf1dd1c1'}, 'RetryAttempts': 0}}
[2024-11-24 19:20:46,426] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:20:50,713] p112 {272223420.py:17} INFO - Time to complete query: 4.392207622528076s
[2024-11-24 19:20:50,719] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/216ff817-1431-4ae4-a23f-a1ab5bf14413.csv, to local file athena_query_results.csv
[2024-11-24 19:20:50,790] p112 {272223420.py:31} INFO - results dataframe shape is (31, 5)
[2024-11-24 19:20:50,795] p112 {3565611620.py:62} INFO - Re

                                                body  controversiality  \
0  When I got diagnosed with Breast cancer & I ca...                 0   
1  Thanks for your reply, and I agree, I apprecia...                 0   
2  Agreed. I'm extremely obese and I cant' even i...                 0   
3  I'm going to get downvoted into oblivion for t...                 0   
4  What happened to me is the endometrial was dia...                 0   

   created_utc          subreddit   post_type  
0   1719797834             cancer  Submission  
1   1719847712                cll     Comment  
2   1719848552            nursing     Comment  
3   1719849279            nursing  Submission  
4   1719849610  endometrialcancer     Comment  


In [19]:
# HINTS A3d: comments whose body mentions "cancer" together with any cancer
# charity (name or acronym) or "trust", restricted to the health-related
# subreddits listed below.
#
# Fixes relative to the earlier version of this query:
#   * removed '%veterans health administration%', '%ihs%' and '%nhsc%', which
#     were copy-paste leftovers from the government-program query (A3c) and
#     are not in the cancer-charity term list, so they polluted the A3d results;
#   * the St. Jude pattern now uses the LIKE single-character wildcard "_" in
#     place of the apostrophe so both ’ and ' are accepted.
#
# The LIKE patterns are plain substring matches on the lower-cased body, so
# short acronyms such as "acs" or "lls" also match inside longer words.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%american cancer society%' 
      OR LOWER("body") LIKE '%acs%' 
      OR LOWER("body") LIKE '%cancer research institute%' 
      OR LOWER("body") LIKE '%breast cancer research foundation%' 
      OR LOWER("body") LIKE '%bcrf%' 
      OR LOWER("body") LIKE '%leukemia lymphoma society%' 
      OR LOWER("body") LIKE '%lls%' 
      OR LOWER("body") LIKE '%stand up to cancer%' 
      OR LOWER("body") LIKE '%su2c%'
      OR LOWER("body") LIKE '%susan g. komen for the cure%'
      OR LOWER("body") LIKE '%st. jude children_s%'
      OR LOWER("body") LIKE '%national foundation for cancer research%'
      OR LOWER("body") LIKE '%nfcr%'
      OR LOWER("body") LIKE '%livestrong%'
      OR LOWER("body") LIKE '%mesothelioma research foundation%'
      OR LOWER("body") LIKE '%prostate cancer foundation%'
      OR LOWER("body") LIKE '%american brain tumor association%'
      OR LOWER("body") LIKE '%abta%'
      OR LOWER("body") LIKE '%colon cancer coalition%'
      OR LOWER("body") LIKE '%the american institute for cancer research%'
      OR LOWER("body") LIKE '%aicr%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time is a module-level name on purpose:
# the download helper reads it to report how long the query took.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load the result CSV into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3d.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:21:30,027] p112 {648271266.py:58} INFO - {'QueryExecutionId': 'c75bd0b4-a6d0-4983-89b8-7109d9378383', 'ResponseMetadata': {'RequestId': '75606638-f8a9-4c57-86be-5c83a59474be', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:21:30 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '75606638-f8a9-4c57-86be-5c83a59474be'}, 'RetryAttempts': 0}}
[2024-11-24 19:21:30,028] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:21:35,826] p112 {272223420.py:17} INFO - Time to complete query: 5.906189203262329s
[2024-11-24 19:21:35,833] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/c75bd0b4-a6d0-4983-89b8-7109d9378383.csv, to local file athena_query_results.csv
[2024-11-24 19:21:35,914] p112 {272223420.py:31} INFO - results dataframe shape is (36, 5)
[2024-11-24 19:21:35,921] p112 {648271266.py:64} INFO - Resu

                                                body  controversiality  \
0  She could possibly be suffering from Anemia......                 0   
1  Thanks for your reply, and I agree, I apprecia...                 0   
2  I completely understand what you are saying.\n...                 0   
3  I, 23M, went through one two months ago, and t...                 0   
4  I’ll give you my experience with prednisone du...                 0   

   created_utc          subreddit   post_type  
0   1719847231            AskDocs     Comment  
1   1719847712                cll     Comment  
2   1719847940         Microbiome  Submission  
3   1719848566          predental  Submission  
4   1719848993  UlcerativeColitis     Comment  


In [20]:
# HINTS A3e: comments whose body mentions "cancer" together with any
# charitable religious organization (name or acronym) or "trust", restricted
# to the health-related subreddits listed below.
#
# Fixes relative to the earlier version of this query:
#   * the "tzu chi foundation" pattern contained a stray double quote, so it
#     could never match;
#   * removed '%susan g. komen for the cure%', a copy-paste leftover from the
#     cancer-charity query (A3d) that is not in the religious-organization list.
#
# The LIKE patterns are plain substring matches on the lower-cased body.
# NOTE(review): '%care%' also matches words such as "healthcare" or "careful",
# so it will select far more than mentions of the CARE organization.
# post_type is derived from the parent_id prefix.
# NOTE(review): parent_id prefixes presumably follow Reddit's fullname scheme
# (t1_ = parent is a comment, t3_ = parent is a submission), in which case
# 'Submission' marks a top-level comment on a post — confirm before relying on it.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%catholic relief services%' 
      OR LOWER("body") LIKE '%crs%' 
      OR LOWER("body") LIKE '%world vision%' 
      OR LOWER("body") LIKE '%samaritan%' 
      OR LOWER("body") LIKE '%jewish federations of north america%' 
      OR LOWER("body") LIKE '%islamic relief worldwide%' 
      OR LOWER("body") LIKE '%buddhist global relief%' 
      OR LOWER("body") LIKE '%the salvation army%' 
      OR LOWER("body") LIKE '%christian aid%' 
      OR LOWER("body") LIKE '%lutheran world relief%' 
      OR LOWER("body") LIKE '%tzu chi foundation%'
      OR LOWER("body") LIKE '%care%'
      OR LOWER("body") LIKE '%cooperative for assistance and relief everywhere%'
      OR LOWER("body") LIKE '%habitat for humanity%'
      OR LOWER("body") LIKE '%church world service%'
      OR LOWER("body") LIKE '%cws%'
      OR LOWER("body") LIKE '%heifer international%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time is a module-level name on purpose:
# the download helper reads it to report how long the query took.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load the result CSV into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3e.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:22:08,598] p112 {2394410194.py:52} INFO - {'QueryExecutionId': '382eb211-8a55-44ef-a552-53ec06216a38', 'ResponseMetadata': {'RequestId': 'ff60b94a-02e1-4fa4-afdc-67a8474706ca', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:22:08 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'ff60b94a-02e1-4fa4-afdc-67a8474706ca'}, 'RetryAttempts': 0}}
[2024-11-24 19:22:08,599] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:22:12,329] p112 {272223420.py:17} INFO - Time to complete query: 3.844054698944092s
[2024-11-24 19:22:12,337] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/382eb211-8a55-44ef-a552-53ec06216a38.csv, to local file athena_query_results.csv
[2024-11-24 19:22:12,421] p112 {272223420.py:31} INFO - results dataframe shape is (58, 5)
[2024-11-24 19:22:12,427] p112 {2394410194.py:58} INFO - Re

                                                body  controversiality  \
0  I had my abdormal pap about a month before my ...                 0   
1  This subreddit is heavily centered on patients...                 0   
2  It sounds like you are doing fine. You had sur...                 0   
3  Ohmygosh you poor darling.\n\nBear in mind tha...                 0   
4  I'm not trying to be argumentative. Please und...                 0   

   created_utc            subreddit   post_type  
0   1719795113    PreCervicalCancer  Submission  
1   1719795253         breastcancer  Submission  
2   1719798254         breastcancer  Submission  
3   1719813592  CancerFamilySupport  Submission  
4   1719816110           Fuckcancer  Submission  


Comments containing “cancer” AND (any term in top_cancer_institutes = [ "researcher", "scientist", "physicians", "md anderson cancer center", "memorial sloan kettering cancer center", "msk", "mayo clinic cancer center", "johns hopkins sidney kimmel comprehensive cancer center", "cleveland clinic", "ucla medical center", "massachusetts general hospital cancer center", "duke cancer institute", "stanford cancer institute", "university of california, san francisco medical center", "ucsf", "northwestern medicine feinberg school of medicine", "university of pennsylvania abramson cancer center", "roswell park comprehensive cancer center", "fred hutchinson cancer research center" ] OR “trust”) (HINTS A3f)

In [21]:
# A3f: select comments that mention "cancer" together with a researcher /
# clinician term, one of the listed cancer institutes, or "trust", limited to
# the health-related subreddits below. post_type is derived from the
# parent_id prefix (t1_ -> 'Comment', t3_ -> 'Submission', else 'Unknown').
# NOTE(review): "university of california, san francisco medical center" is
# matched as two separate patterns here, so any "university of california"
# mention qualifies — confirm that breadth is intended.
q = '''
SELECT "body", "controversiality", "created_utc", "subreddit",
CASE 
    WHEN parent_id LIKE 't1_%' then 'Comment'
    WHEN parent_id LIKE  't3_%' then 'Submission'
    ElSE 'Unknown'
END as post_type
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%researcher%' 
      OR LOWER("body") LIKE '%scientist%' 
      OR LOWER("body") LIKE '%physicians%' 
      OR LOWER("body") LIKE '%md anderson cancer center%' 
      OR LOWER("body") LIKE '%memorial sloan kettering cancer center%' 
      OR LOWER("body") LIKE '%msk%' 
      OR LOWER("body") LIKE '%mayo clinic cancer center%' 
      OR LOWER("body") LIKE '%johns hopkins sidney kimmel comprehensive cancer center%' 
      OR LOWER("body") LIKE '%cleveland clinic%' 
      OR LOWER("body") LIKE '%ucla medical center%' 
      OR LOWER("body") LIKE '%massachusetts general hospital cancer center%'
      OR LOWER("body") LIKE '%duke cancer institute%'
      OR LOWER("body") LIKE '%stanford cancer institute%'
      OR LOWER("body") LIKE '%university of california%'
      OR LOWER("body") LIKE '%san francisco medical center%'
      OR LOWER("body") LIKE '%ucsf%'
      OR LOWER("body") LIKE '%northwestern medicine feinberg school of medicine%'
      OR LOWER("body") LIKE '%university of pennsylvania abramson cancer center%'
      OR LOWER("body") LIKE '%roswell park comprehensive cancer center%'
      OR LOWER("body") LIKE '%fred hutchinson cancer research center%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query to Athena. start_time has to stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Wait for the query to finish and load its CSV result into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist the result set locally and report the total elapsed time
output_file = "Query_A3f.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())

[2024-11-24 19:23:07,640] p112 {2605496797.py:54} INFO - {'QueryExecutionId': '83fb624a-6005-4995-ad2f-6c026f817348', 'ResponseMetadata': {'RequestId': '0922583f-ee0e-447e-9a1d-908679aea893', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:23:07 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '0922583f-ee0e-447e-9a1d-908679aea893'}, 'RetryAttempts': 0}}
[2024-11-24 19:23:07,641] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:23:11,903] p112 {272223420.py:17} INFO - Time to complete query: 4.381659746170044s
[2024-11-24 19:23:11,912] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/83fb624a-6005-4995-ad2f-6c026f817348.csv, to local file athena_query_results.csv
[2024-11-24 19:23:12,104] p112 {272223420.py:31} INFO - results dataframe shape is (13, 5)
[2024-11-24 19:23:12,109] p112 {2605496797.py:60} INFO - Re

                                                body  controversiality  \
0  I did IV, no port. I used ice bags on my hands...                 0   
1  Today was cousin went through with the decisio...                 0   
2  I have a port. It's my second one. First one I...                 0   
3  Angry bladder as in overactive bladder or Inte...                 0   
4  Thanks for your reply, and I agree, I apprecia...                 0   

   created_utc            subreddit   post_type  
0   1719802713         breastcancer  Submission  
1   1719806973  CancerFamilySupport  Submission  
2   1719808898         breastcancer  Submission  
3   1719810643        thyroidcancer  Submission  
4   1719847712                  cll     Comment  


In [22]:
# Combine the per-query CSV exports into a single dataset
import pandas as pd

# Read each per-query export back from disk; every frame keeps its own name
df_query_a2b = pd.read_csv('Query_A2b.csv')
df_query_a3a = pd.read_csv('Query_A3a.csv')
df_query_a3b = pd.read_csv('Query_A3b.csv')
df_query_a3c = pd.read_csv('Query_A3c.csv')
df_query_a3d = pd.read_csv('Query_A3d.csv')
df_query_a3e = pd.read_csv('Query_A3e.csv')
df_query_a3f = pd.read_csv('Query_A3f.csv')

# Stack the frames in query order with a fresh 0..n-1 index, then save.
# Rows matched by more than one query are kept as-is (no de-duplication).
all_query_frames = [
    df_query_a2b,
    df_query_a3a,
    df_query_a3b,
    df_query_a3c,
    df_query_a3d,
    df_query_a3e,
    df_query_a3f,
]
merged_df = pd.concat(all_query_frames, ignore_index=True)
merged_df.to_csv('Merged_data.csv', index=False)

## Redo the queries, now selecting the columns: user_id, body, created_utc, subreddit

The goal is to look at trends over time per user_id within these subreddits, to see how each user's activity over time can affect the comment results.

In [6]:
# A2b-1: comments containing both "frustrat" and "cancer" in the
# health-related subreddits below, now returning author (as user_id), body,
# created_utc and subreddit.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%frustrat%' AND LOWER("body") LIKE '%cancer%'
AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query to Athena. start_time has to stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Wait for the query to finish and load its CSV result into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist the result set locally and report the total elapsed time
output_file = "Query_A2b-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())

[2024-11-24 19:06:38,412] p112 {2142210154.py:27} INFO - {'QueryExecutionId': 'bd3159c7-0bd3-4007-a14a-f24aa648c5ad', 'ResponseMetadata': {'RequestId': '6935a815-62c9-470e-a9fb-2080f8048906', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:06:38 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '6935a815-62c9-470e-a9fb-2080f8048906'}, 'RetryAttempts': 0}}
[2024-11-24 19:06:38,413] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:06:45,265] p112 {272223420.py:17} INFO - Time to complete query: 7.00680136680603s
[2024-11-24 19:06:45,425] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/bd3159c7-0bd3-4007-a14a-f24aa648c5ad.csv, to local file athena_query_results.csv
[2024-11-24 19:06:45,538] p112 {272223420.py:31} INFO - results dataframe shape is (9, 4)
[2024-11-24 19:06:45,545] p112 {2142210154.py:33} INFO - Resu

             user_id                                               body  \
0          WeakGhost  That last line about seeing a colourful bird j...   
1          zombieus1  You are not alone... I am a 22y/o and I had my...   
2        MzOpinion8d  Yeah, could be a stomach bug, could be cancer ...   
3  random-nihilist87  A mix of sad, frustrated and pissed because it...   
4           3DFarmer  I went to one recently after my neurologist sa...   

   created_utc          subreddit  
0   1719846063      BladderCancer  
1   1719802666      thyroidcancer  
2   1719803698            AskDocs  
3   1719810673  UlcerativeColitis  
4   1719810959         Autoimmune  


In [7]:
# A3a-1: comments containing "cancer" plus either "doctors" or "trust" in the
# health-related subreddits below, returning author (as user_id), body,
# created_utc and subreddit.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (LOWER("body") LIKE '%doctors%' OR LOWER("body") LIKE '%trust%') AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query to Athena. start_time has to stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Wait for the query to finish and load its CSV result into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist the result set locally and report the total elapsed time
output_file = "Query_A3a-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())

[2024-11-24 19:07:16,016] p112 {2907716995.py:27} INFO - {'QueryExecutionId': '204efde6-4d3a-44bd-889f-c03483b52f9e', 'ResponseMetadata': {'RequestId': '3951fb88-8083-4b24-a16f-13efd1e8c10b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:07:16 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '3951fb88-8083-4b24-a16f-13efd1e8c10b'}, 'RetryAttempts': 0}}
[2024-11-24 19:07:16,017] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:07:20,296] p112 {272223420.py:17} INFO - Time to complete query: 4.42334771156311s
[2024-11-24 19:07:20,306] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/204efde6-4d3a-44bd-889f-c03483b52f9e.csv, to local file athena_query_results.csv
[2024-11-24 19:07:20,382] p112 {272223420.py:31} INFO - results dataframe shape is (39, 4)
[2024-11-24 19:07:20,390] p112 {2907716995.py:33} INFO - Res

          user_id                                               body  \
0   AutoModerator  We're sorry to hear that you need to visit thi...   
1   AutoModerator  We're sorry to hear that you need to visit thi...   
2   AutoModerator  We're sorry to hear that you need to visit thi...   
3  rubyslippers3x  I did IV, no port. I used ice bags on my hands...   
4    lil_Jakester  Today was cousin went through with the decisio...   

   created_utc            subreddit  
0   1719792189  doihavebreastcancer  
1   1719799351  doihavebreastcancer  
2   1719801443  doihavebreastcancer  
3   1719802713         breastcancer  
4   1719806973  CancerFamilySupport  


In [8]:
# A3b-1: comments containing "cancer" plus any of the family/friend terms or
# "trust" in the health-related subreddits below, returning author (as
# user_id), body, created_utc and subreddit.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%family%' 
      OR LOWER("body") LIKE '%friends%' 
      OR LOWER("body") LIKE '%sister%' 
      OR LOWER("body") LIKE '%brother%' 
      OR LOWER("body") LIKE '%mother%' 
      OR LOWER("body") LIKE '%mom%' 
      OR LOWER("body") LIKE '%father%' 
      OR LOWER("body") LIKE '%cousin%' 
      OR LOWER("body") LIKE '%aunt%' 
      OR LOWER("body") LIKE '%uncle%' 
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Submit the query to Athena. start_time has to stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
athena_request = {
    "QueryString": q,
    "QueryExecutionContext": {"Database": SCHEMA_NAME},
    "ResultConfiguration": {
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
}
response = athena_client.start_query_execution(**athena_request)
logger.info(response)

# Wait for the query to finish and load its CSV result into a DataFrame
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Persist the result set locally and report the total elapsed time
output_file = "Query_A3b-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Show the first rows as a sanity check
print(df_filtered_comments.head())


[2024-11-24 19:07:42,975] p112 {113105506.py:39} INFO - {'QueryExecutionId': 'd41dc7e6-bd73-411d-8c70-5facab5beaf4', 'ResponseMetadata': {'RequestId': '9ce931e2-7b56-424a-af11-e7922e3f0fe2', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:07:42 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '9ce931e2-7b56-424a-af11-e7922e3f0fe2'}, 'RetryAttempts': 0}}
[2024-11-24 19:07:42,976] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:07:46,758] p112 {272223420.py:17} INFO - Time to complete query: 3.868135452270508s
[2024-11-24 19:07:46,765] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/d41dc7e6-bd73-411d-8c70-5facab5beaf4.csv, to local file athena_query_results.csv
[2024-11-24 19:07:46,849] p112 {272223420.py:31} INFO - results dataframe shape is (84, 4)
[2024-11-24 19:07:46,857] p112 {113105506.py:45} INFO - Resu

                user_id                                               body  \
0  Spiritual-Freedom-71  Thank you so much for your comment and support...   
1          batmannjoker  Oh yes thank you for asking, I forgot to menti...   
2  breastcancer-ModTeam  This subreddit is heavily centered on patients...   
3      Chikorita_banana  It can be inherited but can also occur as a ra...   
4     Shreddy_Spaghett1  I tell all my friends if the question isn’t ab...   

   created_utc          subreddit  
0   1719792153  endometrialcancer  
1   1719794160            AskDocs  
2   1719795253       breastcancer  
3   1719796295      lynchsyndrome  
4   1719797344            nursing  


In [9]:
# A3c-1: comments containing "cancer" plus any government healthcare program
# (full name or acronym) or "trust", limited to the health-related subreddits
# below. Returns author (as user_id), body, created_utc and subreddit.
#
# Pattern notes:
#   * The CHIP pattern previously began with a stray double quote and used
#     only the curly apostrophe, so it effectively never matched. In LIKE,
#     "_" matches exactly one character; "children_s" therefore matches both
#     the straight (') and the curly (’) apostrophe.
#   * NOTE(review): short acronyms such as 'aca', 'nih', 'nci' or 'chip' are
#     matched as plain substrings and can also hit unrelated words (e.g.
#     "academic") — confirm that breadth is acceptable for the analysis.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%medicare%' 
      OR LOWER("body") LIKE '%medicaid%' 
      OR LOWER("body") LIKE '%children_s health insurance program%' 
      OR LOWER("body") LIKE '%chip%' 
      OR LOWER("body") LIKE '%veterans health administration%' 
      OR LOWER("body") LIKE '%vha%' 
      OR LOWER("body") LIKE '%indian health service%' 
      OR LOWER("body") LIKE '%ihs%' 
      OR LOWER("body") LIKE '%federal employees health benefits program%' 
      OR LOWER("body") LIKE '%fehbp%' 
      OR LOWER("body") LIKE '%affordable care act%'
      OR LOWER("body") LIKE '%aca%'
      OR LOWER("body") LIKE '%health insurance marketplace%'
      OR LOWER("body") LIKE '%public health depart%'
      OR LOWER("body") LIKE '%local health depart%'
      OR LOWER("body") LIKE '%national health service corps%'
      OR LOWER("body") LIKE '%nhsc%'
      OR LOWER("body") LIKE '%community health centers%'
      OR LOWER("body") LIKE '%national institutes of health%'
      OR LOWER("body") LIKE '%nih%'
      OR LOWER("body") LIKE '%nci%'
      OR LOWER("body") LIKE '%national cancer institute%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time must stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load its CSV result into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3c-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:08:14,928] p112 {2729342127.py:51} INFO - {'QueryExecutionId': '869ae994-25bd-42bc-b06f-b258dea5b0d9', 'ResponseMetadata': {'RequestId': '339db641-5798-4a8c-8cf0-31966551dd6e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:08:14 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '339db641-5798-4a8c-8cf0-31966551dd6e'}, 'RetryAttempts': 0}}
[2024-11-24 19:08:14,930] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:08:19,212] p112 {272223420.py:17} INFO - Time to complete query: 4.359502792358398s
[2024-11-24 19:08:19,220] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/869ae994-25bd-42bc-b06f-b258dea5b0d9.csv, to local file athena_query_results.csv
[2024-11-24 19:08:19,306] p112 {272223420.py:31} INFO - results dataframe shape is (31, 4)
[2024-11-24 19:08:19,319] p112 {2729342127.py:57} INFO - Re

              user_id                                               body  \
0          NiceAd1978  When I got diagnosed with Breast cancer & I ca...   
1       RipDouble8475  Thanks for your reply, and I agree, I apprecia...   
2  AngeredReclusivity  Agreed. I'm extremely obese and I cant' even i...   
3  AngeredReclusivity  I'm going to get downvoted into oblivion for t...   
4      createhomelife  What happened to me is the endometrial was dia...   

   created_utc          subreddit  
0   1719797834             cancer  
1   1719847712                cll  
2   1719848552            nursing  
3   1719849279            nursing  
4   1719849610  endometrialcancer  


In [10]:
# A3d-1: comments containing "cancer" plus one of the listed cancer charities
# (full name or acronym) or "trust", limited to the health-related subreddits
# below. Returns author (as user_id), body, created_utc and subreddit.
#
# Pattern notes:
#   * The St. Jude pattern previously matched only the curly apostrophe (’).
#     In LIKE, "_" matches exactly one character; "children_s" therefore
#     matches both the straight (') and the curly apostrophe.
#   * NOTE(review): 'veterans health administration', 'ihs' and 'nhsc' are
#     government-program terms, not charities; they look like leftovers from
#     the government-programs query. They are kept so the result set stays
#     comparable with earlier runs — confirm whether they should be removed.
#   * NOTE(review): short acronyms ('acs', 'lls', ...) are matched as plain
#     substrings and can also hit unrelated words.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%american cancer society%' 
      OR LOWER("body") LIKE '%acs%' 
      OR LOWER("body") LIKE '%cancer research institute%' 
      OR LOWER("body") LIKE '%breast cancer research foundation%' 
      OR LOWER("body") LIKE '%veterans health administration%' 
      OR LOWER("body") LIKE '%bcrf%' 
      OR LOWER("body") LIKE '%leukemia lymphoma society%' 
      OR LOWER("body") LIKE '%ihs%' 
      OR LOWER("body") LIKE '%lls%' 
      OR LOWER("body") LIKE '%stand up to cancer%' 
      OR LOWER("body") LIKE '%su2c%'
      OR LOWER("body") LIKE '%susan g. komen for the cure%'
      OR LOWER("body") LIKE '%st. jude children_s%'
      OR LOWER("body") LIKE '%national foundation for cancer research%'
      OR LOWER("body") LIKE '%nfcr%'
      OR LOWER("body") LIKE '%livestrong%'
      OR LOWER("body") LIKE '%nhsc%'
      OR LOWER("body") LIKE '%mesothelioma research foundation%'
      OR LOWER("body") LIKE '%prostate cancer foundation%'
      OR LOWER("body") LIKE '%american brain tumor association%'
      OR LOWER("body") LIKE '%abta%'
      OR LOWER("body") LIKE '%colon cancer coalition%'
      OR LOWER("body") LIKE '%the american institute for cancer research%'
      OR LOWER("body") LIKE '%aicr%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time must stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load its CSV result into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3d-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:09:06,806] p112 {177700367.py:53} INFO - {'QueryExecutionId': '327220a3-9a03-440d-b357-7cc009d77261', 'ResponseMetadata': {'RequestId': 'a5000124-78c4-47ae-97c4-7bfe22f8f977', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:09:06 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'a5000124-78c4-47ae-97c4-7bfe22f8f977'}, 'RetryAttempts': 0}}
[2024-11-24 19:09:06,808] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:09:11,051] p112 {272223420.py:17} INFO - Time to complete query: 4.35570216178894s
[2024-11-24 19:09:11,057] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/327220a3-9a03-440d-b357-7cc009d77261.csv, to local file athena_query_results.csv
[2024-11-24 19:09:11,124] p112 {272223420.py:31} INFO - results dataframe shape is (36, 4)
[2024-11-24 19:09:11,130] p112 {177700367.py:59} INFO - Resul

                user_id                                               body  \
0           No-Law-4434  from Google Ai\n\nThis report dives deep into ...   
1              MarsMorn  I was given tramadol which I took the evening ...   
2  -this-is-my-account-  I had my abdormal pap about a month before my ...   
3  -this-is-my-account-  My doctor said they can tell from the biopsy s...   
4            NiceAd1978  When I got diagnosed with Breast cancer & I ca...   

   created_utc          subreddit  
0   1719793890           lymphoma  
1   1719795072       breastcancer  
2   1719795113  PreCervicalCancer  
3   1719795287  PreCervicalCancer  
4   1719797834             cancer  


In [11]:
# A3e-1: comments containing "cancer" plus one of the listed charitable
# religious organisations (full name or acronym) or "trust", limited to the
# health-related subreddits below. Returns author (as user_id), body,
# created_utc and subreddit.
#
# Pattern notes:
#   * The Tzu Chi pattern previously began with a stray double quote, so it
#     only matched bodies containing a literal '"' right before the name;
#     the quote is removed.
#   * NOTE(review): '%care%' (for the organisation CARE) matches any body
#     containing "care", e.g. "healthcare", so it dominates this filter —
#     confirm that breadth is acceptable.
#   * NOTE(review): 'susan g. komen for the cure' is a cancer charity and
#     looks like a leftover from the charities query; it is kept so the
#     result set stays comparable with earlier runs.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%catholic relief services%' 
      OR LOWER("body") LIKE '%crs%' 
      OR LOWER("body") LIKE '%world vision%' 
      OR LOWER("body") LIKE '%samaritan%' 
      OR LOWER("body") LIKE '%jewish federations of north america%' 
      OR LOWER("body") LIKE '%islamic relief worldwide%' 
      OR LOWER("body") LIKE '%buddhist global relief%' 
      OR LOWER("body") LIKE '%the salvation army%' 
      OR LOWER("body") LIKE '%christian aid%' 
      OR LOWER("body") LIKE '%lutheran world relief%' 
      OR LOWER("body") LIKE '%tzu chi foundation%'
      OR LOWER("body") LIKE '%susan g. komen for the cure%'
      OR LOWER("body") LIKE '%care%'
      OR LOWER("body") LIKE '%cooperative for assistance and relief everywhere%'
      OR LOWER("body") LIKE '%habitat for humanity%'
      OR LOWER("body") LIKE '%church world service%'
      OR LOWER("body") LIKE '%cws%'
      OR LOWER("body") LIKE '%heifer international%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Start the query execution. start_time must stay a notebook-level name:
# download_and_load_query_results reads it when it logs the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

# Wait for the query to finish and load its CSV result into a DataFrame
logger.info(response)
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Save the filtered comments to a CSV file
output_file = "Query_A3e-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
logger.info(f"Data fetched and committed in {time.time() - start_time}s")

# Preview the filtered DataFrame
print(df_filtered_comments.head())

[2024-11-24 19:09:26,269] p112 {1838044428.py:47} INFO - {'QueryExecutionId': '88607f8d-b761-4686-9b44-2c4ae0cd7f7a', 'ResponseMetadata': {'RequestId': 'e0e7a879-f71c-45b4-a034-6462ee35569e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:09:26 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': 'e0e7a879-f71c-45b4-a034-6462ee35569e'}, 'RetryAttempts': 0}}
[2024-11-24 19:09:26,270] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:09:30,012] p112 {272223420.py:17} INFO - Time to complete query: 3.8777997493743896s
[2024-11-24 19:09:30,020] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/88607f8d-b761-4686-9b44-2c4ae0cd7f7a.csv, to local file athena_query_results.csv
[2024-11-24 19:09:30,103] p112 {272223420.py:31} INFO - results dataframe shape is (58, 4)
[2024-11-24 19:09:30,111] p112 {1838044428.py:53} INFO - R

                user_id                                               body  \
0          wanna_be_doc  This really doesn’t sound like lung cancer at ...   
1  Defiant_Brother_1172  I had to comment here, not because I can think...   
2         cajunlady1972  By no means am I a doctor & I don’t have child...   
3       No-Frosting3857  Just one? “The Real Anthony Fauci reveals how ...   
4             BestiaVir  I don't know if this helps. But now is the bes...   

   created_utc            subreddit  
0   1719828367              AskDocs  
1   1719828932     CancerCaregivers  
2   1719830928  doihavebreastcancer  
3   1719830990         publichealth  
4   1719831260          braincancer  


In [12]:
# A3f: pull comments whose body mentions "cancer" together with at least one
# researcher/clinician term, a named cancer institute, or the word "trust",
# restricted to the listed health-related subreddits.
# Matching is case-insensitive substring matching (LOWER(...) + LIKE '%...%'),
# so short patterns such as 'msk' or 'trust' also match inside longer words.
q = '''
SELECT "author" as user_id, "body", "created_utc", "subreddit"
FROM "AwsDataCatalog"."a05"."a05"
WHERE LOWER("body") LIKE '%cancer%' 
  AND (
      LOWER("body") LIKE '%researcher%' 
      OR LOWER("body") LIKE '%scientist%' 
      OR LOWER("body") LIKE '%physicians%' 
      OR LOWER("body") LIKE '%md anderson cancer center%' 
      OR LOWER("body") LIKE '%memorial sloan kettering cancer center%' 
      OR LOWER("body") LIKE '%msk%' 
      OR LOWER("body") LIKE '%mayo clinic cancer center%' 
      OR LOWER("body") LIKE '%johns hopkins sidney kimmel comprehensive cancer center%' 
      OR LOWER("body") LIKE '%cleveland clinic%' 
      OR LOWER("body") LIKE '%ucla medical center%' 
      OR LOWER("body") LIKE '%massachusetts general hospital cancer center%'
      OR LOWER("body") LIKE '%duke cancer institute%'
      OR LOWER("body") LIKE '%stanford cancer institute%'
      OR LOWER("body") LIKE '%university of california%'
      OR LOWER("body") LIKE '%san francisco medical center%'
      OR LOWER("body") LIKE '%ucsf%'
      OR LOWER("body") LIKE '%northwestern medicine feinberg school of medicine%'
      OR LOWER("body") LIKE '%university of pennsylvania abramson cancer center%'
      OR LOWER("body") LIKE '%roswell park comprehensive cancer center%'
      OR LOWER("body") LIKE '%fred hutchinson cancer research center%'
      OR LOWER("body") LIKE '%trust%'
  ) AND "subreddit" IN ('CrohnsDisease', 'thyroidcancer', 'AskDocs',
'UlcerativeColitis', 'Autoimmune', 'BladderCancer', 'breastcancer', 
'CancerFamilySupport','doihavebreastcancer',
'WomensHealth', 'ProstateCancer', 'cll' ,'Microbiome', 'predental','endometrialcancer',
'cancer','Hashimotos', 'coloncancer','PreCervicalCancer','lymphoma', 'Lymphedema', 'CancerCaregivers',  'braincancer', 'lynchsyndrome',
'nursing','testicularcancer','leukemia','publichealth', 'Health','Fuckcancer','HealthInsurance','BRCA', 'Cancersurvivors','pancreaticcancer', 'skincancer',
'stomachcancer')
'''

# Request settings: Athena writes the result CSV under the S3 staging
# directory and is asked to use the SSE_S3 encryption option for it.
execution_context = {"Database": SCHEMA_NAME}
result_configuration = {
    "OutputLocation": S3_STAGING_DIR,
    "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
}

# NOTE: `start_time` must stay a module-level name with exactly this spelling;
# download_and_load_query_results reads it when logging the query duration.
start_time = time.time()
response = athena_client.start_query_execution(
    QueryString=q,
    QueryExecutionContext=execution_context,
    ResultConfiguration=result_configuration,
)
logger.info(response)

# Wait for the query to finish and load its result file into a DataFrame.
df_filtered_comments = download_and_load_query_results(athena_client, response)

# Keep a local copy of the result under a query-specific file name.
output_file = "Query_A3f-1.csv"
df_filtered_comments.to_csv(output_file, index=False)
logger.info(f"Results saved to {output_file}")
elapsed_seconds = time.time() - start_time
logger.info(f"Data fetched and committed in {elapsed_seconds}s")

# Quick sanity check: show the first rows of the result.
print(df_filtered_comments.head())

[2024-11-24 19:09:55,095] p112 {873688501.py:49} INFO - {'QueryExecutionId': '4de1e8ab-e7ad-4073-aed4-30ac5f5ef828', 'ResponseMetadata': {'RequestId': '258071bc-81df-4988-9b2b-94d6407b0d3c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Sun, 24 Nov 2024 19:09:55 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '258071bc-81df-4988-9b2b-94d6407b0d3c'}, 'RetryAttempts': 0}}
[2024-11-24 19:09:55,096] p112 {272223420.py:4} INFO - download_and_load_query_results, enter
[2024-11-24 19:09:59,856] p112 {272223420.py:17} INFO - Time to complete query: 4.85335898399353s
[2024-11-24 19:09:59,864] p112 {272223420.py:24} INFO - downloading file from S3_BUCKET_NAME=athena-imb59, s3_path=data/a05/4de1e8ab-e7ad-4073-aed4-30ac5f5ef828.csv, to local file athena_query_results.csv
[2024-11-24 19:09:59,939] p112 {272223420.py:31} INFO - results dataframe shape is (13, 4)
[2024-11-24 19:09:59,944] p112 {873688501.py:55} INFO - Resul

          user_id                                               body  \
0   cajunlady1972  By no means am I a doctor & I don’t have child...   
1       BestiaVir  I don't know if this helps. But now is the bes...   
2  rubyslippers3x  I did IV, no port. I used ice bags on my hands...   
3    lil_Jakester  Today was cousin went through with the decisio...   
4          allemm  I have a port. It's my second one. First one I...   

   created_utc            subreddit  
0   1719830928  doihavebreastcancer  
1   1719831260          braincancer  
2   1719802713         breastcancer  
3   1719806973  CancerFamilySupport  
4   1719808898         breastcancer  


In [13]:
# Combine the saved per-query result files into one dataset.
import pandas as pd

# Result files in the order their rows should appear in the merged output.
query_result_files = (
    'Query_A2b-1.csv',
    'Query_A3a-1.csv',
    'Query_A3b-1.csv',
    'Query_A3c-1.csv',
    'Query_A3d-1.csv',
    'Query_A3e-1.csv',
    'Query_A3f-1.csv',
)

# Load each file into its own named frame so the individual query results
# remain available for inspection after the merge.
(
    df_query_a2b,
    df_query_a3a,
    df_query_a3b,
    df_query_a3c,
    df_query_a3d,
    df_query_a3e,
    df_query_a3f,
) = (pd.read_csv(csv_path) for csv_path in query_result_files)

# Stack the frames row-wise with a fresh 0..n-1 index. Rows returned by more
# than one query are kept as-is (no de-duplication is performed here).
frames_in_order = [
    df_query_a2b,
    df_query_a3a,
    df_query_a3b,
    df_query_a3c,
    df_query_a3d,
    df_query_a3e,
    df_query_a3f,
]
merged_df = pd.concat(frames_in_order, ignore_index=True)
merged_df.to_csv('Merged_data-user_ids.csv', index=False)