In [1]:
import os
import sys
import requests
import json

import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from tqdm import tqdm

AXIS_FONT_SIZE = 16

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# print(notebook_dir)
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing
from real_data_acquisition import OpenMeasuresDirector

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Create prompt

In [3]:
prediction_properties = """a prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the sports domain.
        - Can be a person (with a name) or a sports domain person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc), civilian.
        - Can only be an organization that is associated with the sports prediction.
    2. <p_t>, any target entity in the sports domain.
	    - Can be a person (with a name) or a sports person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc).
        - Can only be an organization that is associated with the sports prediction.
    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.
        - Forecast can range from a second to anytime in the future.
        - Answers the questions: "How far to go out from today?" or "Where to stop?".
    4. <p_o>, sports prediction outcome.
        - Details relevant details such as outcome, a quantifiable metric, or slope.
        - Some example outcomes are the following: score, touchdown, goal, points, win, lose, etc.
"""

prediction_structures = """Here are how some sports predictions are structured:
    - sports template 1: <p_s> forecasts that the <p_o> at <p_t> potentially decrease in <p_d>.
    - sports template 2: On <p_d>, <p_s> speculates the <p_o> at <p_t> will likely increase.
    - sports template 3: <p_s> predicts on <p_d>, the <p_t> <p_o> may rise.
    - sports template 4: According to <p_s>, the <p_o> at <p_t> would fall in <p_d>.
    - sports template 5: In <p_d>, <p_s> envisions that <p_t> <p_o> has some probability to remain stable.
    - sports template 6: <p_t> <p_o> should stay same <p_d>, according to <p_s>. 
"""

sport_examples = """Here are some corresponding examples of sports predictions:
    - sport examples for template 1:
        1. Coach Lisa Martinez predicts that the touchdown rate at the Miami Dolphins will fall in 2020 of October.
        2. Analyst Mark Johnson forecasts that the goal average at Manchester United will stay the same in November 2025.
        3. Ryan forecasts win percentage he has for soccer will go up in 12/25/2016.
    - sport examples for template 2:
        1. On Sep 20, 2100, Coach Maria Lopez suggests that the score average at the Chicago Bulls is climbing.
        2. On 9/12/2025, Analyst David Kim anticipates the touchdown rate at the Kansas City Chiefs will likely surge.
        3. On October 8, 2123, Detravious foresees that the win probability he has for rugby is expected to trend downward.
    - sport examples for template 3:
        1. Coach Elena Ruiz predicts on 9/22/2025, the goal count at Real Madrid will climb.
        2. Analyst Marcus Lee forecasts that on Sep 30, 2055, the point average at the Golden State Warriors will be higher.
        3. George Jr. estimates that on October 15, 2035, the win ratio for games he has will disimprove.
    - sport examples for template 4:
        1. According to Coach Sarah Nguyen, the scoring average at the Dallas Mavericks is expected to dip in Sep 2021.
        2. According to Analyst Trevor Simmons, the touchdown rate at the Green Bay Packers will increase in 10/2025.
        3. According to Manchester United, the win percentage at Manchester United is projected to drop in October 2034.
    - sport examples for template 5:
        1. In 9/2025, Coach Miguel Torres envisions that the goal average at Paris Saint-Germain will hold steady.
        2. In October 2056, Analyst Fiona Bennett anticipates that the win rate at the Toronto Raptors will decrease slightly.
        3. In Sep 2086, Calvin foresees that the points per game he has in football will gradually increase.
    - sport examples for template 6:
        1. The goal count at Liverpool FC will surge in Sep 2012, according to Coach Daniel Alvarez.
        2. The win percentage at the Chicago Bears will taper off in October 2025, according to Analyst Priya Sharma.
        3. The scoring average on Arnolds footbal team will remain steady in 10/2034, according to Arnold.
"""

sport_requirements = """- Should be based on real-world sports.
    - Suppose the time when <p> was made is during any season of sports.
    - Include reports from all sports professionals, coaches, or any type of sport entity.
"""

initial_query_string = ["""(NFL OR nfl) AND (playoffs) AND (Super Bowl)"""]
# query_strings = [
#     """(NFL OR nfl OR National Football League) AND (playoffs OR post season) AND (Super Bowl OR Championship)""",
#     """(NBA OR nba OR National Basketball Association) AND (playoffs OR post season) AND (Finals OR Championship)""",
#     """(MLB OR mlb OR Major League Baseball) AND (playoffs OR post season) AND (World Series OR Championship)""", 
#     """(NHL OR nhl OR National Hockey League) AND (playoffs OR post season) AND (Stanley Cup OR Championship)""",
#     """(MLS OR mls OR Major League Soccer) AND (playoffs OR post season) AND (MLS Cup OR Championship)""",
#     """(NCAA Football OR college football OR CFB) AND (playoffs OR post season OR bowl games) AND (National Championship OR CFP Championship)""",
#     """(NCAA Basketball OR college basketball) AND (playoffs OR March Madness) AND (Elite Eight OR Final Four OR National Championship)""",
#     """(WNBA OR wnba OR Women's National Basketball Association) AND (playoffs OR post season) AND (Finals OR Championship)""",
#     """(UFC OR ufc OR Ultimate Fighting Championship) AND (title fight OR championship) AND (main event OR pay-per-view)"""
# ]

# query_string_2 = [
#     """NFL OR NBA OR MLB OR NHL OR MLS OR NCAA Football OR NCAA Basketball OR WNBA OR UFC"""
#     """National Football League OR National Basketball Association OR Major League Baseball OR National Hockey League OR
#     Major League Soccer OR college football OR college basketball"""
#     """playoffs OR post season OR play-in tournament OR bowl game OR March Madness OR Elite Eight OR Final Four OR title fight"""
#     """Super Bowl OR Championship OR Finals OR World Series OR Stanley Cup OR MLS Cup OR CFP Championship OR Title Fight or Main Event OR Pay-per-View"""
# ]

# query_strings = [
#     "(NFL AND 'quarterfinals' OR 'semifinals') AND ('Super Bowl' OR 'Championship')",
#     "(NBA AND 'playoffs' OR 'post season') AND ('Finals' OR 'Championship')",
#     "(MLB AND 'playoffs' OR 'post season') AND ('World Series' OR 'Championship')",
#     "(NHL AND 'playoffs' OR 'post season') AND ('Stanley Cup' OR 'Championship')",
#     "(MLS AND 'playoffs' OR 'post season') AND ('MLS Cup' OR 'Championship')",
#     "(NCAA Football AND 'playoffs' OR 'post season' OR 'bowl games') AND ('National Championship' OR 'CFP Championship')",
#     "(NCAA Basketball AND 'playoffs' OR 'March Madness') AND ('Elite Eight' OR 'Final Four' OR 'National Championship')",
#     "(WNBA AND 'playoffs' OR 'post season') AND ('Finals' OR 'Championship')",
#     "(UFC AND 'playoffs' OR 'post season') AND ('title fight' OR 'championship')"
# ]

In [4]:
prompt = f"""Generate a query string using boolean logic and keywords (related to sports predictions) to search a database. I define {prediction_properties} 
{prediction_structures}
{sport_examples}
These sports predictions can be found in social media data at large. My task here is to query the site to find relatable sentences (that aren't predictions) and prediction sentences). 
My initial query string: {initial_query_string[0]}. Don't use brackets to wrap words nor to use quotation marks to wrap words. 
I need you to generate an improved (better prediction precision) query string taking into consideration the above along with {sport_requirements} \n Don't generate anything other than a new/imporved query string!
"""
prompt

'Generate a query string using boolean logic and keywords (related to sports predictions) to search a database. I define a prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:\n\n    1. <p_s>, any source entity in the sports domain.\n        - Can be a person (with a name) or a sports domain person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc), civilian.\n        - Can only be an organization that is associated with the sports prediction.\n    2. <p_t>, any target entity in the sports domain.\n\t    - Can be a person (with a name) or a sports person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc).\n        - Can only be an organization that is associated with the sports prediction.\n    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.\n        - Forecast 

## Query for Data

- For query string, have user define `initial_query_string` or have any LLM in `text_generation_models.py` to generate via the prompt. Either way, the system is set up for user feedback. With this, check the query string (and url for data). If good with it type 'agree'. If not, add details. The details will append to old prompt.

In [5]:
# Configuring parameters
terms_for_query = OpenMeasuresDirector
query_string_by = 'user'
limit = 100
# NFL playoffs (The NFL playoffs for the 2024 season began on January 11, 2025, and concluded with Super Bowl LIX on February 9 at Caesars Superdome in New Orleans, Louisiana | WIKI)
since = '2000-09-05'
until = '2015-09-05' 
esquery = 'query_string' # Elasticsearch across all fields

# sites = ["tiktok_comment", "bluesky", "truth_social"]
sites = ["bluesky", "truth_social"]
hits_per_site_dfs = []
for site in sites:
    for query_string in initial_query_string: 
        print(f"Query {site} with query string: {query_string}")
        hits_for_site_df = OpenMeasuresDirector.construct_from_dataset(query_string=query_string, query_string_by=query_string_by, limit=limit, site=site, start_date=since, end_date=until, querytype=esquery)
        hits_per_site_dfs.append(hits_for_site_df)

hits_per_site_dfs

Query bluesky with query string: (NFL OR nfl) AND (playoffs) AND (Super Bowl)
### RESET ###
### USER SPECIFY QUERY STRINGS ###
	Query String: (<class 'str'>, '(NFL OR nfl) AND (playoffs) AND (Super Bowl)')

### SET QUERY ###
	Query's URL: http://api.smat-app.com/content?term=(NFL OR nfl) AND (playoffs) AND (Super Bowl)&limit=100&site=bluesky&since=2000-09-05&until=2015-09-05&querytype=query_string

### GET RAW HITS ###
	No hits/Failed to retrieve data: 429
Query truth_social with query string: (NFL OR nfl) AND (playoffs) AND (Super Bowl)
### RESET ###
### USER SPECIFY QUERY STRINGS ###
	Query String: (<class 'str'>, '(NFL OR nfl) AND (playoffs) AND (Super Bowl)')

### SET QUERY ###
	Query's URL: http://api.smat-app.com/content?term=(NFL OR nfl) AND (playoffs) AND (Super Bowl)&limit=100&site=truth_social&since=2000-09-05&until=2015-09-05&querytype=query_string

### GET RAW HITS ###
	No hits/Failed to retrieve data: 429


[[], []]

## Data Processing

+ Data per site (bluesky and truth social) is stored in list called `hits_per_site_dfs`.
+ Each site has data (pd.DataFrame) in list.
+ Concat all DFs per site.
+ Per Dr. Grant, time for each query process.

In [6]:
# tiktok_dfs = hits_per_site_dfs[0]
# tiktok_df = DataProcessing.concat_dfs(tiktok_dfs)

bluesky_dfs = hits_per_site_dfs[0]
bluesky_df = DataProcessing.concat_dfs(bluesky_dfs)

truth_social_dfs = hits_per_site_dfs[1]
truth_social_df = DataProcessing.concat_dfs(truth_social_dfs)
truth_social_df['text'] = truth_social_df['content_cleaned']
# true_social_df.tail(3)

ValueError: No objects to concatenate

In [None]:
truth_social_dfs

## Save Data

- Previously, I was saving a new file everytime. Now, I'm thinking to save to same file. Stopping mid process as Dr. Grant wants us to collect already existing datasets instead of building own dataset.

In [None]:
base_file = False

if base_file:
    print("####### Save Original #######")
    # path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'bluesky_raw_data/')
    path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'practice/')
    # prefix = f"bluesky-{limit}-{since}_to_{until}"
    prefix = "bluesky"
    DataProcessing.save_to_file(bluesky_df, path, prefix, 'csv')

    path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'practice/')
    prefix = "truth_social"
    DataProcessing.save_to_file(truth_social_df, path, prefix, 'csv')
else:
    print("####### Read, Update, Save Original + New #######")
    path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'practice/bluesky-v1.csv')
    original_bluesky_df = pd.read_csv(path)
    updated_bluesky_df = DataProcessing.concat_dfs([original_bluesky_df, bluesky_df])
    prefix = "bluesky"
    # DataProcessing.save_to_file(updated_bluesky_df, path, prefix, 'csv')

    # path = os.path.join(notebook_dir, '../', 'data/open_measures/', 'practice/truth_social-v1.csv')
    # original_truth_social_df = pd.read_csv(path)
    # updated_truth_social_df = DataProcessing.concat_dfs([original_truth_social_df, truth_social_df])
    # prefix = "bluesky"
    # DataProcessing.save_to_file(updated_bluesky_df, path, prefix, 'csv')


In [None]:
original_bluesky_df.drop(original_bluesky_df.columns[original_bluesky_df.columns.str.contains('^Unnamed')], axis=1, inplace=True)

In [None]:
bluesky_df['text']

In [None]:
original_bluesky_df['text'] == bluesky_df['text']

In [None]:
bluesky_df

In [None]:
updated_bluesky_df