In [1]:
import os
import sys
import requests
import json

import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from tqdm import tqdm

# Font size constant for plot axes (not referenced in the cells shown here).
AXIS_FONT_SIZE = 16

# Working directory of the notebook kernel; later cells build data paths from it.
notebook_dir = os.getcwd()

# Make the parent directory importable so the project modules imported
# below (log_files, data_processing, ...) can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again and again.
parent_dir = os.path.join(notebook_dir, '../')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from log_files import LogData
from data_processing import DataProcessing
from real_data_acquisition import OpenMeasuresDirector
from text_generation_models import TextGenerationModelFactory

In [2]:
# Show up to 800 characters per cell so long post texts are readable.
# The fully-qualified option name is used on purpose: abbreviated names
# such as 'max_colwidth' rely on partial matching, which can break if
# pandas ever adds another option with a similar name.
pd.set_option('display.max_colwidth', 800)
# Optional toggles for very wide / long frames:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

## Create prompt

In [3]:
# Prompt fragment: defines the prediction tuple <p> = (<p_s>, <p_t>, <p_d>, <p_o>)
# (source, target, date, outcome). It is interpolated verbatim into `prompt`,
# so every character below (including whitespace) is part of the text sent on.
prediction_properties = """a prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:

    1. <p_s>, any source entity in the sports domain.
        - Can be a person (with a name) or a sports domain person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc), civilian.
        - Can only be an organization that is associated with the sports prediction.
    2. <p_t>, any target entity in the sports domain.
	    - Can be a person (with a name) or a sports person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc).
        - Can only be an organization that is associated with the sports prediction.
    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.
        - Forecast can range from a second to anytime in the future.
        - Answers the questions: "How far to go out from today?" or "Where to stop?".
    4. <p_o>, sports prediction outcome.
        - Details relevant details such as outcome, a quantifiable metric, or slope.
        - Some example outcomes are the following: score, touchdown, goal, points, win, lose, etc.
"""

# Prompt fragment: six sentence templates showing how the four prediction
# properties can be arranged. Interpolated verbatim into `prompt`.
prediction_structures = """Here are how some sports predictions are structured:
    - sports template 1: <p_s> forecasts that the <p_o> at <p_t> potentially decrease in <p_d>.
    - sports template 2: On <p_d>, <p_s> speculates the <p_o> at <p_t> will likely increase.
    - sports template 3: <p_s> predicts on <p_d>, the <p_t> <p_o> may rise.
    - sports template 4: According to <p_s>, the <p_o> at <p_t> would fall in <p_d>.
    - sports template 5: In <p_d>, <p_s> envisions that <p_t> <p_o> has some probability to remain stable.
    - sports template 6: <p_t> <p_o> should stay same <p_d>, according to <p_s>. 
"""

# Prompt fragment: three example sentences for each of the six templates
# listed in `prediction_structures`. Interpolated verbatim into `prompt`.
sport_examples = """Here are some corresponding examples of sports predictions:
    - sport examples for template 1:
        1. Coach Lisa Martinez predicts that the touchdown rate at the Miami Dolphins will fall in 2020 of October.
        2. Analyst Mark Johnson forecasts that the goal average at Manchester United will stay the same in November 2025.
        3. Ryan forecasts win percentage he has for soccer will go up in 12/25/2016.
    - sport examples for template 2:
        1. On Sep 20, 2100, Coach Maria Lopez suggests that the score average at the Chicago Bulls is climbing.
        2. On 9/12/2025, Analyst David Kim anticipates the touchdown rate at the Kansas City Chiefs will likely surge.
        3. On October 8, 2123, Detravious foresees that the win probability he has for rugby is expected to trend downward.
    - sport examples for template 3:
        1. Coach Elena Ruiz predicts on 9/22/2025, the goal count at Real Madrid will climb.
        2. Analyst Marcus Lee forecasts that on Sep 30, 2055, the point average at the Golden State Warriors will be higher.
        3. George Jr. estimates that on October 15, 2035, the win ratio for games he has will disimprove.
    - sport examples for template 4:
        1. According to Coach Sarah Nguyen, the scoring average at the Dallas Mavericks is expected to dip in Sep 2021.
        2. According to Analyst Trevor Simmons, the touchdown rate at the Green Bay Packers will increase in 10/2025.
        3. According to Manchester United, the win percentage at Manchester United is projected to drop in October 2034.
    - sport examples for template 5:
        1. In 9/2025, Coach Miguel Torres envisions that the goal average at Paris Saint-Germain will hold steady.
        2. In October 2056, Analyst Fiona Bennett anticipates that the win rate at the Toronto Raptors will decrease slightly.
        3. In Sep 2086, Calvin foresees that the points per game he has in football will gradually increase.
    - sport examples for template 6:
        1. The goal count at Liverpool FC will surge in Sep 2012, according to Coach Daniel Alvarez.
        2. The win percentage at the Chicago Bears will taper off in October 2025, according to Analyst Priya Sharma.
        3. The scoring average on Arnolds footbal team will remain steady in 10/2034, according to Arnold.
"""

# Prompt fragment: extra constraints on the generated predictions.
# Interpolated verbatim into the last sentence of `prompt`.
sport_requirements = """- Should be based on real-world sports.
    - Suppose the time when <p> was made is during any season of sports.
    - Include reports from all sports professionals, coaches, or any type of sport entity.
"""

# Hand-written boolean query. It is quoted inside `prompt` as the starting
# point and is also passed directly as `query_string` in the query cell.
initial_query_string = """(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)"""

In [4]:
# Assemble the full prompt: task statement, the prediction definition,
# templates and examples, the current query string, and the requirements.
# The text asks for an improved boolean query string and nothing else.
# NOTE(review): the query cell shown in this notebook runs with
# query_string_by = 'user', so `prompt` is only displayed here — presumably it
# is consumed when an LLM generates the query string; confirm against
# OpenMeasuresDirector.
prompt = f"""Generate a query string using boolean logic and keywords (related to sports predictions) to search a database. I define {prediction_properties} 
{prediction_structures}
{sport_examples}
These sports predictions can be found in social media data at large. My task here is to query the site to find relatable sentences (that aren't predictions) and prediction sentences). 
My initial query string: {initial_query_string}. Don't use brackets to wrap words nor to use quotation marks to wrap words. 
I need you to generate an improved (better prediction precision) query string taking into consideration the above along with {sport_requirements} \n Don't generate anything other than a new/imporved query string!
"""
prompt

'Generate a query string using boolean logic and keywords (related to sports predictions) to search a database. I define a prediction <p> = (<p_s>, <p_t>, <p_d>, <p_o>), where it consists of the following four properties:\n\n    1. <p_s>, any source entity in the sports domain.\n        - Can be a person (with a name) or a sports domain person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc), civilian.\n        - Can only be an organization that is associated with the sports prediction.\n    2. <p_t>, any target entity in the sports domain.\n\t    - Can be a person (with a name) or a sports person such as a sports reporter, sports analyst, sports expert, sports top executive, sports senior level person, etc).\n        - Can only be an organization that is associated with the sports prediction.\n    3. <p_d>, date or time range when <p> is expected to come to fruition or when one should observe the <p>.\n        - Forecast 

## Query for Data

- For the query string, either define `initial_query_string` yourself or have any LLM in `text_generation_models.py` generate one from the prompt. Either way, the system is set up for user feedback: check the query string (and the URL used to fetch the data). If you are happy with it, type 'agree'. If not, add details; they will be appended to the previous prompt.

In [5]:
# ---- Query parameters ------------------------------------------------------
terms_for_query = OpenMeasuresDirector  # reference to the director class; not used again in this cell
query_string_by = 'user'                # forwarded as `query_string_by`; 'user' pairs with the hand-written query string
limit = 2000                            # forwarded as `limit`

# Date window for the search.
# NFL playoffs (The NFL playoffs for the 2024 season began on January 11, 2025, and concluded with Super Bowl LIX on February 9 at Caesars Superdome in New Orleans, Louisiana | WIKI)
since = '2015-09-05'
until = '2025-02-09'

esquery = 'query_string'  # Elasticsearch across all fields

# TikTok comments are currently left out; the full list would be
# ["tiktok_comment", "bluesky", "truth_social"].
sites = ["bluesky", "truth_social"]

# ---- Run the same query against every site ----------------------------------
# One result per site is collected, in the same order as `sites`.
hits_per_site_dfs = []
for site in sites:
    request_kwargs = {
        "query_string": initial_query_string,
        "query_string_by": query_string_by,
        "limit": limit,
        "site": site,
        "start_date": since,
        "end_date": until,
        "querytype": esquery,
    }
    hits_for_site_df = OpenMeasuresDirector.construct_from_dataset(**request_kwargs)
    hits_per_site_dfs.append(hits_for_site_df)

hits_per_site_dfs

### RESET ###
### USER SPECIFY QUERY STRINGS ###
	Query String: (<class 'str'>, '(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)')

### SET QUERY ###
	Query's URL: http://api.smat-app.com/content?term=(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)&limit=2000&site=bluesky&since=2015-09-05&until=2025-02-09&querytype=query_string

### GET RAW HITS ###
	Hits: 200
Hits retrieved:
                  $type                            author  \
0    app.bsky.feed.post  did:plc:ahok3cgd4c7rhnwrnofapqls   
1    app.bsky.feed.post  did:plc:xcgwe2tojxjmatdlrd6yvfch   
2    app.bsky.feed.post  did:plc:ti665rl3446mp2hc6i47nadp   
3    app.bsky.feed.post  did:plc:tsosz6fe72jxrhelo2i3p6dy   
4    app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
..                  ...                               ...   
655  app.bsky.feed.post  did:plc:cywy7vw3shrn7vp3ybgfrx33   
656  app.bsky.feed.post  did:plc:nrr6yppar26qag7p2q3rawp7   
657  app.bsky.feed.post  did:plc:m67kp6uoter7aeftq5nkzncm   
658  app.

[[                  $type                            author  \
  0    app.bsky.feed.post  did:plc:ahok3cgd4c7rhnwrnofapqls   
  1    app.bsky.feed.post  did:plc:xcgwe2tojxjmatdlrd6yvfch   
  2    app.bsky.feed.post  did:plc:ti665rl3446mp2hc6i47nadp   
  3    app.bsky.feed.post  did:plc:tsosz6fe72jxrhelo2i3p6dy   
  4    app.bsky.feed.post  did:plc:mcb6n67plnrlx4lg35natk2b   
  ..                  ...                               ...   
  655  app.bsky.feed.post  did:plc:cywy7vw3shrn7vp3ybgfrx33   
  656  app.bsky.feed.post  did:plc:nrr6yppar26qag7p2q3rawp7   
  657  app.bsky.feed.post  did:plc:m67kp6uoter7aeftq5nkzncm   
  658  app.bsky.feed.post  did:plc:eyaz2kbzyxmg5hgkhb3w7s25   
  659  app.bsky.feed.post  did:plc:5jycdvkvabnon545dxcisari   
  
                                                                                                                                                                                                                                                 

## Data Processing

+ The data for each site (Bluesky and Truth Social) is stored in a list called `hits_per_site_dfs`.
+ Each site's entry in that list holds its data as `pd.DataFrame` objects.
+ Concatenate all DataFrames for each site.
+ Per Dr. Grant, time each query process.

In [6]:
# Pair every site name with its query result so each one is looked up by
# name instead of by list position. Positional indexes ([0], [1]) silently
# point at the wrong site as soon as `sites` is reordered or extended
# (e.g. when "tiktok_comment" is re-enabled in the query cell).
hits_by_site = dict(zip(sites, hits_per_site_dfs))

# tiktok_dfs = hits_by_site["tiktok_comment"]
# tiktok_df = DataProcessing.concat_dfs(tiktok_dfs)

bluesky_dfs = hits_by_site["bluesky"]
bluesky_df = DataProcessing.concat_dfs(bluesky_dfs)

true_social_dfs = hits_by_site["truth_social"]
true_social_df = DataProcessing.concat_dfs(true_social_dfs)
# Copy the cleaned content into the `text` column.
# NOTE(review): presumably so downstream code can read the post text from a
# `text` column for every site — confirm against the consumers.
true_social_df['text'] = true_social_df['content_cleaned']
true_social_df.tail(3)

Unnamed: 0,account,bookmarked,card,collected_by,content,content_cleaned,created_at,datatype,downvotes_count,emojis,...,application,text,tombstone,tv,group_timeline_visible,last_seen_ts,reblog,openmeasures_media,Query Params,Site
157,"{'acct': 'vickieski', 'display_name': 'Vickie Dembinski', 'id': '107834840758287063', 'username': 'vickieski'}",False,,smat-scrapy-crawlers,"<p>For some time, I feel the message to the public which many people know, the NFL, NBA etc are all manipulated. Lower terms ""rigged"", anywhere there is big money, we all know who it is controlled by. Like the Romans said, ""Make them an arena"". Only a few big players, making the big money (hush money) get the play book way before the season starts and what is going to happen and to make it happen. I believe many college sports are also, esp football $$$ .. but does not involve the players, it involves the coaches and the college, based on making the right plays or plays to make it fail on purpose. Lets sit back and watch.. plus the Super Bowl is the highest human trafficking event all year. Since White Hats have been in control, they have been scooping up many criminals involve...","For some time, I feel the message to the public which many people know, the NFL, NBA etc are all manipulated. Lower terms ""rigged"", anywhere there is big money, we all know who it is controlled by. Like the Romans said, ""Make them an arena"". Only a few big players, making the big money (hush money) get the play book way before the season starts and what is going to happen and to make it happen. I believe many college sports are also, esp football $$$ .. but does not involve the players, it involves the coaches and the college, based on making the right plays or plays to make it fail on purpose. Lets sit back and watch.. plus the Super Bowl is the highest human trafficking event all year. Since White Hats have been in control, they have been scooping up many criminals involved t...",2025-02-08T00:19:42.311+00:00,post,,[],...,,"For some time, I feel the message to the public which many people know, the NFL, NBA etc are all manipulated. Lower terms ""rigged"", anywhere there is big money, we all know who it is controlled by. 
Like the Romans said, ""Make them an arena"". Only a few big players, making the big money (hush money) get the play book way before the season starts and what is going to happen and to make it happen. I believe many college sports are also, esp football $$$ .. but does not involve the players, it involves the coaches and the college, based on making the right plays or plays to make it fail on purpose. Lets sit back and watch.. plus the Super Bowl is the highest human trafficking event all year. Since White Hats have been in control, they have been scooping up many criminals involved t...",,,,,,"[{'_hash': '00d0f292df771a8839799409f19c9fddc9b94101', 'thumbnail_hash': '2869e7dc97b3797481e043643dd0ad8fca89e31a', 'thumbnail_mimetype': 'image/jpeg', 'mimetype': 'image/jpeg', 'source_id': '113965398687669786', 'source_url': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/media_attachments/files/113/965/398/687/669/786/original/335b082a95bbbd03.jpg', 'enrichments': [{'service': 'blip', 'calculated_date': '2025-02-08T06:36:03.195116+00:00', 'type': 'image_caption', 'value': 'a screenshot of a screenshot of a screenshot of a game'}, {'service': 'tesseract', 'calculated_date': '2025-02-08T06:36:03.195128+00:00', 'type': 'OCR', 'value': 'Cy LE a ea Rea CoA Mie e Tan ean eal as (-L FATMMEsTo) (G7 W-Es3) ee eer} Q'}]}, {'_hash': '0a2eabdb13cbe2b6e94b1d772c7cdf9f8ee1a8e2', 't...","{'term': '(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)', 'limit': 2000, 'site': 'truth_social', 'since': '2015-09-05', 'until': '2025-02-09', 'querytype': 'query_string', 'model': 'user'}",truth_social
158,"{'acct': 'billrogers76', 'display_name': 'W A R Liberty', 'id': '107910638877822210', 'username': 'billrogers76'}",False,,smat-scrapy-crawlers,"<p>It is just boys playing a gameüèàüòú<br/>How much do NFL players get paid in the playoffs? The CBA also spells out how much players are paid in each *round of the postseason. Here are the üòµ‚Äçüí´bonuses for the 2024 regular season:<br/>\t‚Ä¢\tDivision winner: $54,500<br/>\t‚Ä¢\tWild Card/first-round *bye: $49,500<br/>\t‚Ä¢\tDivisional Round: $54,500<br/>\t‚Ä¢\tConference Championship: $77,000 <br/>\t‚Ä¢\tSuper Bowl *losing team: $96,000<br/>\t‚Ä¢\tSuper Bowl winning team: $171,000</p>","It is just boys playing a gameüèàüòúHow much do NFL players get paid in the playoffs? The CBA also spells out how much players are paid in each *round of the postseason. Here are the üòµ‚Äçüí´bonuses for the 2024 regular season:\t‚Ä¢\tDivision winner: $54,500\t‚Ä¢\tWild Card/first-round *bye: $49,500\t‚Ä¢\tDivisional Round: $54,500\t‚Ä¢\tConference Championship: $77,000 \t‚Ä¢\tSuper Bowl *losing team: $96,000\t‚Ä¢\tSuper Bowl winning team: $171,000",2025-02-08T07:08:42.339+00:00,post,,[],...,,"It is just boys playing a gameüèàüòúHow much do NFL players get paid in the playoffs? The CBA also spells out how much players are paid in each *round of the postseason. Here are the üòµ‚Äçüí´bonuses for the 2024 regular season:\t‚Ä¢\tDivision winner: $54,500\t‚Ä¢\tWild Card/first-round *bye: $49,500\t‚Ä¢\tDivisional Round: $54,500\t‚Ä¢\tConference Championship: $77,000 \t‚Ä¢\tSuper Bowl *losing team: $96,000\t‚Ä¢\tSuper Bowl winning team: $171,000",,,,,,,"{'term': '(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)', 'limit': 2000, 'site': 'truth_social', 'since': '2015-09-05', 'until': '2025-02-09', 'querytype': 'query_string', 'model': 'user'}",truth_social
159,"{'acct': 'nicolespink70', 'display_name': 'Nikki', 'id': '112194771027584744', 'username': 'nicolespink70'}",False,,smat-scrapy-crawlers,"<p>Really wish you would have Elon look into the referees that are in the pockets for KC. They should have never even made it to the playoffs. Many fans are disgusted with NFL and helping KC because of Taylor Swift! Many won‚Äôt watch Super Bowl because of it and are loosing fans. Remember when the fans boycotted NFL because of disrespect while our National Anthem was being played, and they did it for you as well. Please look into this so NFL doesn‚Äôt ruin the love of the great American game.Thank you for everything you do. You are Gods blessing.‚úùÔ∏èüôè‚ô•Ô∏è</p>","Really wish you would have Elon look into the referees that are in the pockets for KC. They should have never even made it to the playoffs. Many fans are disgusted with NFL and helping KC because of Taylor Swift! Many won‚Äôt watch Super Bowl because of it and are loosing fans. Remember when the fans boycotted NFL because of disrespect while our National Anthem was being played, and they did it for you as well. Please look into this so NFL doesn‚Äôt ruin the love of the great American game.Thank you for everything you do. You are Gods blessing.‚úùÔ∏èüôè‚ô•Ô∏è",2025-02-08T09:15:11.973+00:00,comment,0.0,[],...,,"Really wish you would have Elon look into the referees that are in the pockets for KC. They should have never even made it to the playoffs. Many fans are disgusted with NFL and helping KC because of Taylor Swift! Many won‚Äôt watch Super Bowl because of it and are loosing fans. Remember when the fans boycotted NFL because of disrespect while our National Anthem was being played, and they did it for you as well. Please look into this so NFL doesn‚Äôt ruin the love of the great American game.Thank you for everything you do. 
You are Gods blessing.‚úùÔ∏èüôè‚ô•Ô∏è",,,,,,,"{'term': '(NFL OR nfl) AND (playoffs) AND (Super Bowl LIX)', 'limit': 2000, 'site': 'truth_social', 'since': '2015-09-05', 'until': '2025-02-09', 'querytype': 'query_string', 'model': 'user'}",truth_social


## Save Data

In [7]:
# Write each site's frame to its own raw-data folder as CSV.
# Each entry: (target sub-folder, file-name tag, frame to save).
exports = (
    ('bluesky_raw_data/', 'blue_sky', bluesky_df),
    ('truth_social_raw_data/', 'truth_social', true_social_df),
)

for raw_data_dir, site_tag, site_df in exports:
    # Folder sits under ../data/open_measures/ relative to the notebook directory.
    path = os.path.join(notebook_dir, '../', 'data/open_measures/', raw_data_dir)
    # File-name prefix carries the site tag and the queried date window.
    prefix = f"{site_tag}-{since}_{until}"
    DataProcessing.save_to_file(site_df, path, prefix, 'csv')

Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_identification_experiments/../data/open_measures/bluesky_raw_data/blue_sky-2015-09-05_2025-02-09-1.csv
Saved to: 
	/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_identification_experiments/../data/open_measures/truth_social_raw_data/truth_social-2015-09-05_2025-02-09-1.csv
