In [1]:
import os
import sys
import requests
import json

import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from tqdm import tqdm

# Font-size constant (the name suggests plot axis labels; not used in the visible cells).
AXIS_FONT_SIZE = 16

# Working directory of the kernel, treated as the notebook's own folder.
notebook_dir = os.getcwd()

# Put the parent folder on the import path so the project modules
# imported right below can be resolved.
sys.path.extend([os.path.join(notebook_dir, '../')])

from data_processing import DataProcessing
from data_acquisition import OpenMeasuresDirector
from text_generation_models import TextGenerationModelFactory

In [2]:
# Display settings: allow up to 800 characters per cell, and lift the
# limits on how many columns and rows are rendered.
pd.options.display.max_colwidth = 800
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Query configuration.
# Alternative, sports-only query kept for reference: "(nba OR mlb OR nfl) AND (will win)"
terms = "will win"
terms_for_query = OpenMeasuresDirector
limit = 100
since = '2024-01-01'
until = '2025-02-18'
esquery = 'query_string'  # Elasticsearch across all fields

sites = ["tiktok_comment", "bluesky", "truth_social"]

# Collect one result per site, in the same order as `sites`;
# later cells pick the results out of this list by position.
hits_per_site_dfs = []
for site in sites:
    hits_for_site_df = OpenMeasuresDirector.construct_from_dataset(
        terms=terms,
        limit=limit,
        site=site,
        start_date=since,
        end_date=until,
        querytype=esquery,
    )
    hits_per_site_dfs.append(hits_for_site_df)

Query's URL: http://api.smat-app.com/content?term=will win&limit=100&site=tiktok_comment&since=2024-01-01&until=2025-02-18&querytype=query_string


KeyError: 'hits'

In [None]:
# Inspect the raw query results: one entry per site, in the order given by `sites`.
hits_per_site_dfs

In [None]:
# Pick the per-site results out of the list by position; the order follows
# `sites`: index 0 is "tiktok_comment", index 1 is "bluesky", index 2 is "truth_social".
tiktok_df = hits_per_site_dfs[0]
bluesocial_df = hits_per_site_dfs[1]
true_social_df = hits_per_site_dfs[2]
# Copy `content_cleaned` into a `text` column, which is the column
# detect_predictions_with_llms reads. The write is in place, so the object
# stored at hits_per_site_dfs[2] gains the column as well.
true_social_df['text'] = true_social_df['content_cleaned']
true_social_df.head(3)

In [None]:
# true_social_df['text'] == true_social_df['content_cleaned']

In [None]:
tgmf = TextGenerationModelFactory()

# Model identifiers on Groq Cloud (https://console.groq.com/docs/overview),
# in the order the models are queried.
groq_model_ids = [
    'gemma2-9b-it',
    'llama-3.1-8b-instant',
    'llama-3.3-70b-versatile',
    'meta-llama/llama-guard-4-12b',
]

# One instance per identifier; `models` is the list iterated by
# detect_predictions_with_llms.
models = [tgmf.create_instance(model_id) for model_id in groq_model_ids]

# Individual handles, bound in the same order as the identifiers above.
(
    gemma_29b_generation_model,
    llama_318b_instant_generation_model,
    llama_3370b_versatile_generation_model,
    llama_guard_4_12b_generation_model,
) = models

In [None]:
import json
import os

def save_to_json(data, path, site):
    """Write `data` as JSON to the next numbered file for `site` inside `path`.

    Files are named ``<site>s-<n>.json``, where ``n`` is one more than the
    number of entries in `path` whose names already start with ``<site>s-``.

    Args:
        data: JSON-serialisable object to store.
        path: Directory that receives the file; created if it does not exist.
        site: Site name; an "s" is appended to form the file-name prefix.

    Raises:
        TypeError: if `data` cannot be serialised by `json.dump`.
        OSError: if the directory cannot be created or listed, or the file
            cannot be written.
    """
    site = f"{site}s"
    print(site)
    # The first save for a site should not fail just because the folder is missing.
    os.makedirs(path, exist_ok=True)
    # The prefix has to be interpolated (f-string). A literal '{site}-' matches
    # no file name, which would make every call write number 1 and overwrite
    # the previous file.
    prefix = f'{site}-'
    file_number = sum(1 for file in os.listdir(path) if file.startswith(prefix)) + 1
    file_name = f'{site}-{file_number}.json'
    file_path = os.path.join(path, file_name)
    with open(file_path, 'w') as f:
        json.dump(data, f)

In [None]:
def detect_predictions_with_llms(df: pd.DataFrame, notebook_dir: str, site: str):
    """Ask every model in the global `models` list to label each sentence of `df`.

    The sentences come from the `text` column of `df`. They are processed in
    batches of 50; for each sentence every model receives the same prompt and
    each non-blank line of its reply becomes one record. After every batch the
    records gathered so far (cumulative, not just that batch) are passed to
    `save_to_json`, targeting
    ``<parent of notebook_dir>/data/open_measures/<site>``.

    Args:
        df: Frame holding the sentences in its `text` column.
        notebook_dir: Directory whose parent contains the `data` folder.
        site: Site label; used in the preview output, the save folder and the
            saved file name.

    Returns:
        A list of dicts with the keys "sentence", "model" and "label".

    NOTE(review): `model.__name__()` is invoked as a method, so the model
    objects must define `__name__` as a callable — confirm against the model
    classes.
    """
    collected = []
    chunk_size = 50
    previews_left = 3  # only the first three sentences are echoed

    sentences = DataProcessing.df_to_list(df, col='text')

    for start in tqdm(range(0, len(sentences), chunk_size)):
        chunk = sentences[start:start + chunk_size]
        for position in tqdm(range(len(chunk))):
            sentence = chunk[position]
            if previews_left > 0:
                print(f"       {sentence} --- {site}")
                previews_left -= 1
            prompt = f"Given this sentence ({sentence}), state if the sentence is a prediction, not a prediction, or not enough information. Also, if it is a prediction, state the prediction domain if any are finance, health, weather, policy, sports, or miscellaneous. Do not explain or provide any other details. Only state prediction, not a prediction, or not enough information."
            for model in models:
                reply = model.chat_completion([model.user(prompt)])
                # One record per non-blank line of the model's answer.
                collected.extend(
                    {"sentence": sentence, "model": model.__name__(), "label": line}
                    for line in reply.split("\n")
                    if line.strip()
                )
        # Checkpoint after every batch with everything gathered so far.
        target_dir = os.path.join(os.path.dirname(notebook_dir), 'data', 'open_measures', site)
        save_to_json(collected, target_dir, site)
    return collected

In [None]:
# Site label read from the row labelled 0 of the TikTok results.
tiktok_site = tiktok_df['Site'][0]
llms_generated_for_tiktok_comments = detect_predictions_with_llms(
    tiktok_df, notebook_dir, tiktok_site
)
# Tabulate the records with capitalised column names and tag them with the site.
llms_generated_for_tiktok_comments_df = (
    pd.DataFrame(llms_generated_for_tiktok_comments)
    .rename(columns={'sentence': 'Text', 'model': 'Model', 'label': 'Label'})
    .assign(Site=tiktok_site)
)
llms_generated_for_tiktok_comments_df.head(7)

In [None]:
# detect_predictions_with_llms requires a `site` argument (save folder and file
# name depend on it); calling it without one raises TypeError.
# NOTE(review): assumes the Bluesky results carry a 'Site' column like the
# TikTok results do — confirm against construct_from_dataset.
bluesocial_site = bluesocial_df['Site'][0]
llms_generated_for_bluesocial_comments = detect_predictions_with_llms(
    bluesocial_df, notebook_dir, bluesocial_site
)
# Same shape as the TikTok table: capitalised columns plus the site tag.
llms_generated_for_bluesocial_comments_df = (
    pd.DataFrame(llms_generated_for_bluesocial_comments)
    .rename(columns={'sentence': 'Text', 'model': 'Model', 'label': 'Label'})
    .assign(Site=bluesocial_site)
)
llms_generated_for_bluesocial_comments_df.head(7)

In [None]:
# detect_predictions_with_llms requires a `site` argument (save folder and file
# name depend on it); calling it without one raises TypeError.
# NOTE(review): assumes the Truth Social results carry a 'Site' column like the
# TikTok results do — confirm against construct_from_dataset.
true_social_site = true_social_df['Site'][0]
llms_generated_for_true_social_comments = detect_predictions_with_llms(
    true_social_df, notebook_dir, true_social_site
)
# Same shape as the TikTok table: capitalised columns plus the site tag.
llms_generated_for_true_social_comments_df = (
    pd.DataFrame(llms_generated_for_true_social_comments)
    .rename(columns={'sentence': 'Text', 'model': 'Model', 'label': 'Label'})
    .assign(Site=true_social_site)
)
llms_generated_for_true_social_comments_df.head(7)