In [1]:
import os
import sys
import requests
import json

import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from tqdm import tqdm

# Shared font-size constant (presumably for plot axis labels; it is not
# referenced in the visible cells).
AXIS_FONT_SIZE = 16

# Working directory of the notebook kernel; later cells derive the output
# directory for saved label files from it.
notebook_dir = os.getcwd()

# Make the parent directory importable (presumably where the project modules
# imported right after this live). The path is normalised and only appended
# when missing, so re-running this cell does not pile up duplicate
# "<cwd>/../" entries on sys.path.
parent_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from text_generation_models import TextGenerationModelFactory
from data_acquisition import OpenMeasuresBuilder, OpenMeasuresDirector

In [2]:
# Display settings applied notebook-wide: wide text cells (up to 800 chars)
# and no truncation of columns or rows (None disables the limit).
DISPLAY_OPTIONS = {
    'max_colwidth': 800,
    'display.max_columns': None,
    'display.max_rows': None,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [3]:
# Builder instance that is later passed to
# OpenMeasuresDirector.construct_from_dataset(builder=...).
open_measures_builder = OpenMeasuresBuilder()

In [4]:
# Echo the builder so the cell output shows its repr (sanity check that it was created).
open_measures_builder

<data_acquisition.OpenMeasuresBuilder at 0x138884310>

In [None]:
# Search parameters for the Open Measures query.
# NOTE(review): the saved output under this cell shows `term=will rise` and
# `limit=100`, so it was produced with different values than the ones below;
# re-run the cell to refresh it.
terms = "will win"
limit = 50
site = 'bluesky'
since = '2024-01-01'
until = '2025-02-18'
esquery = 'query_string'  # Elasticsearch across all fields

# Hand the builder and the search parameters to the director; the call is the
# cell's last expression, so whatever it returns is shown as the cell output.
OpenMeasuresDirector.construct_from_dataset(
    builder=open_measures_builder,
    terms=terms,
    limit=limit,
    site=site,
    start_date=since,
    end_date=until,
    querytype=esquery,
)

Query's URL: http://api.smat-app.com/content?term=will rise&limit=100&site=bluesky&since=2024-01-01&until=2025-02-18&querytype=query_string


{'$type': 'app.bsky.feed.post',
 'author': 'did:plc:pb5rjeqxt3vgnkltkdusyibp',
 'authorProfile': {'_id': 'did:plc:pb5rjeqxt3vgnkltkdusyibp',
  '_index': 'smat-bluesky-users',
  'avatar': 'https://cdn.bsky.app/img/avatar/plain/did:plc:pb5rjeqxt3vgnkltkdusyibp/bafkreibzv5ishpgnz32vzsxovunekp2rtzcbz4sqcvjjgqazzdjllhyqwa@jpeg',
  'banner': 'https://cdn.bsky.app/img/banner/plain/did:plc:pb5rjeqxt3vgnkltkdusyibp/bafkreibt7fjfnfohjx2cg57mh7n54wcneq43nmewnptdx4yrbo7ht3qvvm@jpeg',
  'description': 'The best wrestling. The best stories. The best in Virtual Wrestling since 2013. Subscribe to the YouTube channel for more great content!\n\nhttps://www.youtube.com/@DWall4869Gaming',
  'did': 'did:plc:pb5rjeqxt3vgnkltkdusyibp',
  'display_name': 'DCA Wrestling',
  'followers': 16,
  'following': 1,
  'handle': 'dcawrestling.bsky.social',
  'indexed_at': '2024-11-20T01:17:14.904Z',
  'lastseents': '2024-11-29T21:28:40.247950',
  'posts': 2828},
 'cid': 'bafyreiep3cnudiw2prdxycf7hvrzn7uel7sdpr6bwwo472e

In [None]:
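# Alternative (disabled): fetch the same data directly with `requests`; `url` is not defined in the visible cells and must be set first.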
# r = requests.get(
#     url
# )

In [None]:
# r.status_code
# data = r.json()
# data.keys(), data

In [None]:
# hits = data['hits']['hits']
# hits[0]['_source']

In [None]:
# df = pd.DataFrame([hit['_source'] for hit in hits])
# df.head()

In [None]:
# NOTE(review): `df` is not assigned by any active cell above -- the only
# earlier assignment sits in a commented-out cell -- so this raises NameError
# on a fresh kernel unless `df` is created elsewhere. TODO confirm the source
# of `df` before relying on Restart & Run All.
# Pull the `text` column out as a plain Python list and show how many
# sentences there are.
sentences = df['text'].to_list()
len(sentences)

In [None]:
tgmf = TextGenerationModelFactory()

# Model identifiers requested from the factory.
# Groq Cloud (https://console.groq.com/docs/overview)
GROQ_MODEL_NAMES = (
    'gemma2-9b-it',
    'llama-3.1-8b-instant',
    'llama-3.3-70b-versatile',
    'meta-llama/llama-guard-4-12b',
)

# One instance per identifier, created in the order listed above.
models = [tgmf.create_instance(model_name) for model_name in GROQ_MODEL_NAMES]

# Keep the individual handles available under their original names.
(
    gemma_29b_generation_model,
    llama_318b_instant_generation_model,
    llama_3370b_versatile_generation_model,
    llama_guard_4_12b_generation_model,
) = models

In [None]:
import json
import os

def save_to_json(data, path, prefix='tiktok_comments'):
    """Write ``data`` as JSON to the next free ``<prefix>-<n>.json`` file in ``path``.

    The file number is one greater than the highest number already used by a
    ``<prefix>-<n>.json`` file in ``path``. Counting files instead would reuse
    (and overwrite) an existing number whenever the sequence has gaps, e.g.
    after an earlier file was deleted.

    Args:
        data: Any object ``json.dump`` can serialise.
        path: Directory to write into; created if it does not exist yet.
        prefix: File-name stem placed before ``-<n>.json``. Defaults to
            ``'tiktok_comments'``, the name the notebook has always used.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable (from ``json.dump``).
        OSError: If the directory or file cannot be created or written.
    """
    # Create the target directory on first use instead of failing in os.listdir.
    os.makedirs(path, exist_ok=True)

    stem = f'{prefix}-'
    extension = '.json'
    used_numbers = []
    for entry in os.listdir(path):
        if not (entry.startswith(stem) and entry.endswith(extension)):
            continue
        number_text = entry[len(stem):-len(extension)]
        # Ignore files that merely share the prefix but carry no number.
        if number_text.isdecimal():
            used_numbers.append(int(number_text))

    file_number = max(used_numbers, default=0) + 1
    file_path = os.path.join(path, f'{stem}{file_number}{extension}')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f)

In [None]:
def detect_predictions_with_llms(sentences: list, notebook_dir: str, generation_models=None, batch_size: int = 50):
    """Ask each model to label every sentence as a prediction or not.

    For each sentence a prompt is built and sent to every model. Each
    non-blank line of a model's reply becomes one record of the form
    ``{"sentence": ..., "model": ..., "label": ...}``.

    After every batch the accumulated records are passed to ``save_to_json``.
    The list is never reset between batches, so each saved file holds all
    records produced so far, not just those of the latest batch.

    Args:
        sentences: Sentences to classify.
        notebook_dir: Notebook working directory; files are saved under
            ``<parent of notebook_dir>/data/tiktok_comments``.
        generation_models: Models to query. Each must provide ``user(prompt)``,
            ``chat_completion(messages)`` and a callable ``__name__``. When
            omitted, the notebook-level ``models`` list is used.
        batch_size: Number of sentences processed between two saves.

    Returns:
        The list of all label records, in processing order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if generation_models is None:
        # Backward-compatible fallback to the notebook-level model list.
        generation_models = models

    # The output directory does not depend on the batch, so compute it once.
    save_dir = os.path.join(os.path.dirname(notebook_dir), 'data', 'tiktok_comments')

    labels = []
    for batch_start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[batch_start:batch_start + batch_size]
        for sentence in tqdm(batch):
            prompt = f"Given this sentence ({sentence}), state if the sentence is a prediction, not a prediction, or not enough information. Also, if it is a prediction, state the prediction domain if any are finance, health, weather, policy, sports, or miscellaneous. Do not explain or provide any other details. Only state prediction, not a prediction, or not enough information."
            for model in generation_models:
                input_prompt = model.user(prompt)
                raw_text_llm_generation = model.chat_completion([input_prompt])
                for line in raw_text_llm_generation.split("\n"):
                    # Skip blank lines; the line is stored as returned (not stripped).
                    if line.strip():
                        # `__name__` is invoked as a method here -- presumably the
                        # model wrapper classes define it that way; verify.
                        labels.append({"sentence": sentence, "model": model.__name__(), "label": line})
        # Checkpoint after every batch (cumulative, see docstring).
        save_to_json(labels, save_dir)
    return labels

In [None]:
# Label every sentence with every model; the function also writes the
# accumulated labels to disk after each batch.
llms_generated = detect_predictions_with_llms(sentences, notebook_dir)
# Echo the complete list of label records as the cell output.
llms_generated

In [None]:
# Presentation names for the record keys produced by the labelling step.
column_labels = {'sentence': 'Text', 'model': 'Model', 'label': 'Label'}

# One row per label record, with display-friendly column headers.
# Note: this rebinds `df`, replacing whatever frame the name held before.
df = pd.DataFrame(llms_generated).rename(columns=column_labels)
df