In [1]:
import json
import os
import sys
from urllib.parse import urlencode

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import requests
from tqdm import tqdm

AXIS_FONT_SIZE = 16

# Add the parent of the notebook's working directory to the import path.
notebook_dir = os.getcwd()
parent_dir = os.path.join(notebook_dir, '../')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from text_generation_models import TextGenerationModelFactory

In [2]:
# Notebook-wide pandas display settings: no cap on the number of rows or
# columns rendered, and up to 800 characters per cell before truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 800)

In [3]:
# Query parameters sent to the api.smat-app.com content endpoint.
params = {
    'term': 'sports',
    'limit': 2000,
    'site': 'tiktok_comment',
    'since': '2021-01-01',
    'until': '2022-05-31',
    'esquery': 'false',
}

# Build the query string with urlencode rather than joining "k=v" by hand, so
# that values containing spaces, '&', '=', '#' or non-ASCII characters are
# percent-encoded instead of silently producing a malformed URL.
url = 'http://api.smat-app.com/content?' + urlencode(params)

url

'http://api.smat-app.com/content?term=sports&limit=2000&site=tiktok_comment&since=2021-01-01&until=2022-05-31&esquery=false'

In [4]:
# Fetch the query results. requests applies no timeout by default, so set one
# explicitly to keep this cell from hanging forever on an unresponsive server.
r = requests.get(url, timeout=60)

In [5]:
# Fail fast on an HTTP error status; a bare `r.status_code` in the middle of a
# cell is neither displayed nor checked.
r.raise_for_status()
data = r.json()
data.keys()

dict_keys(['created_key', 'content_key', 'took', 'timed_out', '_shards', 'hits'])

In [6]:
# The individual results are nested under data['hits']['hits']; preview the
# '_source' payload of the first one.
search_results = data['hits']
hits = search_results['hits']
hits[0]['_source']

{'author': 'robertrodgers074',
 'author_id': '6827158817260864518',
 'author_pin': False,
 'aweme_id': '6912862620349385990',
 'cid': '6912892195309813766',
 'collect_stat': 0,
 'collected_by': 'smat-scrapy-crawlers',
 'comment_language': '',
 'create_time': 1609533143,
 'datatype': 'comment',
 'digg_count': 0,
 'is_author_digged': False,
 'label_list': None,
 'no_show': False,
 'reply_comment': None,
 'reply_comment_total': 0,
 'reply_id': '0',
 'reply_to_reply_id': '0',
 'status': 1,
 'stick_position': 0,
 'text': 'well at least the state of Michigan can be proud of that, because our sports teams suck!!!!',
 'text_extra': [],
 'trans_btn_style': 0,
 'user_buried': False,
 'user_digged': 0}

In [7]:
# One row per hit, built from each hit's '_source' record.
source_records = [hit['_source'] for hit in hits]
df = pd.DataFrame(source_records)
df.head()

Unnamed: 0,author,author_id,author_pin,aweme_id,cid,collect_stat,collected_by,comment_language,create_time,datatype,digg_count,is_author_digged,label_list,no_show,reply_comment,reply_comment_total,reply_id,reply_to_reply_id,status,stick_position,text,text_extra,trans_btn_style,user_buried,user_digged,allow_download_photo,comment_post_item_ids,image_list,is_comment_translatable,is_high_purchase_intent,sort_tags,lastseents,sort_extra_score,forbid_reply_with_video,comment_type,label_text,label_type
0,robertrodgers074,6827158817260864518,False,6912862620349385990,6912892195309813766,0,smat-scrapy-crawlers,,1609533143,comment,0,False,,False,,0,0,0,1,0,"well at least the state of Michigan can be proud of that, because our sports teams suck!!!!",[],0,False,0,,,,,,,,,,,,
1,cool.kid.6661,6790033232790012933,False,6912890483073109253,6912927456471564293,0,smat-scrapy-crawlers,un,1609541354,comment,2,False,,False,,1,0,0,1,0,Are muslims allowed to go on sports?❤️,[],0,False,0,True,,,False,False,{},2025-08-11T22:42:34.030688,,,,,
2,tweesa.nugwen,6692142114979644422,False,6904693167656848646,6913025442937470981,0,smat-scrapy-crawlers,un,1609564168,comment,0,False,,False,,0,0,0,1,0,me applying to college with sports student council clubs and aps...,[],0,False,0,,,,False,,{},2025-04-29T21:48:34.362389,,,,,
3,refrum_,6709052572635169797,False,6912877935582055682,6913044405964832773,0,smat-scrapy-crawlers,un,1609568586,comment,0,False,,False,,0,0,0,1,0,Mikedy betam wefrsh eko you need sports,[],0,False,0,,,,False,,,2024-10-24T01:37:04.334384,"{'reply_score': 0, 'show_more_score': 0}",,,,
4,rustydog10,6813191040050496518,False,6912862620349385990,6913380706065022982,0,smat-scrapy-crawlers,,1609646883,comment,3,True,"[{'text': 'Liked by creator', 'type': 20}]",False,"[{'aweme_id': '6912862620349385990', 'cid': '6913381169153589254', 'collect_stat': 0, 'comment_language': '', 'create_time': 1609646992, 'digg_count': 0, 'is_author_digged': False, 'label_list': None, 'label_text': 'Creator', 'label_type': 1, 'no_show': False, 'reply_comment': None, 'reply_id': '6913380706065022982', 'reply_to_reply_id': '0', 'share_info': {'acl': {'code': 1, 'extra': '{""item_share_acl"":""empty item value""}'}, 'desc': 'Nicole's comment: 😂🥴', 'title': 'Tell me what you think... is this accurate? #Welcome2021 #welcometotheshitshow #draintheswamp #americafirst #foryou #fyp ￼#foryoupage #thisisamerica', 'url': 'https://m.tiktok.com/v/6912862620349385990.html?_d=0&comment_author_id=146229745385582592&language=en-US&preview_pb=0&share_comment_id=6913381169153589254&share_item...",1,0,0,1,0,omg im si excited we did it finally #1 at something our sports teams suck but i realized we are good at being bad 😳😂😂🤔🙀,[],0,False,0,,,,,,,,,,,,


In [8]:
# Extract the 'text' column as a plain Python list and report how many
# entries it holds.
sentences = df['text'].tolist()
len(sentences)

2000

In [9]:
tgmf = TextGenerationModelFactory()

# Groq Cloud (https://console.groq.com/docs/overview)
# One instance per model identifier, created in the order listed.
(
    gemma_29b_generation_model,
    llama_318b_instant_generation_model,
    llama_3370b_versatile_generation_model,
    llama_guard_4_12b_generation_model,
) = [
    tgmf.create_instance(model_name)
    for model_name in (
        'gemma2-9b-it',
        'llama-3.1-8b-instant',
        'llama-3.3-70b-versatile',
        'meta-llama/llama-guard-4-12b',
    )
]

# Every model that will be queried for each sentence.
models = [
    gemma_29b_generation_model,
    llama_318b_instant_generation_model,
    llama_3370b_versatile_generation_model,
    llama_guard_4_12b_generation_model,
]

In [10]:
import json
import os

def save_to_json(data, path):
    """Write ``data`` as JSON to the next free ``tiktok_comments-<n>.json`` in ``path``.

    The directory is created if it does not exist yet. The file number is one
    more than the highest number already used in ``path`` (starting at 1), so
    an existing file is never overwritten even if earlier files were deleted
    and the numbering has gaps.

    Args:
        data: Any JSON-serialisable object.
        path: Directory that receives the numbered JSON file.

    Raises:
        TypeError: If ``data`` cannot be serialised by ``json.dump``.
        OSError: If the directory or file cannot be created or written.
    """
    prefix, suffix = 'tiktok_comments-', '.json'
    os.makedirs(path, exist_ok=True)

    # Collect the numbers already taken; names that carry the prefix but no
    # integer (e.g. a hand-made "tiktok_comments-final.json") are ignored.
    used_numbers = []
    for name in os.listdir(path):
        if name.startswith(prefix) and name.endswith(suffix):
            stem = name[len(prefix):-len(suffix)]
            if stem.isdecimal():
                used_numbers.append(int(stem))

    file_number = max(used_numbers, default=0) + 1
    file_path = os.path.join(path, f'{prefix}{file_number}{suffix}')
    with open(file_path, 'w') as f:
        json.dump(data, f)

In [11]:
def detect_predictions_with_llms(sentences: list, notebook_dir: str, batch_size: int = 100) -> list:
    """Ask every model in the module-level ``models`` list to label each sentence.

    A fixed prompt is built for each sentence and sent to each model. Every
    non-blank line of a model's reply becomes one record of the form
    ``{"sentence": ..., "model": ..., "label": ...}`` with the label stripped
    of surrounding whitespace.

    The accumulated records are checkpointed with ``save_to_json`` after every
    batch into ``<parent of notebook_dir>/data/tiktok_comments``. Each
    checkpoint holds all records collected so far, not only the latest batch.
    If a model call fails part-way through a batch (for example with a
    rate-limit error), the records gathered up to that point are still written
    before the exception propagates.

    Args:
        sentences: Texts to classify.
        notebook_dir: Directory whose parent contains the ``data`` folder used
            for checkpoints.
        batch_size: Number of sentences processed between checkpoints.

    Returns:
        The list of label records in processing order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Anything raised by a model call is re-raised unchanged
            after the partial results have been saved.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # The checkpoint directory is the same for every batch: resolve it once and
    # make sure it exists before any model call is made.
    save_dir = os.path.join(os.path.dirname(notebook_dir), 'data', 'tiktok_comments')
    os.makedirs(save_dir, exist_ok=True)

    labels = []
    for batch_start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[batch_start:batch_start + batch_size]
        try:
            for sentence in tqdm(batch):
                prompt = f"Given this sentence ({sentence}), state if the sentence is a prediction, not a prediction, or not enough information. Do not explain or provide any other details. Only state prediction, not a prediction, or not enough information."
                for model in models:
                    input_prompt = model.user(prompt)
                    raw_text_llm_generation = model.chat_completion([input_prompt])
                    # A reply may span several lines; keep each non-blank one.
                    for line in raw_text_llm_generation.split("\n"):
                        label = line.strip()
                        if label:
                            labels.append({"sentence": sentence, "model": model.__name__(), "label": label})
        finally:
            # Runs after a completed batch and also when a model call raised,
            # so already-collected labels are not lost on failure.
            save_to_json(labels, save_dir)
    return labels

In [12]:
llms_generated = detect_predictions_with_llms(sentences, notebook_dir)
# Preview only the first few records; echoing the whole list would flood the
# cell output with one dict per stored reply line.
llms_generated[:5]

100%|██████████| 100/100 [03:34<00:00,  2.14s/it]
100%|██████████| 100/100 [04:13<00:00,  2.54s/it]
  3%|▎         | 3/100 [00:08<04:37,  2.86s/it]t]
 10%|█         | 2/20 [07:56<1:11:32, 238.47s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jf12p7h2f9d8jj9h5fxm2h5d` service tier `on_demand` on tokens per day (TPD): Limit 100000, Used 100007, Requested 89. Please try again in 1m23.412s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
# Tabulate the label records under capitalised column headers.
column_labels = {'sentence': 'Text', 'model': 'Model', 'label': 'Label'}
df = pd.DataFrame(llms_generated).rename(columns=column_labels)
df