In [1]:
from openai import OpenAI
from pydantic import BaseModel
import json

In [2]:
import pandas as pd

In [3]:
from llmdomainrating import *

In [4]:
# Load the OpenAI key from a JSON file two directories above the notebook so
# that no credential is hardcoded here.
# NOTE(review): `load_api_key` comes from the star import; presumably it looks
# up the 'OpenAI' entry in that file — confirm against llmdomainrating.
openai_api_key = load_api_key("../../api_keys.json", 'OpenAI')

In [5]:
# Display the JSON schema generated from the DomainRating model.
DomainRating.model_json_schema()

{'properties': {'rating': {'description': 'Domain credibility rating, ranging from -1 to 1',
   'maximum': 1,
   'minimum': -1,
   'title': 'Rating',
   'type': 'number'},
  'domain': {'description': 'The domain name.',
   'title': 'Domain',
   'type': 'string'},
  'explanation': {'description': 'Explanation of the rating.',
   'title': 'Explanation',
   'type': 'string'}},
 'required': ['rating', 'domain', 'explanation'],
 'title': 'DomainRating',
 'type': 'object'}

In [58]:
# Serialize the schema constant that is sent to the API in the requests below.
# Compared with the model-generated schema shown earlier, the printed value has
# no minimum/maximum on `rating` and sets "additionalProperties" to false.
json.dumps(DOMAIN_RATING_JSON_SCHEMA)

'{"properties": {"rating": {"description": "Domain credibility rating, ranging from -1 to 1", "title": "Rating", "type": "number"}, "domain": {"description": "The domain name", "title": "Domain", "type": "string"}, "explanation": {"description": "Explanation of the rating.", "title": "Explanation", "type": "string"}}, "additionalProperties": false, "required": ["rating", "domain", "explanation"], "title": "DomainRating", "type": "object"}'

In [7]:
# Client used by the API requests below, authenticated with the key loaded above.
client = OpenAI(api_key=openai_api_key)

In [7]:
# Show the system prompt; it is passed as `instructions` in the requests below.
SYS_BASE

'You are an assistant to determine the credibility of websites.'

In [8]:
# Show the user prompt template; its {domain} placeholder is filled with
# `.format(domain=...)` when a request is built.
USER_INSTRUCTION

"\n    Rate the website's credibility: {domain}, on a scale between 0 and 1, where 0 means very low credibility and 1 means very high credibility.\n    The assistant returns a rating of -1 when the assistant has no knowledge of the website; otherwise, the assistant should provide the best estimation.\n    In addition to the rating, the assistant should also provide a short explanation.\n"

In [9]:
# Ask gpt-4o-mini to rate a single domain, constraining the reply to the
# strict JSON schema so the text can be validated afterwards.
resp = client.responses.parse(
    model="gpt-4o-mini",
    temperature=0,
    instructions=SYS_BASE,
    input=USER_INSTRUCTION.format(domain="tampabay.com"),
    text=dict(
        format=dict(
            type="json_schema",
            name="domain_rating",
            strict=True,
            schema=DOMAIN_RATING_JSON_SCHEMA,
        )
    ),
)

In [23]:
# Raw JSON text held in the first content part of the first output item.
resp.output[0].content[0].text

'{"rating":0.8,"domain":"tampabay.com","explanation":"Tampa Bay Times is a well-established news organization known for its local coverage and investigative journalism. It has a strong reputation in the community and is generally considered a credible source of information."}'

In [24]:
# Shortcut attribute; for this response it shows the same JSON string as the
# explicit output[0].content[0].text path.
resp.output_text

'{"rating":0.8,"domain":"tampabay.com","explanation":"Tampa Bay Times is a well-established news organization known for its local coverage and investigative journalism. It has a strong reputation in the community and is generally considered a credible source of information."}'

In [39]:
# Dump the whole response object as a JSON string for inspection.
# Note that the content part's "parsed" field is null in the dump: the rating
# is only available as JSON text and still has to be validated separately.
resp.model_dump_json()

'{"id":"resp_681f6b7f561c8190affe32e8fc5e97850a79d0e3bd3bd31e","created_at":1746889599.0,"error":null,"incomplete_details":null,"instructions":"You are an assistant to determine the credibility of websites.","metadata":{},"model":"gpt-4o-mini-2024-07-18","object":"response","output":[{"id":"msg_681f6b86aba0819085340f9bde9d96ac0a79d0e3bd3bd31e","content":[{"annotations":[],"text":"{\\"rating\\":0.8,\\"domain\\":\\"tampabay.com\\",\\"explanation\\":\\"Tampa Bay Times is a well-established news organization known for its local coverage and investigative journalism. It has a strong reputation in the community and is generally considered a credible source of information.\\"}","type":"output_text","parsed":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":1.0,"max_output_tokens":null,"previous_response_id":null,"reasoning":{"effort":null,"generate_summary":null,"summary":null},"service_tier":

In [40]:
# Load a JSON file from disk (presumably a saved response dump, since it is
# indexed like one in the next cell).
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding.
# NOTE(review): machine-specific absolute path — consider making it relative to
# the repository so the notebook runs on other machines.
with open(
    "/Users/yangkc/working/llm/llm_domain_classification/backend/scripts/openai/tampabay.json",
    encoding="utf-8",
) as f:
    temp_resp = json.load(f)

In [52]:
# Validate the JSON text stored in the loaded dump against DomainRating.
# The key path mirrors `resp.output[0].content[0].text` on a live response.
DomainRating.model_validate_json(temp_resp['output'][0]['content'][0]['text'])

DomainRating(rating=0.8, domain='tampabay.com', explanation='Tampa Bay Times is a well-established news organization known for its local journalism in the Tampa Bay area. It has a reputation for credible reporting and is a trusted source for news, which contributes to its high credibility rating.')

In [17]:
# Directory holding the parsed OpenAI responses, kept in one place so the
# machine-specific location only has to be changed once.
# NOTE(review): still an absolute local path — consider deriving it from a
# repo-relative data directory.
PARSED_RESPONSES_DIR = "/Users/yangkc/working/llm/llm_domain_classification/data/intermediate/parsed_responses/openai"

# One parquet file per model snapshot.
gpt41nano_df = pd.read_parquet(f"{PARSED_RESPONSES_DIR}/gpt-4.1-nano-2025-04-14.parquet")
gpt41mini_df = pd.read_parquet(f"{PARSED_RESPONSES_DIR}/gpt-4.1-mini-2025-04-14.parquet")

In [18]:
# Rows with a negative rating, i.e. the -1 "no knowledge of the website" answers.
gpt41mini_df.loc[gpt41mini_df["rating"] < 0]

Unnamed: 0,rating,domain,explanation,provider,model
20,-1.0,thebl.tv,There is no available information or credible ...,openai,gpt-4.1-mini-2025-04-14
38,-1.0,newspress.com,I have no information about the credibility of...,openai,gpt-4.1-mini-2025-04-14
89,-1.0,sooeveningnews.com,There is no available information or known rep...,openai,gpt-4.1-mini-2025-04-14
101,-1.0,spartanavenue.com,I have no information or knowledge about the w...,openai,gpt-4.1-mini-2025-04-14
117,-1.0,thejnotes.com,I have no information or knowledge about the w...,openai,gpt-4.1-mini-2025-04-14
...,...,...,...,...,...
5490,-1.0,godanriver.com,There is no available information or known rep...,openai,gpt-4.1-mini-2025-04-14
5544,-1.0,en-volve.com,I have no information or knowledge about the w...,openai,gpt-4.1-mini-2025-04-14
5559,-1.0,pqbnews.com,I have no knowledge or information about the w...,openai,gpt-4.1-mini-2025-04-14
5630,-1.0,mkecitywire.com,There is no available information or known rep...,openai,gpt-4.1-mini-2025-04-14


In [19]:
# Count the rows whose rating is negative.
# NOTE(review): `temp_df` is not defined in any visible cell, so this fails on
# a fresh kernel (Restart & Run All). Presumably it was one of the frames loaded
# above — replace it with the intended DataFrame and re-run.
len(temp_df[temp_df.rating < 0])

34

In [8]:
# Same rating request against the o3-mini reasoning model, additionally asking
# for an automatic reasoning summary at medium effort.
# Note: this rebinds `resp`, replacing the earlier gpt-4o-mini response.
resp = client.responses.parse(
    model="o3-mini",
    reasoning=dict(effort="medium", summary="auto"),
    instructions=SYS_BASE,
    input=USER_INSTRUCTION.format(domain="tampabay.com"),
    text=dict(
        format=dict(
            type="json_schema",
            name="domain_rating",
            strict=True,
            schema=DOMAIN_RATING_JSON_SCHEMA,
        )
    ),
)

In [13]:
# Dump the o3-mini response as a JSON string.
# NOTE(review): the saved output for this cell starts with a list of
# ResponseReasoningItem objects rather than a JSON string, so it appears to be
# stale output from a different expression (perhaps `resp.output`) — re-run the
# cell to refresh it.
resp.model_dump_json()

[ResponseReasoningItem(id='rs_681fc002a7dc81909a18da926eb1ad49053bdec52c1d7993', summary=[Summary(text='**Assessing credibility rating**\n\nI’m evaluating the credibility of “tampabay.com,” which is associated with the Tampa Bay Times. Based on its status as a major regional newspaper, it seems credible. The rating scale goes from 0 to 1, and it’s a well-known source. Although some might critique it for political bias, it generally holds a good journalistic reputation. I’d assign a credibility rating of 0.8, reflecting its evidence-based reporting and overall reliability as a regional newspaper.', type='summary_text'), Summary(text="**Finalizing website credibility**\n\nI’m reflecting on the credibility of tampabay.com, which belongs to the Tampa Bay Times. This newspaper is well-regarded in Florida and has even won Pulitzer Prizes, so it's considered a trustworthy source. Given this reputation, I think a rating of 0.8 is appropriate. I’m planning to present this in a structured format