In [1]:
import csv
import os

from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file into the process environment.
load_dotenv()

# The API key is read from the environment, never hard-coded in the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# All storage paths are resolved relative to the directory the kernel runs in.
BASE_DIR = os.getcwd()
input_path = os.path.join(
    BASE_DIR,
    "storage",
    "hf",
    "gpt_20250930.csv",
)

def extract_entities_keywords(p_summary):
    """Ask the chat model for the named entities and keywords of one summary.

    Args:
        p_summary: Summary text; it is interpolated verbatim into the prompt.

    Returns:
        A dict with exactly two keys, "entities" and "keywords", each holding
        a list of strings. Both lists are empty when the reply cannot be
        parsed as a JSON object; a key that is missing or is not a list
        yields an empty list for that key.

    Errors raised by the API call itself are not caught here and propagate
    to the caller.
    """
    # Local import: json is not among the imports at the top of the notebook.
    import json

    def _string_list(value):
        # Anything other than a JSON array is treated as "nothing extracted".
        return [str(item) for item in value] if isinstance(value, list) else []

    prompt = f"""
    Extract named entities and keywords from the following Romanian sports news summary.
    Only extract named entities such as people, organizations, teams, competitions, and locations. Do NOT include dates, weekdays, or generic terms.
    Extract relevant keywords that describe the themes, actions, or context of the summary. Avoid repeating named entities or generic filler words.

    Summary: "{p_summary}"

    Return a valid JSON object with two fields: "entities" and "keywords".
    Each field should be a list of strings.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )

    # The reply text lives on the message object; treat a missing reply as empty
    # so it takes the same fallback path as unparseable text.
    result_text = response.choices[0].message.content or ""

    # Replies are sometimes wrapped in a Markdown code fence (``` or ```json).
    # Drop the opening fence line and the closing fence before parsing.
    payload = result_text.strip()
    if payload.startswith("```"):
        _, _, payload = payload.partition("\n")
        payload = payload.rsplit("```", 1)[0]

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as err:
        print(f"JSON decode error: {err}")
        print(f"Raw response: {result_text}")
        return {"entities": [], "keywords": []}

    if not isinstance(parsed, dict):
        # Valid JSON, but not the object the prompt asked for.
        print(f"Unexpected JSON shape: {type(parsed).__name__}")
        print(f"Raw response: {result_text}")
        return {"entities": [], "keywords": []}

    # Always hand back both keys so callers can index them without a KeyError.
    return {
        "entities": _string_list(parsed.get("entities")),
        "keywords": _string_list(parsed.get("keywords")),
    }


input_file = os.path.join(BASE_DIR, "storage", "test_gpt_input_corrupt.csv")
# NOTE(review): the output is written with csv.writer below, so the ".json"
# extension is misleading. The path is left unchanged in case other code reads it.
output_file = os.path.join(BASE_DIR, "storage", "test_gpt_output_corrupt.json")

# Both handles use newline="" as the csv module requires; without it, quoted
# fields that contain line breaks may be split or altered while reading.
with open(input_file, newline="", encoding="utf-8") as infile, \
        open(output_file, mode="w", newline="", encoding="utf-8") as outfile:
    reader = csv.reader(infile)
    # Skip the header row; the default keeps an empty input file from raising
    # StopIteration, so the output then contains just the header.
    next(reader, None)
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
    writer.writerow(["summary", "entities", "keywords"])  # Header

    for row in reader:
        if len(row) < 3:
            continue  # Skip malformed rows
        # The summary text is taken from the third column of the input.
        summary = row[2].strip()
        try:
            result = extract_entities_keywords(summary)
            # Each list is flattened to a single comma-separated cell.
            entities = ",".join(result["entities"])
            keywords = ",".join(result["keywords"])
            writer.writerow([summary, entities, keywords])
        except Exception as e:
            # Best effort: report the failing row and carry on with the next one.
            print(f"Error processing row: {summary[:50]}... → {e}")

In [2]:
from service.csv2jsonl import CsvToJsonlConverter

# Target file sits in the same storage/hf folder as input_path, with a .jsonl extension.
output_path = os.path.join(
    BASE_DIR,
    "storage",
    "hf",
    "gpt_20250930.jsonl",
)

# NOTE(review): presumably reads the CSV at input_path and writes JSONL to
# output_path — confirm against service.csv2jsonl.
converter = CsvToJsonlConverter(input_path, output_path)
converter.convert()

In [3]:
import csv

def check_csv_integrity(file_path, expected_columns=3):
    """Print every CSV record whose field count differs from expected_columns.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to scan.
        expected_columns: Number of fields every record must have (default 3).

    Returns:
        None. One message is printed per malformed record.
    """
    # newline="" leaves line-ending handling to the csv module, which it
    # requires to parse quoted fields containing line breaks correctly.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        csv_reader = csv.reader(f, quotechar='"')
        # i counts parsed records starting at 1, so the header is record 1.
        for i, p_row in enumerate(csv_reader, start=1):
            if len(p_row) != expected_columns:
                # Report the record just read; printing `row` here would hit an
                # undefined name, or a stale global left by another cell.
                print(f"⚠️ Line {i} is malformed: {p_row}")

# Run the column-count check on the main input CSV (three fields per record).
check_csv_integrity(input_path, expected_columns=3)

In [4]:
def scan_for_hidden_chars(file_path):
    """Print each line of a UTF-8 text file that contains an invisible character.

    The characters looked for are U+00A0 (no-break space), U+200B (zero-width
    space) and U+FEFF (byte-order mark / zero-width no-break space).

    Args:
        file_path: Path of the file to scan.

    Returns:
        None. One message is printed per offending line, showing its repr.
    """
    suspicious = {'\u00A0', '\u200B', '\uFEFF'}
    with open(file_path, "r", encoding="utf-8") as handle:
        for line_no, text in enumerate(handle, start=1):
            # Guard clause: lines sharing no character with the set are clean.
            if suspicious.isdisjoint(text):
                continue
            print(f"⚠️ Hidden char on line {line_no}: {repr(text)}")

# Scan the main input CSV for invisible characters.
scan_for_hidden_chars(file_path=input_path)

In [5]:
import pandas as pd

# Load the CSV and list every row whose "entities" or "keywords" cell is not
# a string (an empty cell is read by pandas as NaN, which is a float).
df = pd.read_csv(input_path, quotechar='"')
for i, row in df.iterrows():
    # Guard clause: rows where both cells are strings need no report.
    if isinstance(row["entities"], str) and isinstance(row["keywords"], str):
        continue
    # i + 2 turns the 0-based row label into a 1-based file line after the
    # header — assumes one physical line per record.
    print(f"⚠️ Row {i+2} has non-string fields:\n{row}\n")

⚠️ Row 71 has non-string fields:
summary     Cei 5 nativi care au mare noroc, se bucură de ...
entities                                                  NaN
keywords    nativi,noroc,bani mulți,șanse nemăsurate,astre...
Name: 69, dtype: object

⚠️ Row 77 has non-string fields:
summary     Cu siguranta ai observat si tu ca majoritatea ...
entities                                                  NaN
keywords    observat,majoritatea,apar,pe sticla,online,fot...
Name: 75, dtype: object
