# 🤖 GPT-3.5 Auto-Labeling für geopolitische Tweets (neue OpenAI API kompatibel)

In [None]:
!pip install openai pymongo python-dotenv tqdm

In [10]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
from tqdm import tqdm
import time
import json
from openai import OpenAI

In [None]:
load_dotenv()
client_openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
mongo_uri = os.getenv("MONGO_URI")
mongo_client = MongoClient(mongo_uri)
collection = mongo_client["ukraineBiasDB"]["tweets_balanced"]

In [11]:
pipeline = [
    {"$group": {"_id": "$text", "doc": {"$first": "$$ROOT"}}},
    {"$replaceRoot": {"newRoot": "$doc"}},
    {"$project": {"_id": 1, "text": 1, "target": {"$literal": ""}}}
]
cursor = collection.aggregate(pipeline)
df = pd.DataFrame(list(cursor))
df = df[df['text'].notnull()].reset_index(drop=True)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1724 entries, 0 to 1723
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     1724 non-null   object
 1   text    1724 non-null   object
 2   target  1724 non-null   object
dtypes: object(3)
memory usage: 40.5+ KB


Unnamed: 0,_id,text,target
0,67e1ebdb42cf13ec74016b73,Fr multiverse played out.,
1,67e1ebdb42cf13ec74016b76,ยืนยันอีกครั้งว่า สว.ควรชะลอกิจกรรมใดๆที่เกี่ย...,
2,67e1ebdc42cf13ec74016c2c,"""FAFO_FROM_NAFO"" is locked, loaded and ready t...",
3,67e1e66a97acb2988dfdddb9,What's going on with signal that was supposed ...,
4,67e1e66b97acb2988dfdde96,The potential for conflict of interest is not ...,


In [None]:
#Relevanz-Sicherstellung via GPT-API Call
# funktioniert noch nicht deshalb auskommentiert
# 
"""def gpt_filter_relevant(texts):
    prompt = (
        "Du bekommst eine Liste mit Social Media Texten. "
        "Bitte gib nur diejenigen zurück, die sich thematisch mit dem Krieg zwischen Russland und der Ukraine beschäftigen "
        "(z. B. geopolitische Ereignisse, Meinungen, militärische Entwicklungen, etc).\n\n"
        "Gib die Antwort als Liste von JSON-Objekten zurück im Format:\n"
        "{\"text\": \"…\"}\n\n"
        "Texte:\n"
    )
    for i, t in enumerate(texts):
        prompt += f"{i+1}. {t}\n"

    try:
        response = client_openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content
    except Exception as e:
        print("❌ Fehler beim Filtern:", e)
        return None

def filter_relevant_texts(df, batch_size=10):
    relevant = []
    total_checked = 0

    for i in tqdm(range(0, len(df), batch_size), desc="🔍 Filter Ukraine/Russia relevante Inhalte"):
        texts = df.iloc[i:i+batch_size]["text"].tolist()
        result = gpt_filter_relevant(texts)
        total_checked += len(texts)

        if result:
            try:
                parsed = json.loads(result)
                relevant.extend(parsed)
            except Exception as e:
                print("⚠️ Fehler beim Parsen (Filter):", e)
                print(result)

        time.sleep(1.5)

    print(f"\n✅ Geprüft: {total_checked}, Relevant: {len(relevant)}, Ausgeschieden: {total_checked - len(relevant)}")

    df_relevant = pd.DataFrame(relevant)
    df_relevant.to_json("filtered_relevant_texts.json", orient="records", indent=2)
    return df_relevant"""

In [None]:
def gpt_label_batch(texts):
    prompt = (
        "Bitte klassifiziere die folgenden Texte auf Basis ihrer geopolitischen Haltung:\n"
        "0 = Pro-Russland\n1 = Neutral\n2 = Pro-Ukraine\n\n"
        "Gib die Antwort bitte als Liste von JSON-Objekten im Format: {\"text\": \"…\", \"label\": 0}\n\n"
        "Texte:\n"
    )
    for i, t in enumerate(texts):
        prompt += f"{i+1}. {t}\n"

    try:
        response = client_openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content
    except Exception as e:
        print("❌ GPT API Fehler:", e)
        return None

In [None]:
import os

# Optional: bisherigen Fortschritt laden
partial_file = "labeled_tweets_partial.json"
results = []

if os.path.exists(partial_file):
    df_existing = pd.read_json(partial_file)
    results = df_existing.to_dict(orient="records")
    print(f"📦 Fortsetzung ab Eintrag #{len(results)}")
else:
    df_existing = pd.DataFrame()
    print("📦 Neuer Lauf")
    
batch_size = 10
start_index = len(results)

for i in tqdm(range(start_index, len(df), batch_size)):
    batch = df.iloc[i:i+batch_size]['text'].tolist()
    result = gpt_label_batch(batch)

    if result:
        try:
            parsed = json.loads(result)
            results.extend(parsed)

            # 🔐 Speichern nach jedem erfolgreichen Batch
            df_progress = pd.DataFrame(results)
            df_progress.to_csv("labeled_tweets_partial.csv", index=False)
            df_progress.to_json("labeled_tweets_partial.json", orient="records", indent=2)

        except Exception as e:
            print("⚠️ Parsing-Fehler, Antwort wird übersprungen")
            print("Antwort:", result)
    else:
        print("⚠️ Leere Antwort von GPT")

    time.sleep(1.5)

In [None]:
# Falls ein Abbruch im Langezeit-Job mit der GPT-API passiert hilft dieser Block:
# Ergebnisse bis zum Abbruch sichern
df_partial = pd.DataFrame(results)
df_partial.to_csv("labeled_tweets_partial.csv", index=False)
df_partial.to_json("labeled_tweets_partial.json", orient="records", indent=2)

print(f"💾 {len(df_partial)} Einträge gespeichert.")

In [None]:
#df_out = pd.DataFrame(results)
# Speichern als Datei
#df_out.to_csv("labeled_tweets.csv", index=False)
#df_out.to_json("labeled_tweets.json", orient="records", indent=2)

# Speichern in MongoDB
training_collection = mongo_client["ukraineBiasDB"]["labelled_tweets_training"]
docs = df_partial.to_dict(orient="records")
training_collection.insert_many(docs)

print(f"✅ {len(docs)} Dokumente in 'labelled_tweets_training' gespeichert.")

# Printen der ersten 5 Zeilen zur Kontrolle
df_partial.head()