In [1]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Build a MongoClient from the connection string stored in the
    MONGO_URI environment variable.

    Returns:
        A MongoClient constructed from that URI.

    Raises:
        RuntimeError: if MONGO_URI is unset or empty.
    """
    mongo_uri = os.environ.get("MONGO_URI")
    if mongo_uri:
        return MongoClient(mongo_uri)
    raise RuntimeError("Please set the MONGO_URI environment variable.")

def find_eligible_content(max_auditors=5):
    """
    Find content IDs in the Tel_QA database whose combined number of
    audit_logs documents and assignment_placeholders documents is below
    `max_auditors`.

    Args:
        max_auditors: exclusive upper bound on (audit docs + placeholder docs)
            for a content ID to be reported.

    Returns:
        A list of dicts, one per eligible content ID, with keys
        "content_id", "real_audits", "pending" and "total".

    Raises:
        RuntimeError: propagated from get_mongo_client() when MONGO_URI
            is not set.

    NOTE(review): "real_audits" is a count of audit_logs *documents*, not of
    distinct auditors. If one auditor can write several documents for the
    same content_id this over-counts -- confirm that is the intended metric.
    """
    client = get_mongo_client()
    try:
        db = client["Tel_QA"]

        qa_col     = db["QA_pairs"]
        audit_col  = db["audit_logs"]
        assign_col = db["assignment_placeholders"]

        eligible = []
        # Every content ID defined in QA_pairs is a candidate.
        for cid in qa_col.distinct("content_id"):
            real_count    = audit_col.count_documents({"content_id": cid})
            pending_count = assign_col.count_documents({"content_id": cid})
            total         = real_count + pending_count

            if total < max_auditors:
                eligible.append({
                    "content_id": cid,
                    "real_audits": real_count,
                    "pending": pending_count,
                    "total": total
                })

        return eligible
    finally:
        # Release the connection pool; previously every call leaked a client.
        client.close()

def main():
    """
    Report which content IDs are still below 5 total assignments, or
    confirm that nothing is left to audit.
    """
    remaining = find_eligible_content(max_auditors=5)
    if remaining:
        print("Content IDs still available for auditing (fewer than 5 assignments):")
        for entry in remaining:
            print(f" - ID {entry['content_id']}: {entry['real_audits']} audits, "
                  f"{entry['pending']} pending, total {entry['total']}")
        return

    print("✅ All content audited!")

# Entry point: print the eligibility report when this cell/script is run directly.
if __name__ == "__main__":
    main()


✅ All content audited!


In [8]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Build a MongoClient from the connection string stored in the
    MONGO_URI environment variable.

    Returns:
        A MongoClient constructed from that URI.

    Raises:
        RuntimeError: if MONGO_URI is unset or empty.
    """
    mongo_uri = os.environ.get("MONGO_URI")
    if mongo_uri:
        return MongoClient(mongo_uri)
    raise RuntimeError("Please set the MONGO_URI environment variable.")

def find_left_content(max_auditors=5):
    """
    List the content IDs that have been audited by fewer than
    `max_auditors` *distinct* interns.

    Args:
        max_auditors: exclusive upper bound on the number of distinct
            intern_id values in audit_logs for a content ID to be reported.

    Returns:
        A list of dicts with keys "content_id" and "distinct_audits",
        one per content ID from QA_pairs that is below the bound.

    Raises:
        RuntimeError: propagated from get_mongo_client() when MONGO_URI
            is not set.
    """
    client = get_mongo_client()
    try:
        db        = client["Tel_QA"]
        qa_col    = db["QA_pairs"]
        audit_col = db["audit_logs"]

        left = []
        # 1) Walk every content ID defined in QA_pairs
        for cid in qa_col.distinct("content_id"):
            # 2) How many unique interns have audited this content?
            count = len(audit_col.distinct("intern_id", {"content_id": cid}))

            if count < max_auditors:
                left.append({
                    "content_id":   cid,
                    "distinct_audits": count
                })

        return left
    finally:
        # Release the connection pool; previously every call leaked a client.
        client.close()

def main():
    """
    Print how many content IDs still have fewer than 5 distinct auditors,
    out of the total number of content IDs in QA_pairs, then list them in
    ascending order of distinct audits.
    """
    left = find_left_content(max_auditors=5)

    # Count all content IDs directly. The previous
    # `find_left_content(max_auditors=0)` trick always returned an empty list
    # (no count is ever < 0), so the reported "total" was just len(left).
    client = get_mongo_client()
    try:
        total_ids = len(client["Tel_QA"]["QA_pairs"].distinct("content_id"))
    finally:
        client.close()

    if not left:
        print("✅ All content audited!")
    else:
        print(f"IDs Left: {len(left)} out of {total_ids}\n")
        for item in sorted(left, key=lambda x: x["distinct_audits"]):
            print(f" - ID {item['content_id']}: {item['distinct_audits']} distinct audits")

# Entry point: print the remaining-content report when this cell/script is run directly.
if __name__ == "__main__":
    main()


IDs Left: 21 out of 21

 - ID 402: 0 distinct audits
 - ID 628: 0 distinct audits
 - ID 679: 0 distinct audits
 - ID 589: 2 distinct audits
 - ID 673: 2 distinct audits
 - ID 14: 3 distinct audits
 - ID 309: 3 distinct audits
 - ID 338: 3 distinct audits
 - ID 384: 3 distinct audits
 - ID 390: 3 distinct audits
 - ID 406: 3 distinct audits
 - ID 552: 3 distinct audits
 - ID 586: 3 distinct audits
 - ID 602: 3 distinct audits
 - ID 603: 3 distinct audits
 - ID 605: 3 distinct audits
 - ID 634: 3 distinct audits
 - ID 662: 3 distinct audits
 - ID 668: 3 distinct audits
 - ID 672: 3 distinct audits
 - ID 695: 3 distinct audits


In [3]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Build a MongoClient from the connection string stored in the
    MONGO_URI environment variable.

    Returns:
        A MongoClient constructed from that URI.

    Raises:
        RuntimeError: if MONGO_URI is unset or empty.
    """
    mongo_uri = os.environ.get("MONGO_URI")
    if mongo_uri:
        return MongoClient(mongo_uri)
    raise RuntimeError("Please set the MONGO_URI environment variable.")

def cleanup_over_audited(threshold=5):
    """
    Delete all logs for content that has been audited by too many interns.

    Finds content_ids with more than `threshold` distinct intern_id values in
    audit_logs, then removes *every* document for those content_ids from both
    audit_logs and doubt_logs, and prints a summary of what was removed.

    WARNING: this is destructive and not reversible from this code.

    Args:
        threshold: a content_id is selected when its number of distinct
            interns is strictly greater than this value.

    Returns:
        None.

    Raises:
        RuntimeError: propagated from get_mongo_client() when MONGO_URI
            is not set.
    """
    client = get_mongo_client()
    try:
        db = client["Tel_QA"]
        audit_col = db["audit_logs"]
        doubt_col = db["doubt_logs"]

        # 1) Identify content_ids with > threshold distinct interns
        pipeline = [
            # Collect the set of interns seen for each content_id ...
            {"$group": {"_id": "$content_id", "interns": {"$addToSet": "$intern_id"}}},
            # ... reduce it to its size ...
            {"$project": {"count": {"$size": "$interns"}}},
            # ... and keep only the over-audited ones.
            {"$match": {"count": {"$gt": threshold}}}
        ]
        content_ids = [doc["_id"] for doc in audit_col.aggregate(pipeline)]
        count_ids = len(content_ids)

        print(f"Found {count_ids} content IDs audited by more than {threshold} interns:")
        print(content_ids)

        if count_ids == 0:
            print("No cleanup needed.")
            return

        # 2) Remove from audit_logs and doubt_logs
        id_filter = {"content_id": {"$in": content_ids}}
        res_audit = audit_col.delete_many(id_filter)
        res_doubt = doubt_col.delete_many(id_filter)

        print(f"Removed {res_audit.deleted_count} documents from 'audit_logs'.")
        print(f"Removed {res_doubt.deleted_count} documents from 'doubt_logs'.")
    finally:
        # Release the connection pool; previously every call leaked a client.
        client.close()

# Entry point. Destructive: running this deletes documents from audit_logs and doubt_logs.
if __name__ == "__main__":
    cleanup_over_audited(threshold=5)


Found 17 content IDs audited by more than 5 interns:
[401, 514, 338, 60, 415, 177, 413, 287, 249, 511, 257, 14, 282, 573, 374, 166, 98]
Removed 618 documents from 'audit_logs'.
Removed 5 documents from 'doubt_logs'.


In [2]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Build a MongoClient from the connection string stored in the
    MONGO_URI environment variable.

    Returns:
        A MongoClient constructed from that URI.

    Raises:
        RuntimeError: if MONGO_URI is unset or empty.
    """
    mongo_uri = os.environ.get("MONGO_URI")
    if mongo_uri:
        return MongoClient(mongo_uri)
    raise RuntimeError("Please set the MONGO_URI environment variable.")

def cleanup_intern_logs(interns):
    """
    Delete all logs for every content_id touched by the given interns.

    Collects the content_ids that have at least one document by any of
    `interns` in audit_logs, doubt_logs or skipped_logs, then removes *all*
    documents for those content_ids (from every intern, not only the listed
    ones) from those three collections. Prints a summary.

    WARNING: this is destructive and not reversible from this code.

    Args:
        interns: intern_id values to look for. A list is expected; a single
            string or any other iterable of IDs is also accepted.

    Returns:
        None.

    Raises:
        RuntimeError: propagated from get_mongo_client() when MONGO_URI
            is not set.
    """
    # $in requires an array: wrap a bare string and materialise other
    # iterables (e.g. a set) so the query is always well-formed.
    if isinstance(interns, str):
        interns = [interns]
    else:
        interns = list(interns)

    client = get_mongo_client()
    try:
        db        = client["Tel_QA"]
        audit_col = db["audit_logs"]
        doubt_col = db["doubt_logs"]
        skip_col  = db["skipped_logs"]

        # 1) Identify distinct content_ids logged by any of the interns
        content_ids = set()
        for col in (audit_col, doubt_col, skip_col):
            content_ids.update(
                col.distinct("content_id", {"intern_id": {"$in": interns}})
            )

        count_ids = len(content_ids)
        print(f"Found {count_ids} distinct content IDs logged by interns {interns}:")
        print(sorted(content_ids))

        if count_ids == 0:
            print("No matching logs to remove.")
            return

        # 2) Remove from each collection (same filter for all three)
        id_filter = {"content_id": {"$in": list(content_ids)}}
        res_audit = audit_col.delete_many(id_filter)
        res_doubt = doubt_col.delete_many(id_filter)
        res_skip  = skip_col.delete_many(id_filter)

        print(f"Removed {res_audit.deleted_count} documents from 'audit_logs'.")
        print(f"Removed {res_doubt.deleted_count} documents from 'doubt_logs'.")
        print(f"Removed {res_skip.deleted_count} documents from 'skipped_logs'.")
    finally:
        # Release the connection pool; previously every call leaked a client.
        client.close()

# Entry point. Destructive: cleanup_intern_logs removes all entries for every
# content_id these interns have logged, not only the interns' own documents.
if __name__ == "__main__":
    # Interns whose logged content_ids should be wiped from the log collections
    interns_to_remove = ["katprx"]
    cleanup_intern_logs(interns_to_remove)


Found 11 distinct content IDs logged by interns ['katprx']:
[702, 709, 721, 732, 736, 739, 740, 741, 748, 783, 804]
Removed 78 documents from 'audit_logs'.
Removed 0 documents from 'doubt_logs'.
Removed 0 documents from 'skipped_logs'.


In [1]:
#!/usr/bin/env python3
import os
from datetime import datetime, timezone
from pymongo import MongoClient
from dotenv import load_dotenv

def main(start=None, end=None):
    """
    Delete skipped_logs documents whose `timestamp` falls in [start, end).

    WARNING: this is destructive and not reversible from this code.

    Args:
        start: inclusive lower bound (datetime). Defaults to
            2025-05-19 00:00:00 UTC.
        end: exclusive upper bound (datetime). Defaults to
            2025-05-21 00:00:00 UTC, so the default window covers
            2025-05-19 and 2025-05-20.

    Raises:
        RuntimeError: if MONGO_URI is not available after loading .env.

    NOTE(review): the bounds are timezone-aware UTC datetimes; this presumably
    matches how `timestamp` is stored in skipped_logs -- confirm.
    """
    # 1) Load MONGO_URI from .env
    load_dotenv()
    mongo_uri = os.environ.get("MONGO_URI")
    if not mongo_uri:
        raise RuntimeError("Please set MONGO_URI in your .env")

    # 2) Resolve the deletion window (defaults: 2025-05-19 and 2025-05-20 UTC)
    if start is None:
        start = datetime(2025, 5, 19, 0, 0, 0, tzinfo=timezone.utc)
    if end is None:
        end = datetime(2025, 5, 21, 0, 0, 0, tzinfo=timezone.utc)  # exclusive

    # 3) Connect
    client = MongoClient(mongo_uri)
    try:
        db      = client["Tel_QA"]  # change the name here if your database differs
        skipped = db["skipped_logs"]

        result = skipped.delete_many({
            "timestamp": {
                "$gte": start,
                "$lt":  end
            }
        })

        # 4) Report
        print(f"Deleted {result.deleted_count} documents from skipped_logs")
    finally:
        # Release the connection pool; previously the client was never closed.
        client.close()

# Entry point. Destructive: running this deletes documents from skipped_logs.
if __name__ == "__main__":
    main()


Deleted 679 documents from skipped_logs


In [3]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient
from dotenv import load_dotenv

def main():
    """
    Report which content_ids exist in the Content collection but have no
    entry in QA_pairs.

    Read-only: only `distinct` queries are issued.

    Raises:
        RuntimeError: if MONGO_URI is not available after loading .env.
    """
    # 1) Load Mongo URI
    load_dotenv()
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        raise RuntimeError("Please set MONGO_URI in your .env file")

    # 2) Connect to Mongo
    client = MongoClient(mongo_uri)
    try:
        # replace "Tel_QA" with your actual DB name if different
        db = client["Tel_QA"]

        content_col = db["Content"]
        qa_col      = db["QA_pairs"]

        # 3) Fetch all distinct content_ids
        all_content_ids = set(content_col.distinct("content_id"))
        qa_content_ids  = set(qa_col.distinct("content_id"))
    finally:
        # Release the connection pool; previously the client was never closed.
        client.close()

    # 4) Compute which content_ids are missing QA pairs
    missing_ids = sorted(all_content_ids - qa_content_ids)

    # 5) Report
    if not missing_ids:
        print("✅ All content entries have matching QA_pairs.")
    else:
        print(f"⚠️ {len(missing_ids)} content_id(s) missing QA pairs:")
        for cid in missing_ids:
            print(f"  - {cid}")

# Entry point: print the Content vs. QA_pairs consistency report when run directly.
if __name__ == "__main__":
    main()


⚠️ 2 content_id(s) missing QA pairs:
  - 716
  - 737
