In [1]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Retrieves MongoDB URI from the environment variable MONGO_URI.
    """
    uri = os.getenv("MONGO_URI")
    if not uri:
        raise RuntimeError("Please set the MONGO_URI environment variable.")
    return MongoClient(uri)

def find_eligible_content(max_auditors=5):
    """
    Connects to Tel_QA database and finds content IDs with fewer than
    max_auditors total (real audits + pending placeholders).
    """
    client = get_mongo_client()
    db = client["Tel_QA"]
    
    qa_col     = db["QA_pairs"]
    audit_col  = db["audit_logs"]
    assign_col = db["assignment_placeholders"]
    
    # All content IDs defined in QA_pairs
    content_ids = qa_col.distinct("content_id")
    
    eligible = []
    for cid in content_ids:
        real_count    = audit_col.count_documents({"content_id": cid})
        pending_count = assign_col.count_documents({"content_id": cid})
        total         = real_count + pending_count
        
        if total < max_auditors:
            eligible.append({
                "content_id": cid,
                "real_audits": real_count,
                "pending": pending_count,
                "total": total
            })
    
    return eligible

def main():
    eligible = find_eligible_content(max_auditors=5)
    if not eligible:
        print("✅ All content audited!")
    else:
        print("Content IDs still available for auditing (fewer than 5 assignments):")
        for item in eligible:
            print(f" - ID {item['content_id']}: {item['real_audits']} audits, "
                  f"{item['pending']} pending, total {item['total']}")

if __name__ == "__main__":
    main()


Content IDs still available for auditing (fewer than 5 assignments):
 - ID 402: 0 audits, 0 pending, total 0
 - ID 628: 0 audits, 0 pending, total 0
 - ID 679: 0 audits, 0 pending, total 0


In [1]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Retrieves MongoDB URI from the environment variable MONGO_URI.
    """
    uri = os.getenv("MONGO_URI")
    if not uri:
        raise RuntimeError("Please set the MONGO_URI environment variable.")
    return MongoClient(uri)

def find_left_content(max_auditors=5):
    """
    Returns a list of dicts for each content_id that has been audited by
    fewer than `max_auditors` *distinct* interns.
    """
    client = get_mongo_client()
    db     = client["Tel_QA"]
    qa_col    = db["QA_pairs"]
    audit_col = db["audit_logs"]

    # 1) List all content IDs
    all_ids = qa_col.distinct("content_id")

    left = []
    for cid in all_ids:
        # 2) How many unique interns have audited this content?
        interns = audit_col.distinct("intern_id", {"content_id": cid})
        count   = len(interns)

        if count < max_auditors:
            left.append({
                "content_id":   cid,
                "distinct_audits": count
            })

    return left

def main():
    left = find_left_content(max_auditors=5)

    total_ids = len(left) + sum(1 for _ in find_left_content(max_auditors=0))  # hack to get total
    # Alternatively, fetch total separately:
    # total_ids = len(get_mongo_client()["Tel_QA"]["QA_pairs"].distinct("content_id"))

    if not left:
        print("✅ All content audited!")
    else:
        print(f"IDs Left: {len(left)} out of {total_ids}\n")
        for item in sorted(left, key=lambda x: x["distinct_audits"]):
            print(f" - ID {item['content_id']}: {item['distinct_audits']} distinct audits")

if __name__ == "__main__":
    main()


IDs Left: 212 out of 212

 - ID 402: 0 distinct audits
 - ID 628: 0 distinct audits
 - ID 679: 0 distinct audits
 - ID 701: 1 distinct audits
 - ID 702: 1 distinct audits
 - ID 703: 1 distinct audits
 - ID 704: 1 distinct audits
 - ID 705: 1 distinct audits
 - ID 706: 1 distinct audits
 - ID 707: 1 distinct audits
 - ID 708: 1 distinct audits
 - ID 709: 1 distinct audits
 - ID 710: 1 distinct audits
 - ID 711: 1 distinct audits
 - ID 712: 1 distinct audits
 - ID 713: 1 distinct audits
 - ID 714: 1 distinct audits
 - ID 720: 1 distinct audits
 - ID 726: 1 distinct audits
 - ID 727: 1 distinct audits
 - ID 728: 1 distinct audits
 - ID 729: 1 distinct audits
 - ID 730: 1 distinct audits
 - ID 731: 1 distinct audits
 - ID 732: 1 distinct audits
 - ID 735: 1 distinct audits
 - ID 736: 1 distinct audits
 - ID 738: 1 distinct audits
 - ID 739: 1 distinct audits
 - ID 741: 1 distinct audits
 - ID 745: 1 distinct audits
 - ID 746: 1 distinct audits
 - ID 747: 1 distinct audits
 - ID 748: 1 dist

In [3]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Retrieves MongoDB URI from the environment variable MONGO_URI.
    """
    uri = os.getenv("MONGO_URI")
    if not uri:
        raise RuntimeError("Please set the MONGO_URI environment variable.")
    return MongoClient(uri)

def cleanup_over_audited(threshold=5):
    """
    Finds content_ids with more than `threshold` distinct interns in audit_logs,
    then removes all entries for those content_ids from both audit_logs and doubt_logs.
    Prints summary of actions taken.
    """
    client = get_mongo_client()
    db = client["Tel_QA"]
    audit_col = db["audit_logs"]
    doubt_col = db["doubt_logs"]

    # 1) Identify content_ids with > threshold distinct interns
    pipeline = [
        {"$group": {"_id": "$content_id", "interns": {"$addToSet": "$intern_id"}}},
        {"$project": {"count": {"$size": "$interns"}}},
        {"$match": {"count": {"$gt": threshold}}}
    ]
    over_audited = list(audit_col.aggregate(pipeline))
    content_ids = [doc["_id"] for doc in over_audited]
    count_ids = len(content_ids)

    print(f"Found {count_ids} content IDs audited by more than {threshold} interns:")
    print(content_ids)

    if count_ids == 0:
        print("No cleanup needed.")
        return

    # 2) Remove from audit_logs and doubt_logs
    res_audit = audit_col.delete_many({"content_id": {"$in": content_ids}})
    res_doubt = doubt_col.delete_many({"content_id": {"$in": content_ids}})

    print(f"Removed {res_audit.deleted_count} documents from 'audit_logs'.")
    print(f"Removed {res_doubt.deleted_count} documents from 'doubt_logs'.")

if __name__ == "__main__":
    cleanup_over_audited(threshold=5)


Found 17 content IDs audited by more than 5 interns:
[401, 514, 338, 60, 415, 177, 413, 287, 249, 511, 257, 14, 282, 573, 374, 166, 98]
Removed 618 documents from 'audit_logs'.
Removed 5 documents from 'doubt_logs'.


In [17]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient

def get_mongo_client():
    """
    Retrieves MongoDB URI from the environment variable MONGO_URI.
    """
    uri = os.getenv("MONGO_URI")
    if not uri:
        raise RuntimeError("Please set the MONGO_URI environment variable.")
    return MongoClient(uri)

def cleanup_intern_logs(interns):
    """
    Finds content_ids that have any logs by the specified interns,
    then removes all entries for those content_ids from audit_logs,
    doubt_logs, and skipped_logs. Prints a summary.
    """
    client    = get_mongo_client()
    db        = client["Tel_QA"]
    audit_col = db["audit_logs"]
    doubt_col = db["doubt_logs"]
    skip_col  = db["skipped_logs"]

    # 1) Identify distinct content_ids logged by any of the interns
    content_ids = set()
    for col in (audit_col, doubt_col, skip_col):
        docs = col.distinct("content_id", {"intern_id": {"$in": interns}})
        content_ids.update(docs)

    count_ids = len(content_ids)
    print(f"Found {count_ids} distinct content IDs logged by interns {interns}:")
    print(sorted(content_ids))

    if count_ids == 0:
        print("No matching logs to remove.")
        return

    # 2) Remove from each collection
    res_audit = audit_col.delete_many({"content_id": {"$in": list(content_ids)}})
    res_doubt = doubt_col.delete_many({"content_id": {"$in": list(content_ids)}})
    res_skip  = skip_col.delete_many({"content_id": {"$in": list(content_ids)}})

    print(f"Removed {res_audit.deleted_count} documents from 'audit_logs'.")
    print(f"Removed {res_doubt.deleted_count} documents from 'doubt_logs'.")
    print(f"Removed {res_skip.deleted_count} documents from 'skipped_logs'.")

if __name__ == "__main__":
    # specify interns whose logs we want to remove
    interns_to_remove = ["katprx"]
    cleanup_intern_logs(interns_to_remove)


Found 6 distinct content IDs logged by interns ['katprx']:


TypeError: '<' not supported between instances of 'int' and 'NoneType'

In [1]:
#!/usr/bin/env python3
import os
from datetime import datetime, timezone
from pymongo import MongoClient
from dotenv import load_dotenv

def main():
    # 1) Load MONGO_URI from .env
    load_dotenv()
    mongo_uri = os.environ.get("MONGO_URI")
    if not mongo_uri:
        raise RuntimeError("Please set MONGO_URI in your .env")

    # 2) Connect
    client = MongoClient(mongo_uri)
    db     = client["Tel_QA"]  # or replace with client["Tel_QA"] if you named it
    skipped = db["skipped_logs"]

    # 3) Build a query for 2025-05-19 and 2025-05-20 UTC
    start = datetime(2025, 5, 19, 0, 0, 0, tzinfo=timezone.utc)
    end   = datetime(2025, 5, 21, 0, 0, 0, tzinfo=timezone.utc)  # exclusive

    result = skipped.delete_many({
        "timestamp": {
            "$gte": start,
            "$lt":  end
        }
    })

    # 4) Report
    print(f"Deleted {result.deleted_count} documents from skipped_logs")

if __name__ == "__main__":
    main()


Deleted 679 documents from skipped_logs


In [3]:
#!/usr/bin/env python3
import os
from pymongo import MongoClient
from dotenv import load_dotenv

def main():
    # 1) Load Mongo URI
    load_dotenv()
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        raise RuntimeError("Please set MONGO_URI in your .env file")

    # 2) Connect to Mongo
    client = MongoClient(mongo_uri)
    # replace "Tel_QA" with your actual DB name if different
    db = client["Tel_QA"]  

    content_col = db["Content"]
    qa_col      = db["QA_pairs"]

    # 3) Fetch all distinct content_ids
    all_content_ids = set(content_col.distinct("content_id"))
    qa_content_ids  = set(qa_col.distinct("content_id"))

    # 4) Compute which content_ids are missing QA pairs
    missing_ids = sorted(all_content_ids - qa_content_ids)

    # 5) Report
    if not missing_ids:
        print("✅ All content entries have matching QA_pairs.")
    else:
        print(f"⚠️ {len(missing_ids)} content_id(s) missing QA pairs:")
        for cid in missing_ids:
            print(f"  - {cid}")

if __name__ == "__main__":
    main()


⚠️ 2 content_id(s) missing QA pairs:
  - 716
  - 737


In [4]:
from pymongo import MongoClient
import os
import pprint
from dotenv import load_dotenv
from zoneinfo import ZoneInfo

# — adjust these as needed —
load_dotenv()
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME         = "Tel_QA"
AUDIT_COLL_NAME = "audit_logs"

# IST timezone
IST = ZoneInfo("Asia/Kolkata")

def format_ist(dt):
    """Convert a UTC-aware datetime to IST and format it."""
    return dt.astimezone(IST).strftime("%Y-%m-%d %H:%M:%S %Z")

def main():
    client     = MongoClient(mongo_uri)
    db         = client[DB_NAME]
    audit_coll = db[AUDIT_COLL_NAME]

    # 1) Find all content_id / intern_id combos with at least one null judgment
    pipeline = [
        {"$match": {"judgment": None}},
        {"$group": {
            "_id": {
                "content_id": "$content_id",
                "intern_id":  "$intern_id"
            }
        }}
    ]
    pairs = list(audit_coll.aggregate(pipeline))

    if not pairs:
        print("No null judgments found.")
        return

    pp = pprint.PrettyPrinter(indent=2)

    # 2) For each pair, fetch all audit docs for that content & intern
    for pair in pairs:
        cid  = pair["_id"]["content_id"]
        iid  = pair["_id"]["intern_id"]
        print(f"\n=== Content ID: {cid}  |  Intern: {iid} ===")

        docs = audit_coll.find({
            "content_id": cid,
            "intern_id":  iid
        }).sort("qa_index", 1)

        for doc in docs:
            # extract original datetimes
            ts_utc       = doc.get("timestamp")
            assigned_utc = doc.get("assigned_at")

            # format into IST
            ts_str       = format_ist(ts_utc) if ts_utc else "–"
            assigned_str = format_ist(assigned_utc) if assigned_utc else "–"

            # print a concise summary
            print(f"QA #{doc['qa_index']}: judgment={doc['judgment']!r}")
            print(f"  • Timestamp   (IST): {ts_str}")
            print(f"  • Assigned at (IST): {assigned_str}")
            print(f"  • Question: {doc['question']}")
            print(f"  • Answer:   {doc['answer']}\n")

if __name__ == "__main__":
    main()



=== Content ID: 709  |  Intern: meesrx ===
QA #0: judgment=None
  • Timestamp   (IST): 2025-05-20 04:44:15 IST
  • Assigned at (IST): 2025-05-20 04:43:17 IST
  • Question: విదేశీ కోడలు రచయిత ఎవరు?
  • Answer:   కోసూరి ఉమాభారతి

QA #1: judgment=None
  • Timestamp   (IST): 2025-05-20 04:44:15 IST
  • Assigned at (IST): 2025-05-20 04:43:17 IST
  • Question: ఈ కథాసంపుటిలో బొమ్మలు ఎవరు వేశారు?
  • Answer:   బాలి

QA #2: judgment=None
  • Timestamp   (IST): 2025-05-20 04:44:15 IST
  • Assigned at (IST): 2025-05-20 04:43:17 IST
  • Question: ఈ కథలు ఎక్కడి జీవితాల ఆధారంగా రాశారు?
  • Answer:   విదేశాలు

QA #3: judgment=None
  • Timestamp   (IST): 2025-05-20 04:44:15 IST
  • Assigned at (IST): 2025-05-20 04:43:17 IST
  • Question: విదేశీ కోడలు ఏ విధమైన దృక్పథాన్ని బోధిస్తుంది?
  • Answer:   మానవతావాదం

QA #4: judgment=None
  • Timestamp   (IST): 2025-05-20 04:44:15 IST
  • Assigned at (IST): 2025-05-20 04:43:17 IST
  • Question: ఈ కథల భాషలో ప్రధాన లక్షణం ఏమిటి?
  • Answer:   అందమైన పదజాలం

QA 

In [5]:
from pymongo import MongoClient
import os
import pprint
from dotenv import load_dotenv
from zoneinfo import ZoneInfo

load_dotenv()
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME         = "Tel_QA"
AUDIT_COLL_NAME = "audit_logs"
IST = ZoneInfo("Asia/Kolkata")

def format_ist(dt):
    return dt.astimezone(IST).strftime("%Y-%m-%d %H:%M:%S %Z")

def main():
    client     = MongoClient(mongo_uri)
    db         = client[DB_NAME]
    audit_coll = db[AUDIT_COLL_NAME]
    export_coll = db["exported_audit_logs"]  # ← new collection

    pipeline = [
        {"$match": {"judgment": None}},
        {"$group": {
            "_id": {
                "content_id": "$content_id",
                "intern_id":  "$intern_id"
            }
        }}
    ]
    pairs = list(audit_coll.aggregate(pipeline))
    if not pairs:
        print("No null judgments found.")
        return

    for pair in pairs:
        cid  = pair["_id"]["content_id"]
        iid  = pair["_id"]["intern_id"]
        print(f"\n=== Content ID: {cid}  |  Intern: {iid} ===")

        docs = list(audit_coll.find({
            "content_id": cid,
            "intern_id":  iid
        }).sort("qa_index", 1))

        for doc in docs:
            ts_str       = format_ist(doc["timestamp"])       if doc.get("timestamp")   else "–"
            assigned_str = format_ist(doc["assigned_at"])     if doc.get("assigned_at") else "–"
            print(f"QA #{doc['qa_index']}: judgment={doc['judgment']!r}")
            print(f"  • Timestamp   (IST): {ts_str}")
            print(f"  • Assigned at (IST): {assigned_str}")
            print(f"  • Question: {doc['question']}")
            print(f"  • Answer:   {doc['answer']}\n")

        # export to new collection
        for d in docs:
            d.pop("_id", None)
        if docs:
            export_coll.insert_many(docs)
            print(f"  → {len(docs)} docs moved to 'exported_audit_logs'")

if __name__ == "__main__":
    main()



=== Content ID: 470  |  Intern: alemax ===
QA #0: judgment=None
  • Timestamp   (IST): 2025-05-09 11:12:15 IST
  • Assigned at (IST): 2025-05-09 11:12:07 IST
  • Question: నేషనల్ ఆర్కైవ్స్ ఆఫ్ ఇండియాకు ఎన్ని రికార్డ్స్ సెంటర్లు ఉన్నాయి?
  • Answer:   మూడు

QA #0: judgment='Correct'
  • Timestamp   (IST): 2025-05-09 11:14:12 IST
  • Assigned at (IST): 2025-05-09 11:12:07 IST
  • Question: నేషనల్ ఆర్కైవ్స్ ఆఫ్ ఇండియాకు ఎన్ని రికార్డ్స్ సెంటర్లు ఉన్నాయి?
  • Answer:   మూడు

QA #0: judgment='Correct'
  • Timestamp   (IST): 2025-05-09 11:14:05 IST
  • Assigned at (IST): 2025-05-09 11:12:07 IST
  • Question: నేషనల్ ఆర్కైవ్స్ ఆఫ్ ఇండియాకు ఎన్ని రికార్డ్స్ సెంటర్లు ఉన్నాయి?
  • Answer:   మూడు

QA #0: judgment='Correct'
  • Timestamp   (IST): 2025-05-09 11:14:00 IST
  • Assigned at (IST): 2025-05-09 11:12:07 IST
  • Question: నేషనల్ ఆర్కైవ్స్ ఆఫ్ ఇండియాకు ఎన్ని రికార్డ్స్ సెంటర్లు ఉన్నాయి?
  • Answer:   మూడు

QA #1: judgment=None
  • Timestamp   (IST): 2025-05-09 11:12:15 IST
  • Assigned at (

In [7]:
from pymongo import MongoClient
from bson import json_util
import os
import json
from dotenv import load_dotenv

# Load environment and config
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME     = "Tel_QA"
OUTPUT_DIR  = "Tel_QA_backup"

def backup_database(uri, db_name, output_dir):
    """
    Connects to MongoDB and exports every collection in `db_name` 
    to a UTF-8 JSON file in `output_dir`.
    """
    client = MongoClient(uri)
    db     = client[db_name]

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Iterate over all collections
    for coll_name in db.list_collection_names():
        coll = db[coll_name]
        docs = list(coll.find())  # fetch all documents

        # Prepare filepath
        filepath = os.path.join(output_dir, f"{coll_name}.json")

        # Write out with proper UTF-8 and BSON-to-JSON conversion
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(
                docs,
                f,
                default=json_util.default,   # serialize ObjectId, DateTime, etc. :contentReference[oaicite:4]{index=4}
                ensure_ascii=False,          # keep Telugu text unescaped :contentReference[oaicite:5]{index=5}
                indent=4
            )
        print(f"Exported {len(docs)} documents from '{coll_name}' to '{filepath}'")

if __name__ == "__main__":
    backup_database(MONGO_URI, DB_NAME, OUTPUT_DIR)


Exported 892 documents from 'QA_pairs' to 'Tel_QA_backup/QA_pairs.json'
Exported 14 documents from 'users' to 'Tel_QA_backup/users.json'
Exported 526 documents from 'exported_audit_logs' to 'Tel_QA_backup/exported_audit_logs.json'
Exported 896 documents from 'Content' to 'Tel_QA_backup/Content.json'
Exported 4409 documents from 'user_logs' to 'Tel_QA_backup/user_logs.json'
Exported 3453 documents from 'system_logs' to 'Tel_QA_backup/system_logs.json'
Exported 23259 documents from 'audit_logs' to 'Tel_QA_backup/audit_logs.json'
Exported 248 documents from 'doubt_logs' to 'Tel_QA_backup/doubt_logs.json'
Exported 561 documents from 'skipped_logs' to 'Tel_QA_backup/skipped_logs.json'
Exported 0 documents from 'assignment_placeholders' to 'Tel_QA_backup/assignment_placeholders.json'


In [1]:
from pymongo import MongoClient
import os
import pprint
from dotenv import load_dotenv
from zoneinfo import ZoneInfo

# — adjust these as needed —
load_dotenv()
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME         = "Tel_QA"
AUDIT_COLL_NAME = "audit_logs"

# IST timezone
IST = ZoneInfo("Asia/Kolkata")

def format_ist(dt):
    """Convert a UTC-aware datetime to IST and format it."""
    return dt.astimezone(IST).strftime("%Y-%m-%d %H:%M:%S %Z")

def main():
    client     = MongoClient(mongo_uri)
    db         = client[DB_NAME]
    audit_coll = db[AUDIT_COLL_NAME]

    # 1) Find all content_id / intern_id combos with at least one null judgment
    pipeline = [
        {"$match": {"judgment": None}},
        {"$group": {
            "_id": {
                "content_id": "$content_id",
                "intern_id":  "$intern_id"
            }
        }}
    ]
    pairs = list(audit_coll.aggregate(pipeline))

    if not pairs:
        print("No null judgments found.")
        return

    pp = pprint.PrettyPrinter(indent=2)

        # … your existing import & setup …

    for pair in pairs:
        cid  = pair["_id"]["content_id"]
        iid  = pair["_id"]["intern_id"]
        print(f"\n=== Content ID: {cid}  |  Intern: {iid} ===")

        docs = audit_coll.find(
            {"content_id": cid, "intern_id": iid}
        ).sort("qa_index", 1)

        for doc in docs:
            # pretty‐print each full document
            pp.pprint(doc)

        # ←— ADD THIS BLOCK TO DELETE THEM ALL:
        result = audit_coll.delete_many({
            "content_id": cid,
            "intern_id":  iid
        })
        print(f"🗑  Deleted {result.deleted_count} documents for content {cid}, intern {iid}")


if __name__ == "__main__":
    main()



=== Content ID: 743  |  Intern: pshubh ===
{ '_id': ObjectId('682c0afa1f84c80f45e2b5b4'),
  'answer': 'మణివణ్ణన్',
  'assigned_at': datetime.datetime(2025, 5, 20, 4, 48, 36, 929000),
  'content_id': 743,
  'intern_id': 'pshubh',
  'judgment': None,
  'length': 'short',
  'qa_index': 0,
  'question': 'గోపాలరావు గారి అబ్బాయి చిత్ర దర్శకుడు ఎవరు?',
  'time_taken': 340.747263,
  'timestamp': datetime.datetime(2025, 5, 20, 4, 54, 17, 677000)}
{ '_id': ObjectId('682c0afa1f84c80f45e2b5b5'),
  'answer': 'ఇళయరాజా',
  'assigned_at': datetime.datetime(2025, 5, 20, 4, 48, 36, 929000),
  'content_id': 743,
  'intern_id': 'pshubh',
  'judgment': None,
  'length': 'short',
  'qa_index': 1,
  'question': 'ఈ చిత్రానికి సంగీతం అందించిన వారు ఎవరు?',
  'time_taken': 340.747263,
  'timestamp': datetime.datetime(2025, 5, 20, 4, 54, 17, 677000)}
{ '_id': ObjectId('682c0afa1f84c80f45e2b5b6'),
  'answer': 'రావు గోపాలరావు',
  'assigned_at': datetime.datetime(2025, 5, 20, 4, 48, 36, 929000),
  'content_id': 743

In [16]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import random


# load MONGO_URI from .env
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME       = "Tel_QA"
MAX_AUDITORS  = 5


client  = MongoClient(MONGO_URI)
db      = client[DB_NAME]

audit_col    = db["audit_logs"]
content_col  = db["Content"]
qa_col       = db["QA_pairs"]

# new “archive” collections
done_content_col = db["completed_content"]
final_audit_col  = db["Final_audit_logs"]

# 1) Find content_ids with >= MAX_AUDITORS distinct interns
pipeline = [
    {"$group": {
        "_id": "$content_id",
        "interns": {"$addToSet": "$intern_id"}
    }},
    {"$addFields": {
        "count": {"$size": "$interns"}
    }},
    {"$match": {
        "count": {"$eq": MAX_AUDITORS}
    }}
]
completed_ids = [doc["_id"] for doc in audit_col.aggregate(pipeline)]
print(f"Found {len(completed_ids)} completed content IDs: {completed_ids}")


    # 2) For each completed content_id: move & delete
for cid in completed_ids:
    # — archive the content document —
    content_doc = content_col.find_one({"content_id": cid})
    if content_doc:
        # upsert into completed_content
        done_content_col.replace_one(
            {"content_id": cid},
            content_doc,
            upsert=True
        )
        content_col.delete_one({"content_id": cid})
        print(f"✓ Archived & removed Content {cid}")

    # — archive all audit_logs —
    audit_docs = list(audit_col.find({"content_id": cid}))

    if audit_docs:
        # final_audit_col.insert_many(audit_docs)
        res = audit_col.delete_many({"content_id": cid})
        print(f"✓ Moved & deleted {res.deleted_count} audit_logs for Content {cid}")


        # — delete its QA_pairs —
        qa_res = qa_col.delete_many({"content_id": cid})
    print(f"✓ Deleted {qa_res.deleted_count} QA_pairs for Content {cid}")
    




Found 0 completed content IDs: []


In [12]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import random


# load MONGO_URI from .env
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME       = "Tel_QA"
MAX_AUDITORS  = 5


client  = MongoClient(MONGO_URI)
db      = client[DB_NAME]

audit_col    = db["audit_logs"]
content_col  = db["Content"]
qa_col       = db["QA_pairs"]

# new “archive” collections
done_content_col = db["completed_content"]
final_audit_col  = db["Final_audit_logs"]

# 1) Find content_ids with >= MAX_AUDITORS distinct interns
pipeline = [
    {"$group": {
        "_id": "$content_id",
        "interns": {"$addToSet": "$intern_id"}
    }},
    {"$addFields": {
        "count": {"$size": "$interns"}
    }},
    {"$match": {
        "count": {"$gt": MAX_AUDITORS}
    }}
]


# … your existing pipeline to get completed_ids …
completed_ids = [doc["_id"] for doc in audit_col.aggregate(pipeline)]
print(f"Found {len(completed_ids)} over-audited content IDs: {completed_ids}")

for cid in completed_ids:
    # 1) get the list of intern_ids who have audited this content
    interns = audit_col.distinct("intern_id", {"content_id": cid})
    
    # 2) if there really are > MAX_AUDITORS, randomly pick one to drop
    if len(interns) > MAX_AUDITORS:
        to_remove = random.choice(interns)
        result = audit_col.delete_many({
            "content_id": cid,
            "intern_id":  to_remove
        })
        print(f"🗑  Removed {result.deleted_count} docs for content {cid}, intern {to_remove}")


Found 0 over-audited content IDs: []


In [2]:

from pymongo import MongoClient
import os
from dotenv import load_dotenv

# — load your Mongo URI from a .env file —
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME            = "Tel_QA"
AUDIT_COLS         = ["audit_logs", "Final_audit_logs"]
DOUBT_COLL_NAME    = "doubt_logs"

def main():
    client = MongoClient(MONGO_URI)
    db     = client[DB_NAME]

    # get your collections
    audit_cols = [db[name] for name in AUDIT_COLS]
    doubt_col  = db[DOUBT_COLL_NAME]

    # 1) Gather all (content_id, qa_index) → set of interns who did Correct/Incorrect
    correct_groups = {}
    for col in audit_cols:
        for doc in col.find({}, {"content_id":1, "qa_index":1, "intern_id":1}):
            key = (doc["content_id"], doc["qa_index"])
            correct_groups.setdefault(key, set()).add(doc["intern_id"])

    # 2) Gather all (content_id, qa_index) → set of interns who did Doubt
    doubt_groups = {}
    for doc in doubt_col.find({}, {"content_id":1, "qa_index":1, "intern_id":1}):
        key = (doc["content_id"], doc["qa_index"])
        doubt_groups.setdefault(key, set()).add(doc["intern_id"])

    # 3) Find keys present in both—i.e., some interns scored, others doubted
    mixed = []
    for key, corr_interns in correct_groups.items():
        if key in doubt_groups:
            mixed.append((key, corr_interns, doubt_groups[key]))

    # 4) Print out results
    if not mixed:
        print("No QA pairs found where some interns scored Correct/Incorrect and others selected Doubt.")
        return

    for (cid, qa_idx), corr_set, doubt_set in mixed:
        print(f"Content ID: {cid} | QA Index: {qa_idx}")
        print(f"  ➤ Correct/Incorrect by interns: {sorted(corr_set)}")
        print(f"  ➤ Doubt by interns:             {sorted(doubt_set)}")
        print()

if __name__ == "__main__":
    main()


Content ID: 188 | QA Index: 5
  ➤ Correct/Incorrect by interns: ['laamax', 'manmux']
  ➤ Doubt by interns:             ['ysadax']

Content ID: 188 | QA Index: 1
  ➤ Correct/Incorrect by interns: ['laamax', 'ysadax']
  ➤ Doubt by interns:             ['alemax', 'manmux']

Content ID: 190 | QA Index: 1
  ➤ Correct/Incorrect by interns: ['adarex', 'manmux', 'ysadax']
  ➤ Doubt by interns:             ['pradex']

Content ID: 19 | QA Index: 5
  ➤ Correct/Incorrect by interns: ['alemax', 'laamax', 'ysadax']
  ➤ Doubt by interns:             ['pradex']

Content ID: 46 | QA Index: 4
  ➤ Correct/Incorrect by interns: ['gchell', 'meesrx']
  ➤ Doubt by interns:             ['laamax', 'pradex']

Content ID: 449 | QA Index: 3
  ➤ Correct/Incorrect by interns: ['manmux', 'meesrx', 'vinsrx']
  ➤ Doubt by interns:             ['pshubh']

Content ID: 339 | QA Index: 4
  ➤ Correct/Incorrect by interns: ['alemax', 'manmux', 'ysadax']
  ➤ Doubt by interns:             ['ckunas']

Content ID: 321 | QA Inde

In [3]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load your Mongo URI from .env
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME = "Tel_QA"

def main():
    client = MongoClient(MONGO_URI)
    db = client[DB_NAME]

    # 1) Get all collection names
    collections = db.list_collection_names()
    print("Collections found:", collections, "\n")

    # 2) For each collection, sample docs and gather all keys
    for coll_name in collections:
        coll = db[coll_name]
        keys = set()

        # sample up to 100 documents (adjust as needed)
        for doc in coll.find({}, projection={"_id": False}).limit(100):
            keys.update(doc.keys())

        print(f"--- {coll_name} ({len(keys)} fields) ---")
        for key in sorted(keys):
            print(f" • {key}")
        print()

if __name__ == "__main__":
    main()


Collections found: ['QA_pairs', 'users', 'Final_audit_logs', 'completed_content', 'Content', 'user_logs', 'system_logs', 'audit_logs', 'doubt_logs', 'skipped_logs', 'assignment_placeholders'] 

--- QA_pairs (5 fields) ---
 • category_status
 • content_id
 • metadata
 • questions
 • uploaded_at

--- users (8 fields) ---
 • auth0_id
 • created_at
 • email
 • first_name
 • intern_id
 • last_name
 • phone
 • picture

--- Final_audit_logs (10 fields) ---
 • answer
 • assigned_at
 • content_id
 • intern_id
 • judgment
 • length
 • qa_index
 • question
 • time_taken
 • timestamp

--- completed_content (4 fields) ---
 • content
 • content_id
 • content_text
 • uploaded_at

--- Content (4 fields) ---
 • content
 • content_id
 • content_text
 • uploaded_at

--- user_logs (4 fields) ---
 • action
 • date
 • details
 • intern_id

--- system_logs (4 fields) ---
 • details
 • event
 • message
 • timestamp

--- audit_logs (10 fields) ---
 • answer
 • assigned_at
 • content_id
 • intern_id
 • judgment

In [1]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# 1) load MONGO_URI from .env
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

# 2) connect
client = MongoClient(MONGO_URI)
db     = client["Tel_QA"]
coll   = db["Content"]

# 3) update all docs that still have 'content'
result = coll.update_many(
    { "content": { "$exists": True } },
    [
      # stage 1: ensure content_text is set
      { 
        "$set": {
          # if content_text already exists/nonnull, keep it;
          # otherwise, pull in the 'content' value
          "content_text": {
            "$ifNull": ["$content_text", "$content"]
          }
        }
      },
      # stage 2: remove the old 'content' field
      { "$unset": "content" }
    ]
)

print(f"Matched docs:  {result.matched_count}")
print(f"Modified docs: {result.modified_count}")


Matched docs:  72
Modified docs: 72


In [2]:
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load your Mongo URI from a .env file
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError("Please set MONGO_URI in your .env file")

DB_NAME          = "Tel_QA"
SKIPPED_COLL_NAME = "skipped_logs"

def main():
    client      = MongoClient(MONGO_URI)
    db          = client[DB_NAME]
    skipped_col = db[SKIPPED_COLL_NAME]

    # Delete all docs with 900 <= content_id <= 950
    result = skipped_col.delete_many({
        "content_id": { "$gte": 900, "$lte": 950 }
    })

    print(f"Deleted {result.deleted_count} documents from '{SKIPPED_COLL_NAME}' "
          f"where content_id is between 900 and 950.")

if __name__ == "__main__":
    main()


Deleted 42 documents from 'skipped_logs' where content_id is between 900 and 950.
