In [4]:
import spacy
import json
from collections import defaultdict

# Load spaCy model for semantic similarity
nlp = spacy.load("en_core_web_md")


# Simulated analyst reports (replace with file inputs in production)
report_a = [
    {"title": "Tech Stocks Surge on AI Optimism", "summary": "Tech stocks rose 3% driven by AI advancements.", "insight": "Strong buy on AI-focused firms like NVIDIA."},
    {"title": "Federal Reserve Signals Rate Pause", "summary": "Fed indicates no rate hikes until Q3 2025.", "insight": "Positive for bond markets."}
]
report_b = [
    {"title": "AI-Driven Tech Stocks Up 3%", "summary": "Tech sector gained 3% due to AI developments.", "insight": "Cautious on valuation risks in tech."},
    {"title": "Oil Prices Drop Amid Supply Concerns", "summary": "Oil prices fell 2% due to oversupply fears.", "insight": "Bearish on energy sector."}
]

def compute_similarity(text1, text2):
    """Calculate semantic similarity between two texts using spaCy."""
    doc1 = nlp(text1)
    doc2 = nlp(text2)
    return doc1.similarity(doc2)

def extract_tags(summary):
    """Extract tags from summary using keyword mapping."""
    doc = nlp(summary)
    tags = []
    keyword_map = {
        "tech": ["tech", "AI", "technology", "stocks"],
        "economy": ["fed", "rate", "inflation", "economy"],
        "markets": ["stocks", "market", "sector"],
        "energy": ["oil", "energy"]
    }
    for token in doc:
        for tag, keywords in keyword_map.items():
            if token.lower_ in keywords and tag not in tags:
                tags.append(tag.capitalize())
    return tags if tags else ["General"]

def deduplicate_reports(report_a, report_b, similarity_threshold=0.8):
    """Deduplicate reports and merge insights."""
    merged_reports = []
    used_indices_b = set()

    for item_a in report_a:
        matched = False
        for i, item_b in enumerate(report_b):
            if i in used_indices_b:
                continue
            similarity = compute_similarity(item_a["title"], item_b["title"])
            if similarity > similarity_threshold:
                # Merge similar items
                merged_item = {
                    "id": len(merged_reports) + 1,
                    "title": item_a["title"],  # Choose A's title (or improve logic)
                    "summary": item_a["summary"] if len(item_a["summary"]) > len(item_b["summary"]) else item_b["summary"],
                    "tags": list(set(extract_tags(item_a["summary"]) + extract_tags(item_b["summary"]))),
                    "analysts": [
                        {"name": "Analyst A", "insight": item_a["insight"]},
                        {"name": "Analyst B", "insight": item_b["insight"]}
                    ]
                }
                merged_reports.append(merged_item)
                used_indices_b.add(i)
                matched = True
                break
        if not matched:
            # Add non-duplicate item from A
            merged_reports.append({
                "id": len(merged_reports) + 1,
                "title": item_a["title"],
                "summary": item_a["summary"],
                "tags": extract_tags(item_a["summary"]),
                "analysts": [{"name": "Analyst A", "insight": item_a["insight"]}]
            })

    # Add remaining non-duplicate items from B
    for i, item_b in enumerate(report_b):
        if i not in used_indices_b:
            merged_reports.append({
                "id": len(merged_reports) + 1,
                "title": item_b["title"],
                "summary": item_b["summary"],
                "tags": extract_tags(item_b["summary"]),
                "analysts": [{"name": "Analyst B", "insight": item_b["insight"]}]
            })

    return merged_reports

def save_to_json(data, filename="reports.json"):
    """Save merged reports to JSON."""
    with open(filename, "w") as f:
        json.dump(data, f, indent=2)

# Process reports and save output
merged_data = deduplicate_reports(report_a, report_b)
save_to_json(merged_data)

# For demonstration, print the output
print(json.dumps(merged_data, indent=2))

[
  {
    "id": 1,
    "title": "Tech Stocks Surge on AI Optimism",
    "summary": "Tech stocks rose 3% driven by AI advancements.",
    "tags": [
      "Tech",
      "Tech",
      "Markets"
    ],
    "analysts": [
      {
        "name": "Analyst A",
        "insight": "Strong buy on AI-focused firms like NVIDIA."
      }
    ]
  },
  {
    "id": 2,
    "title": "Federal Reserve Signals Rate Pause",
    "summary": "Fed indicates no rate hikes until Q3 2025.",
    "tags": [
      "Economy",
      "Economy"
    ],
    "analysts": [
      {
        "name": "Analyst A",
        "insight": "Positive for bond markets."
      }
    ]
  },
  {
    "id": 3,
    "title": "AI-Driven Tech Stocks Up 3%",
    "summary": "Tech sector gained 3% due to AI developments.",
    "tags": [
      "Tech",
      "Markets"
    ],
    "analysts": [
      {
        "name": "Analyst B",
        "insight": "Cautious on valuation risks in tech."
      }
    ]
  },
  {
    "id": 4,
    "title": "Oil Prices Drop Ami

In [1]:
!pip install spacy





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     - -------------------------------------- 1.3/33.5 MB 6.7 MB/s eta 0:00:05
     --- ------------------------------------ 2.9/33.5 MB 7.3 MB/s eta 0:00:05
     ----- ---------------------------------- 4.7/33.5 MB 7.5 MB/s eta 0:00:04
     ------- -------------------------------- 6.6/33.5 MB 7.7 MB/s eta 0:00:04
     --------- ------------------------------ 8.1/33.5 MB 7.7 MB/s eta 0:00:04
     ----------- ---------------------------- 9.7/33.5 MB 7.6 MB/s eta 0:00:04
     ------------- -------------------------- 11.0/33.5 MB 7.5 MB/s eta 0:00:04
     -------------- ------------------------- 12.3/33.5 MB 7.4 MB/s eta 0:00:03
     ---------------- ----------------------- 13.6/33.5 MB 7.1 MB/s eta 0:00:03
     ------------------ --------------


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
