In [None]:
!pip install spacy


In [None]:
!python -m spacy download en_core_web_md

In [None]:
import spacy
import json
from collections import defaultdict
from datetime import datetime


# Load the ``en_core_web_md`` spaCy pipeline once at module level; the
# similarity, tag and number helpers in this file all call this shared ``nlp``.
nlp = spacy.load("en_core_web_md")

def parse_report_file(filename):
    """Parse a text file into a list of report items.

    The file is read line by line. Lines starting with ``Title:``,
    ``Summary:``, ``Insight:`` or ``Timestamp:`` set the corresponding
    lowercase key of the item being built. A blank line or a ``---`` line
    closes the current item, but only when it already has a title, a summary
    and an insight; otherwise the partially built item keeps accumulating
    fields. Lines that match no known prefix are ignored.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts, each with the keys ``title``, ``summary`` and
        ``insight`` and, when present in the file, ``timestamp``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (from ``open``).
    """
    field_prefixes = (
        ("Title:", "title"),
        ("Summary:", "summary"),
        ("Insight:", "insight"),
        ("Timestamp:", "timestamp"),
    )
    required_keys = ("title", "summary", "insight")

    def is_complete(item):
        return all(key in item for key in required_keys)

    reports = []
    current_item = {}
    with open(filename, "r", encoding="utf-8") as f:
        # Iterate the file object directly instead of materialising every
        # line with readlines().
        for raw_line in f:
            line = raw_line.strip()
            if not line or line == "---":
                if is_complete(current_item):
                    reports.append(current_item)
                    current_item = {}
                continue
            for prefix, key in field_prefixes:
                if line.startswith(prefix):
                    # Drop only the leading label. str.replace() would also
                    # delete any later occurrence of the label inside the
                    # value itself (e.g. "Title: Re: Title: ...").
                    current_item[key] = line[len(prefix):].strip()
                    break
    # The last item is not necessarily followed by a separator.
    if is_complete(current_item):
        reports.append(current_item)
    return reports

def compute_similarity(text1, text2):
    """Return the spaCy similarity score between ``text1`` and ``text2``.

    Each text is run through the module-level ``nlp`` pipeline and the first
    resulting document is compared with the second via ``Doc.similarity``.
    """
    first_doc, second_doc = nlp(text1), nlp(text2)
    return first_doc.similarity(second_doc)

def extract_tags(summary):
    """Map keywords found in ``summary`` to topic tags.

    The summary is tokenised with the module-level ``nlp`` pipeline and each
    lowercased token is looked up in a fixed keyword table. Tags are returned
    in the order they are first triggered, without duplicates; when no
    keyword matches the result is ``["General"]``.
    """
    keyword_map = {
        "Tech": ["tech", "ai", "technology", "stocks"],
        "Economy": ["fed", "rate", "inflation", "economy"],
        "Markets": ["stocks", "market", "sector"],
        "Energy": ["oil", "energy"]
    }
    found_tags = []
    for token in nlp(summary):
        word = token.lower_
        # A single word may belong to several tags (e.g. "stocks").
        triggered = [label for label, words in keyword_map.items() if word in words]
        for label in triggered:
            if label not in found_tags:
                found_tags.append(label)
    if not found_tags:
        return ["General"]
    return found_tags

def extract_numbers(summary):
    """Return the text of numeric entities found in ``summary``.

    The summary is processed with the module-level ``nlp`` pipeline; entities
    labelled ``PERCENT``, ``CARDINAL`` or ``MONEY`` are collected in document
    order.
    """
    numeric_labels = ["PERCENT", "CARDINAL", "MONEY"]
    collected = []
    for entity in nlp(summary).ents:
        if entity.label_ in numeric_labels:
            collected.append(entity.text)
    return collected

def resolve_conflict(summary_a, summary_b, item_a, item_b):
    """Resolve two analysts' summaries of the same story into one.

    The numeric entities of both summaries (as returned by
    ``extract_numbers``) are compared:

    1. If they are equal, the longer summary is kept (``summary_b`` on a
       tie).
    2. Otherwise, if both items carry a ``timestamp`` in
       ``%Y-%m-%d %H:%M:%S`` format, the summary of the later item wins
       (``summary_b`` on a tie).
    3. Otherwise, if any extracted number contains ``%``, the percentages of
       both summaries are merged into a ``min-max%`` range that replaces the
       first percentage of ``summary_a``. When ``summary_a`` has no
       percentage, ``summary_b`` is returned unchanged.
    4. Otherwise both summaries are concatenated and flagged for review.

    Args:
        summary_a: Summary text from analyst A.
        summary_b: Summary text from analyst B.
        item_a: Report dict of analyst A; only ``timestamp`` is read.
        item_b: Report dict of analyst B; only ``timestamp`` is read.

    Returns:
        A ``(summary, conflict_note)`` tuple. ``conflict_note`` is ``None``
        unless the conflict could not be resolved automatically.
    """
    numbers_a = extract_numbers(summary_a)
    numbers_b = extract_numbers(summary_b)

    if numbers_a == numbers_b:
        # No numeric disagreement: prefer the more detailed (longer) text.
        return (summary_a if len(summary_a) > len(summary_b) else summary_b), None

    timestamp_a = item_a.get("timestamp")
    timestamp_b = item_b.get("timestamp")
    if timestamp_a and timestamp_b:
        try:
            time_a = datetime.strptime(timestamp_a, "%Y-%m-%d %H:%M:%S")
            time_b = datetime.strptime(timestamp_b, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Deliberate best effort: an unparseable timestamp just means we
            # cannot rank by recency, so fall through to the next strategy.
            pass
        else:
            return (summary_a if time_a > time_b else summary_b), None

    percents_a = [num for num in numbers_a if "%" in num]
    percents_b = [num for num in numbers_b if "%" in num]
    if percents_a or percents_b:
        try:
            values = [
                float(num.replace(",", "").rstrip("%").strip())
                for num in percents_a + percents_b
            ]
        except ValueError:
            # Entity text such as "about 5%" is not a plain number; do not
            # crash, let the pair be flagged for manual review below.
            values = None
        if values is not None:
            if not percents_a:
                # Nothing to rewrite in A; keep B's wording as-is.
                return summary_b, None
            range_text = f"{min(values)}-{max(values)}%"
            # Replace the percentage itself. numbers_a[0] may be a CARDINAL
            # or MONEY entity, which must not be overwritten by the range.
            return summary_a.replace(percents_a[0], range_text), None

    return (
        f"{summary_a} (Analyst A); {summary_b} (Analyst B)",
        "Conflicting data detected, review required.",
    )

def deduplicate_reports(report_a, report_b, similarity_threshold=0.8):
    """Merge two analysts' report lists, combining items about the same story.

    For every item of ``report_a`` the not-yet-used items of ``report_b`` are
    scanned in order; the first one whose title similarity (via
    ``compute_similarity``) exceeds ``similarity_threshold`` is merged with
    it. The merged entry keeps A's title, a summary chosen by
    ``resolve_conflict``, the combined tags of both summaries and both
    analysts' insights. Unmatched items of either report are carried over as
    single-analyst entries (A's first, then B's).

    Args:
        report_a: List of report dicts from analyst A, each with ``title``,
            ``summary`` and ``insight``.
        report_b: List of report dicts from analyst B, same shape.
        similarity_threshold: Title similarity that must be strictly
            exceeded for two items to count as duplicates.

    Returns:
        A list of merged report dicts with sequential 1-based ``id`` values.
        A ``conflict_note`` key is present only when ``resolve_conflict``
        returned one.
    """
    merged_reports = []
    used_indices_b = set()

    def single_analyst_entry(item, analyst_name):
        # Entry for an item that has no counterpart in the other report.
        return {
            "id": len(merged_reports) + 1,
            "title": item["title"],
            "summary": item["summary"],
            "tags": extract_tags(item["summary"]),
            "analysts": [{"name": analyst_name, "insight": item["insight"]}],
        }

    for item_a in report_a:
        matched = False
        for i, item_b in enumerate(report_b):
            if i in used_indices_b:
                continue
            if "title" not in item_a or "title" not in item_b:
                continue
            similarity = compute_similarity(item_a["title"], item_b["title"])
            if similarity <= similarity_threshold:
                continue

            merged_summary, conflict_note = resolve_conflict(
                item_a["summary"], item_b["summary"], item_a, item_b
            )
            combined_tags = extract_tags(item_a["summary"]) + extract_tags(item_b["summary"])
            merged_item = {
                "id": len(merged_reports) + 1,
                "title": item_a["title"],
                "summary": merged_summary,
                # dict.fromkeys de-duplicates while keeping first-seen order.
                # list(set(...)) ordered tags by string hash, which changes
                # between interpreter runs and made the JSON output unstable.
                "tags": list(dict.fromkeys(combined_tags)),
                "analysts": [
                    {"name": "Analyst A", "insight": item_a["insight"]},
                    {"name": "Analyst B", "insight": item_b["insight"]},
                ],
            }
            if conflict_note:
                merged_item["conflict_note"] = conflict_note
            merged_reports.append(merged_item)
            used_indices_b.add(i)
            matched = True
            break
        if not matched:
            merged_reports.append(single_analyst_entry(item_a, "Analyst A"))

    # Whatever is left in B was never matched against an item of A.
    for i, item_b in enumerate(report_b):
        if i not in used_indices_b:
            merged_reports.append(single_analyst_entry(item_b, "Analyst B"))

    return merged_reports

def save_to_json(data, filename="reports.json"):
    """Write ``data`` to ``filename`` as JSON indented by two spaces.

    The file is opened for writing with UTF-8 encoding, so an existing file
    of the same name is overwritten.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, fp=out_file, indent=2)

# Driver: parse both analysts' files, merge them, write reports.json and echo
# the merged result. Errors are reported with print() rather than raised.
try:
    report_a = parse_report_file("Analyst1.txt")
    report_b = parse_report_file("Analyst2.txt")
    merged_data = deduplicate_reports(report_a, report_b)
    # Uses save_to_json's default target, "reports.json".
    save_to_json(merged_data)
    print("Successfully generated reports.json")
    print(json.dumps(merged_data, indent=2))
except FileNotFoundError as e:
    # Raised by open() when either input file is missing.
    print(f"Error: File not found - {e}")
except Exception as e:
    # Top-level boundary: report any other failure instead of a traceback.
    print(f"Error processing reports: {e}")

Error: File not found - [Errno 2] No such file or directory: 'Analyst1.txt'
