In [None]:
import json
import logging
from typing import List, Dict
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer

# Configure the root logger at import time: INFO level and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

def load_jsonl_data(file_path: str) -> List[Dict]:
    """Load article records from a JSON or JSONL file.

    The file content is first parsed as one JSON document (an array of
    records, or a single record object). If that fails, the content is parsed
    line by line as JSONL: blank lines are skipped, and lines that are not
    valid JSON are logged and skipped.

    Records that are not JSON objects (for example a bare number or string)
    are dropped with a warning, so the result really is a list of dicts.

    Args:
        file_path: Path of the JSON/JSONL file to read.

    Returns:
        The loaded records, in file order. Empty if nothing usable was found.

    Raises:
        ValueError: If the file is one valid JSON document that is neither an
            array nor an object.
        OSError: If the file cannot be opened or read.
    """
    # 'utf-8-sig' strips a leading byte-order mark when there is one and is
    # otherwise identical to 'utf-8'. With plain 'utf-8' a BOM makes both the
    # whole-file parse and the first JSONL line fail, losing the first record.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        text = f.read()

    try:
        data = json.loads(text)
    except json.JSONDecodeError as e:
        logger.warning(f"Single JSON load failed: {e}. Falling back to JSONL parsing.")
        records = []
        # Text mode already normalised line endings to '\n', so splitting on
        # '\n' yields the same lines (and line numbers) as iterating the file.
        for line_number, line in enumerate(text.split("\n"), 1):
            line = line.strip()
            if not line:
                logger.debug(f"Line {line_number}: Skipping empty line")
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                logger.error(f"Line {line_number}: Skipping invalid line: {line[:100]}...")
    else:
        if isinstance(data, list):
            records = data
            logger.info("Loaded as single JSON array")
        elif isinstance(data, dict):
            records = [data]
            logger.info("Loaded as single JSON object")
        else:
            raise ValueError("Unexpected data structure")

    # Downstream code calls .get() on every record, so keep only JSON objects.
    articles = [record for record in records if isinstance(record, dict)]
    dropped = len(records) - len(articles)
    if dropped:
        logger.warning(f"Skipped {dropped} record(s) that were not JSON objects")

    logger.info(f"Loaded {len(articles)} articles from {file_path}")
    return articles

def generate_embedding(text: str, model: SentenceTransformer) -> List[float]:
    """Encode ``text`` with ``model`` and return the vector as a plain list.

    Any exception raised while encoding or converting the result is logged
    and swallowed; an empty list is returned in that case, so callers can
    detect failure with a simple truthiness check.
    """
    result: List[float] = []
    try:
        vector = model.encode(text)
        values = vector.tolist()
        logger.debug(f"Generated embedding for text (length: {len(values)})")
        result = values
    except Exception as e:
        logger.error(f"Failed to generate embedding: {e}")
    return result

def create_elasticsearch_index(es: Elasticsearch, index_name: str, embedding_dims: int = 384):
    """Create the article index with its field mapping if it does not exist.

    The mapping declares ``guid`` as a keyword, ``title``, ``description`` and
    ``keywords`` as text, and ``embedding`` as a ``dense_vector``.

    Args:
        es: Connected Elasticsearch client.
        index_name: Name of the index to create.
        embedding_dims: Number of dimensions of the ``embedding`` vector. It
            must match the output size of the embedding model; the default of
            384 is the dimension of all-MiniLM-L6-v2.

    Raises:
        Exception: Any error from the existence check or the create call is
            logged and re-raised unchanged.
    """
    mappings = {
        "properties": {
            "guid": {"type": "keyword"},
            "title": {"type": "text"},
            "description": {"type": "text"},
            "keywords": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": embedding_dims
            }
        }
    }
    try:
        if not es.indices.exists(index=index_name):
            es.indices.create(index=index_name, body={"mappings": mappings})
            logger.info(f"Created Elasticsearch index: {index_name}")
        else:
            logger.info(f"Index {index_name} already exists")
    except Exception as e:
        logger.error(f"Failed to create index {index_name}: {e}")
        raise

def index_articles(es: Elasticsearch, articles: List[Dict], model: SentenceTransformer, index_name: str, chunk_size: int = 100):
    """Embed articles and bulk-index them into Elasticsearch, chunk by chunk.

    For each article the embedding text is "title. description. keywords";
    when the description is missing or shorter than 20 characters, only the
    title and keywords are used. Articles whose embedding cannot be generated
    are skipped. A failure in one chunk is logged and does not stop the
    remaining chunks.

    Args:
        es: Connected Elasticsearch client.
        articles: Article records. The keys read are ``guid``, ``title``,
            ``description``, ``keywords`` and, when present, ``group``.
        model: Embedding model handed to ``generate_embedding``.
        index_name: Target index.
        chunk_size: Number of articles passed to each bulk call; must be
            positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make the chunk loop silently index nothing.
        raise ValueError("chunk_size must be a positive integer")

    def generate_actions(articles_chunk: List[Dict]):
        for article in articles_chunk:
            guid = article.get("guid")
            # `or ""` also covers keys that are present but null, which would
            # otherwise be rendered as the text "None" in the embedding input.
            title = article.get("title") or ""
            description = article.get("description") or ""

            raw_keywords = article.get("keywords")
            if isinstance(raw_keywords, str):
                # Joining a string would interleave ", " between characters.
                keywords = raw_keywords
            elif raw_keywords:
                keywords = ", ".join(str(keyword) for keyword in raw_keywords)
            else:
                keywords = ""

            # Fallback logic for short or missing descriptions
            if not description or len(description.strip()) < 20:
                text_for_embedding = f"{title}. {keywords}".strip()
                logger.warning(f"Short/missing description for GUID {guid}, using title and keywords")
            else:
                text_for_embedding = f"{title}. {description}. {keywords}".strip()

            embedding = generate_embedding(text_for_embedding, model)
            if not embedding:
                logger.warning(f"Skipping article with GUID {guid} due to embedding failure")
                continue

            source = {
                "guid": guid,
                "title": title,
                "description": description,
                "keywords": keywords,
                "embedding": embedding
            }
            # Keep the article's group so search-time filters on `group`
            # (main() in this file uses one) have something to match.
            if "group" in article:
                source["group"] = article["group"]

            action = {"_index": index_name, "_source": source}
            # Without a guid, leave `_id` out so Elasticsearch assigns one
            # rather than receiving a null id.
            if guid is not None:
                action["_id"] = guid
            yield action

    total_indexed = 0
    for chunk_number, start in enumerate(range(0, len(articles), chunk_size), 1):
        chunk = articles[start:start + chunk_size]
        try:
            # With the default raise_on_error=True, helpers.bulk raises on the
            # first rejected document, so `failed` would never be non-empty
            # here and the chunk's success count would be lost.
            success, failed = helpers.bulk(es, generate_actions(chunk), raise_on_error=False)
            total_indexed += success
            logger.info(f"Indexed {success} articles in chunk {chunk_number}")
            if failed:
                logger.error(f"Failed to index {len(failed)} articles in chunk {chunk_number}")
        except Exception as e:
            logger.error(f"Error indexing chunk {chunk_number}: {e}")
    logger.info(f"Total articles indexed: {total_indexed}")

def search_recommendations(es: Elasticsearch, model: SentenceTransformer, index_name: str, query: str, top_k: int = 5, filters: Dict = None) -> List[Dict]:
    """Return up to ``top_k`` articles ranked by similarity to ``query``.

    The query text is embedded and sent as a ``script_score`` query whose
    score is ``cosineSimilarity + 1.0`` against the ``embedding`` field.
    ``filters``, when given and non-empty, is used as the inner query that
    selects the candidate documents; otherwise every document is a candidate.

    Returns:
        One dict per hit with ``guid``, ``title``, ``description``,
        ``keywords`` and ``score``. An empty list is returned when the query
        embedding cannot be generated or the search fails; both cases are
        logged.
    """
    query_vector = generate_embedding(query, model)
    if not query_vector:
        logger.error("Failed to generate query embedding")
        return []

    # Assemble the request from its parts: candidate selection, then scoring.
    candidate_query = filters if filters else {"match_all": {}}
    scoring_script = {
        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
        "params": {"query_vector": query_vector}
    }
    request_body = {
        "query": {
            "script_score": {
                "query": candidate_query,
                "script": scoring_script
            }
        },
        "size": top_k
    }

    try:
        response = es.search(index=index_name, body=request_body)
        results = []
        for hit in response['hits']['hits']:
            doc_id = hit["_id"]
            source = hit["_source"]
            results.append({
                "guid": doc_id,
                "title": source["title"],
                "description": source["description"],
                "keywords": source["keywords"],
                "score": hit["_score"]
            })
        logger.info(f"Retrieved {len(results)} recommendations for query: {query[:50]}...")
        return results
    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []

def main():
    """Index the article file and print three example recommendation searches.

    Steps: connect to Elasticsearch, load the embedding model, create the
    index if needed, load and index the articles, refresh the index, then run
    a free-text query, a query built from the first loaded article, and the
    same article query restricted by a ``group`` filter.

    Raises:
        Exception: Connection, model-loading and index-creation errors are
            logged and re-raised.
    """
    # Configuration
    es_host = "http://192.168.0.123:9200"
    # NOTE(review): credentials are hard-coded and certificate verification is
    # disabled below; move both to configuration before using this outside a
    # local test cluster.
    es_auth = ("elastic", "elastic")  # Update with your credentials
    index_name = "article_recommender"
    jsonl_file = "C:\\workspace\\NCF_minds\\redfin_label_api\\data\\output\\articles_with_keywords.jsonl"

    # Initialize Elasticsearch client
    try:
        es = Elasticsearch(
            es_host,
            basic_auth=es_auth,
            verify_certs=False,
            request_timeout=30
        )
        info = es.info()
        logger.info(f"Connected to Elasticsearch: {info['cluster_name']}")
    except Exception as e:
        logger.error(f"Elasticsearch connection failed: {e}")
        raise

    # Load embedding model
    try:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        logger.info("Loaded SentenceTransformer model")
    except Exception as e:
        logger.error(f"Failed to load SentenceTransformer model: {e}")
        raise

    # Create index
    create_elasticsearch_index(es, index_name)

    # Load and index articles
    articles = load_jsonl_data(jsonl_file)
    if not articles:
        logger.error("No valid articles loaded. Exiting.")
        return
    index_articles(es, articles, model, index_name, chunk_size=100)

    # Newly indexed documents only become searchable after a refresh, so force
    # one before querying instead of relying on the periodic refresh.
    try:
        es.indices.refresh(index=index_name)
    except Exception as e:
        logger.warning(f"Index refresh failed, search results may be incomplete: {e}")

    # Example recommendation searches
    # 1. Text-based query
    query = "AI in medical research"
    recommendations = search_recommendations(es, model, index_name, query, top_k=5)
    print("\nRecommendations for query: 'AI in medical research'")
    for rec in recommendations:
        print(f"- {rec['title']} (Score: {rec['score']:.3f})")

    # 2. Article-based query (using an existing article's content)
    # Read the fields defensively: index_articles tolerates missing or null
    # fields, so the sample article may lack any of them.
    sample_article = articles[0]
    sample_title = sample_article.get("title") or ""
    sample_description = sample_article.get("description") or ""
    sample_keywords = sample_article.get("keywords") or []
    if isinstance(sample_keywords, str):
        keyword_text = sample_keywords
    else:
        keyword_text = ", ".join(str(keyword) for keyword in sample_keywords)
    query = f"{sample_title}. {sample_description}. {keyword_text}"
    recommendations = search_recommendations(es, model, index_name, query, top_k=5)
    print(f"\nRecommendations for article: '{sample_title}'")
    for rec in recommendations:
        print(f"- {rec['title']} (Score: {rec['score']:.3f})")

    # 3. Filtered query (e.g., only articles from 'frontier_lab' group)
    # NOTE(review): this only matches if the indexed documents carry a `group`
    # field -- confirm that the indexing step stores it.
    filters = {"match": {"group": "frontier_lab"}}
    recommendations = search_recommendations(es, model, index_name, query, top_k=5, filters=filters)
    print(f"\nRecommendations for article: '{sample_title}' (filtered by group: frontier_lab)")
    for rec in recommendations:
        print(f"- {rec['title']} (Score: {rec['score']:.3f})")

    logger.info("✅ Articles indexed and recommendations generated!")

# Run the indexing and search demo only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-09-01 10:26:00,452 - INFO - GET http://192.168.0.123:9200/ [status:200 duration:0.015s]
2025-09-01 10:26:00,452 - INFO - Connected to Elasticsearch: docker-cluster
2025-09-01 10:26:00,452 - INFO - Use pytorch device_name: cpu
2025-09-01 10:26:00,452 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-09-01 10:26:06,088 - INFO - Loaded SentenceTransformer model
2025-09-01 10:26:06,088 - INFO - HEAD http://192.168.0.123:9200/article_recommender [status:404 duration:0.000s]
2025-09-01 10:26:12,613 - INFO - PUT http://192.168.0.123:9200/article_recommender [status:200 duration:6.525s]
2025-09-01 10:26:12,613 - INFO - Created Elasticsearch index: article_recommender
2025-09-01 10:26:12,665 - INFO - Loaded 1694 articles from C:\workspace\NCF_minds\redfin_label_api\Data\output\articles_with_keywords.jsonl
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.93it/s]
Batches: 


Recommendations for query: 'AI in medical research'
- Developing reliable AI tools for healthcare (Score: 1.694)
- AI Analyzes Nurses’ Observations to Reduce Patient Danger (Score: 1.591)
- MCP for Research: How to Connect AI to Research Tools (Score: 1.573)
- A catalogue of genetic mutations to help pinpoint the cause of diseases (Score: 1.564)
- Using AI to improve patient access to clinical trials (Score: 1.561)


Batches: 100%|██████████| 1/1 [00:00<00:00, 59.18it/s]
2025-09-01 10:27:17,908 - INFO - POST http://192.168.0.123:9200/article_recommender/_search [status:200 duration:0.012s]
2025-09-01 10:27:17,912 - INFO - Retrieved 5 recommendations for query: Accelerating life sciences research. Discover how ...



Recommendations for article: 'Accelerating life sciences research'
- Accelerating life sciences research (Score: 2.000)
- Preparing for future AI risks in biology (Score: 1.577)
- Strengthening America’s AI leadership with the U.S. National Laboratories (Score: 1.545)
- OpenAI Scholars 2019: Meet our Scholars (Score: 1.527)
- Accelerating the development of life-saving treatments (Score: 1.515)


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.47it/s]
2025-09-01 10:27:17,966 - INFO - POST http://192.168.0.123:9200/article_recommender/_search [status:200 duration:0.014s]
2025-09-01 10:27:17,966 - INFO - Retrieved 0 recommendations for query: Accelerating life sciences research. Discover how ...
2025-09-01 10:27:17,966 - INFO - ✅ Articles indexed and recommendations generated!



Recommendations for article: 'Accelerating life sciences research' (filtered by group: frontier_lab)
