## Get the data

In [1]:
import getpass
import os
import sys
import time
#set openAI api key
#os.environ["OPENAI_API_KEY"] = ""

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from langchain.document_loaders import DataFrameLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders import DataFrameLoader
import json
from langchain.vectorstores import Pinecone
# LLM wrapper
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI

from langchain import SerpAPIWrapper, LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationSummaryBufferMemory
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
# Helper function for printing docs
import textwrap

def pretty_text(text):
    wrapped_text = textwrap.wrap(text, width=100)
    for line in wrapped_text:
        print(line)


def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


### Load the index

In [3]:
def load_embedding_db(index_name):
    from langchain.vectorstores import FAISS
    # You may need to import the embeddings model depending on your application's structure
    # from langchain.embeddings.openai import OpenAIEmbeddings
    embeddings = OpenAIEmbeddings()
    db = FAISS.load_local(index_name, embeddings)
    return db

#db = load_embedding_db("faiss_index_1000_200_1000papers")
#db = load_embedding_db("faiss_index_1000")
db = load_embedding_db("faiss_index_1000_magnetar_OR_fast_radio_burst")
retriever = db.as_retriever(
    search_kwargs={"k":100, "include_metadata": True})
#retriever

In [9]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings = OpenAIEmbeddings()
# ドキュメントを小さな塊に分割する
# splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator=". ")
# 冗長ドキュメントを削除するフィルター
# redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
# 類似度フィルター
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
# パイプラインでスプリッターとフィルターを繋ぐ。transformers=[splitter, redundant_filter, relevant_filter]とかにする
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[relevant_filter]
)

compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)

from langchain.prompts import PromptTemplate

prompt_template = """
You are Dr. Origins, a specialist in observational studies of magnetars. Your expertise lies in reading and critically interpreting astronomy papers to generate innovative, research-based ideas.
Every idea should commence with "I propose...".

Guidelines:
1. Base your ideas on scientifically recognized theories and principles.
2. Your ideas should be feasibly verifiable and provide avenues for further exploration or research in this field.
3. Abstain from making overly speculative claims or assertions that cannot be empirically tested.
4. Always accurately reference established theories, observational data, or universally accepted astronomical concepts. Do not misrepresent or fabricate scientific references. If you are unsure about a reference, do not use it.
5. Clearly distinguish your ideas from referenced material. Explain how the referenced research inspired your idea.
6. Learn from feedback. Improve and adjust your proposal according to received input.
7. Use less than 250 words.
In response to a human query, generate an informed, precise, and critical response, ensuring your answer's clarity and originality.

Context: {context}
Human: {question}
Dr. Origins: """
 

DRC_PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

#citationありだと何故かエラーを吐いたので消した
doc_template = """--- document start ---
content:{page_content}
--- document end ---
"""

ASTRO_DOC_PROMPT = PromptTemplate(
    template=doc_template,
    input_variables=["page_content"],
)

from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain

model_name = "gpt-4"
llm_qg = ChatOpenAI(temperature=0.2, model_name=model_name)


TEMP = 0.7
llm = ChatOpenAI(temperature=TEMP, model_name=model_name)

# CONDENSE_QUESTION_PROMPT はここまでの履歴を要約してプロンプトを入れてくれるらしい
question_generator = LLMChain(llm=llm_qg, prompt=CONDENSE_QUESTION_PROMPT) # this is the question generator, i probably need to change it to another model instance

#chain_type="stuff", 詰め込み方式 関連するデータをすべて詰め込む
doc_chain = load_qa_chain(
    llm=llm, 
    chain_type="stuff", 
    prompt=DRC_PROMPT, 
    document_prompt=ASTRO_DOC_PROMPT
)

memory = ConversationSummaryBufferMemory(llm=llm, memory_key="chat_history", return_messages=True, output_key="answer")

app_retriever = compression_retriever

chain = ConversationalRetrievalChain(
    retriever=app_retriever,
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    memory=memory,
    return_source_documents=True,
    max_tokens_limit=7000,
)

In [10]:
query = """
In the field of magnetar research, the relationship between magnetar and fast radio bursts, and how much magnetar is contained in fast radio bursts, are now the focus of much attention. What kind of observation proposals would be effective for observational research using satellites such as JWST (optical), NuSTAR (hard X-ray), and NICER (soft X-ray)?
"""
result = chain({"question": query})

In [11]:
tmp = dict()
tmp["question"] = result["question"]
tmp["answer"] = result["answer"]
tmp["source_documents"] = []
for item in result["source_documents"]:
    tmp2 = dict()
    tmp2["metadata"] = item.metadata
    tmp2["page_content"] = item.page_content
    tmp["source_documents"].append(tmp2)
if not os.path.exists("results"):
    os.makedirs("results")
with open(f'./results/result_{int(time.time())}_magneter_1000_07_e4_a2.json', 'w') as f:
    json.dump(tmp, f, indent=4)

In [12]:
print(result["answer"])

I propose a multi-wavelength observation campaign to study magnetars and their associated fast radio bursts (FRBs). Based on the mentioned studies, FRBs have been linked to magnetars, but the exact mechanism of their emission is still unclear. 

1. We could use the NuSTAR satellite to observe high-energy X-ray emissions from active magnetars, such as SGR J1935+2154, as it has been suggested that these bursts are linked to FRBs (Ridnaia et al. 2020; Tavani et al. 2020). Observing these bursts could help us understand the energy mechanisms behind FRBs.

2. Simultaneously, we could use the NICER satellite to observe the softer X-ray emissions from these magnetars. The correlation between soft X-ray and hard X-ray emissions could provide insight into the magnetar's internal structure and energy distribution.

3. In conjunction with these X-ray observations, we could use the JWST to observe the optical emissions from these magnetars, especially during active periods. This could provide addi

In [13]:
pretty_text(result["answer"])

I propose a multi-wavelength observation campaign to study magnetars and their associated fast radio
bursts (FRBs). Based on the mentioned studies, FRBs have been linked to magnetars, but the exact
mechanism of their emission is still unclear.   1. We could use the NuSTAR satellite to observe
high-energy X-ray emissions from active magnetars, such as SGR J1935+2154, as it has been suggested
that these bursts are linked to FRBs (Ridnaia et al. 2020; Tavani et al. 2020). Observing these
bursts could help us understand the energy mechanisms behind FRBs.  2. Simultaneously, we could use
the NICER satellite to observe the softer X-ray emissions from these magnetars. The correlation
between soft X-ray and hard X-ray emissions could provide insight into the magnetar's internal
structure and energy distribution.  3. In conjunction with these X-ray observations, we could use
the JWST to observe the optical emissions from these magnetars, especially during active periods.
This could provide addi