# Backtranslation Playground

### Setup

In [2]:
import os
import pandas as pd
import pinecone
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, ConversationChain
from langchain.chains import LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from tqdm import tqdm

import download_ads
# import prompts

  from tqdm.autonotebook import tqdm


# 1️⃣ Simple Backtranslation - No Context

#### Define the templates

In [2]:
template_string_1 = """Translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
query: ```{query}```
"""

template_string_2 = """First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described.
query: ```{query}```
"""

template_string_3 = """Translate the solr query text \
that is delimited by triple backticks into a polite request for \
the same item. \

Additional Context: the solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
The request should be phrased as if coming from a human user of ADS.

query: ```{query}```
"""

template_string_4 = """First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described.

Additional Context: the solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
The request should be phrased as if coming from a human user of ADS.

query: ```{query}```
"""

template_string_5 = """First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described.

Additional Context: the solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
Generate a request in the style of a human giving an instruction to an AI chatbot.

query: ```{query}```
"""

template_string_6 = """First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then generate a request for the item in the style \
of a human giving an instruction to an AI chatbot.

Additional Context: the solr queries were made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.


query: ```{query}```
"""

template_string_7 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described. 
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
Generate a request in the style of a human instructing an AI chatbot.
</context>

query: ```{query}```
"""

template_string_7_2 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described. 
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
Rephrase the request in the style of an expert human instructing an AI chatbot in short, succinct, commands.
</context>

query: ```{query}```
"""

template_string_7_3 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described in short command. 
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
</context>

query: ```{query}```
chatbot request:
"""

template_string_7_4 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described in short command. 

The solr queries were made to NASA ADS.
</instructions>



query: ```{query}```
"""

template_string_8 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described in the query.\
The request should be generated in the style of a human user instructing an AI.\
The human user is a domain expert and should make succinct requests.
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
Generate a request in the style of a human instructing an AI chatbot.
The user knows that they are searching the ADS database, and it is assumed\
they are looking for scholarly literature, so these should not be mentioned.
</context>

query: ```{query}```
"""

template_string_9 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described in the query.\
The request should be generated in the style of a human user instructing an AI.\
The human user is a domain expert and should make succinct requests.
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for scholarly literature in the domain of the physical sciences.
</context>

query: ```{query}```
"""


template_string_10 = """
<instructions>
First translate the solr query text \
that is delimited by triple backticks into a natural \
language description of the query. \
Then rephrase the description as a request for the item described. 
</instructions>

<context>
The solr queries are being made to NASA ADS,\
a search engine for literature in the domain of the physical sciences.\
Generate a request in the style of an expert physicist asking a colleague with background knowledge for information.
</context>

query: ```{query}```
"""

template_string_11 = """
You are a busy scientist who uses NASA ADS to find papers. \
As an expert user of ADS, you know how to interpret the solr \
queries sent to the ADS API and what they mean.\
You are tasked with rephrasing a solr query as a question or command in plain english.\
Since you are very busy, the way you rephrase should be short and only include the key\
words that capture the meaning of the query. Avoid using any redundant words. For example,\
if you specify a year, you do not need to say it is a year. If you specify a person's name, it 
is implied that they are an author.

query: ```{query}```
plain english interpretation: 
"""


ts_1 = '''
Given a specific solr query used in the NASA ADS system, predict \
and rephrase it in natural language as if the user were requesting \
a chatbot to perform the academic paper search for them. The query is \
delimited by triple backticks.\
The AI should \
maintain the original intent and specificity of the solr query, translating \
technical terms and operators into conversational language in as few words as possible that instructs \
the chatbot clearly. The response should reflect the user's need for \
particular information, accommodating various elements like author \
names, publication dates, keywords, etc., present in the original solr request.

query: ```{query}```
'''

ts_2 = '''
Given a specific solr query used in the NASA ADS system, predict \
and rephrase it in natural language as if the user were requesting \
a chatbot to perform the academic paper search for them. The query is \
delimited by triple backticks.\
The AI should \
maintain the original intent and specificity of the solr query, translating \
technical terms and operators into conversational language that instructs \
the chatbot clearly. The response should reflect the user's need for \
particular information, accommodating various elements like author \
names, publication dates, keywords, etc., present in the original solr request.

For instance:
Solr Query: "author:('Hawking, S.') AND title:('Black Holes')"
Chatbot Request: "Can you find papers by Stephen Hawking where he mentions 'Black Holes' in the title?"

Solr Query: "pubdate:[2020-01 TO 2020-12] AND keyword:('Exoplanets')"
Chatbot Request: "Please search for papers published in 2020 that discuss exoplanets."


These examples should help in understanding how to transform the technical \
structure of solr queries into natural, conversational requests \
without losing the specific search criteria.

query: ```{query}```
'''

ts_3 = '''
Given a specific solr query used in the NASA ADS system, predict \
and rephrase it in natural language as if the user were requesting \
a chatbot to perform the academic paper search for them. The query is \
delimited by triple backticks.\
The AI should \
maintain the original intent and specificity of the solr query, translating \
technical terms and operators into direct and to the point conversational \
language that instructs the chatbot clearly. The response should reflect \
the user's need for \
particular information, accommodating various elements like author \
names, publication dates, keywords, etc., present in the original solr request.

For instance:
Solr Query: "author:('Hawking, S.') AND title:('Black Holes')"
Chatbot Request: "Can you find papers by Stephen Hawking where he mentions 'Black Holes' in the title?"

Solr Query: "pubdate:[2020-01 TO 2020-12] AND keyword:('Exoplanets')"
Chatbot Request: "Please search for papers published in 2020 that discuss exoplanets."


These examples should help in understanding how to transform the technical \
structure of solr queries into natural, conversational requests \
without losing the specific search criteria.

query: ```{query}```

chatbot request:
'''

ts_4 = '''
Title: Converting Solr Queries from NASA ADS Usage into Natural Language Requests for a Chatbot

Description: The user provides Solr queries typically used on the NASA Astrophysics Data System (ADS) to search for academic papers. The AI is tasked with interpreting these queries and reformulating them into natural language as if the user were asking a chatbot to perform the search instead. This requires an understanding of both the specific syntax of Solr queries and the more general, conversational tone users employ with chatbots.

Examples:

### Example 1:
**User Input:**
```plaintext
title:"Large Magellanic Cloud" AND year:2023 AND author:"^Smith, J"
```

**AI Output:**
```plaintext
Can you find papers from 2023 titled "Large Magellanic Cloud" with an author whose name starts with "Smith, J"?
```

### Example 2:
**User Input:**
```plaintext
abstract:"exoplanets" AND body:"habitable zones" AND year:[2018 TO 2023]
```

**AI Output:**
```plaintext
Could you search for papers between the years 2018 and 2023 that mention "exoplanets" in the abstract and discuss "habitable zones" in the main text?
```

### Example 3:
**User Input:**
```plaintext
author:"Doe, John" AND keyword:"black holes" AND property:refereed
```

**AI Output:**
```plaintext
Can you find refereed publications by the author "Doe, John" that have keywords related to "black holes"?
```

### Instructions:
1. Analyze the structure of the Solr query delimited by triple backticks to understand what the user is searching for. Pay attention to specific fields such as title, author, year, keywords, properties of the paper, etc.
2. Transform the technical query into a conversational format, as if asking a chatbot to perform the search. Ensure the request is casual and natural-sounding, similar to how people speak in everyday conversation.
3. Maintain all critical information from the original query in the conversational request. Do not add or infer any details not explicitly mentioned in the initial query.
4. Use clear and straightforward language, avoiding jargon or technical terminology unfamiliar to the average person.
5. Be polite in your conversational request, as users typically are when interacting with a service-oriented chatbot.

**Things to Avoid:**
- Do not generate responses that are overly technical or contain jargon that the user may not understand.
- Avoid changing any specific details from the original query; the integrity of the search parameters must remain intact.
- Do not add filler or fluff to the conversational request. Keep it direct and to the point, though friendly and natural.
- Refrain from asking the user follow-up questions in the output; the task is to translate the query, not to continue the conversation.

query: ```{query}```
'''

ts_5 = '''
Given a specific solr query used in the NASA ADS system, transform and rephrase it into more natural, conversational language as if the user were asking a chatbot to perform the academic paper search for them. The AI should convert the original solr query, ensuring it sounds human-like and direct, as if one friend were asking another for a favor, while still preserving the exact intent and specificity of the original request. The transformation should consider casual phrasing, contractions, and common colloquialisms to ensure the request is friendly and relatable.
The solr query to rephrase is delimited by backticks.

Provide examples of solr queries and how they would be rephrased into casual chatbot requests. For instance:

1. Solr Query: "author:('Hawking, S.') AND title:('Black Holes')"
   Chatbot Request: "Can you find any of Stephen Hawking's papers with 'Black Holes' in the title?"

2. Solr Query: "pubdate:[2020-01 TO 2020-12] AND keyword:('Exoplanets')"
   Chatbot Request: "I'm looking for papers about exoplanets published in 2020."

These examples should illustrate the transformation from formal technical queries into relaxed, everyday language, making sure the specific search criteria are communicated in a friendly, approachable manner. The chatbot request should sound like one human asking another for help, rather than a user interacting with a system.

Solr Query: ```{query}```
Chatbot Request:
'''

#### Define the queries

In [14]:
sample_queries = [
    'json\n{\n\t"q": "author:\\"Huchra, John\\" year:1980-1990"\n}',
    'json{"q": "abs:\"neural networks\""}',
    'json{"q": "author:\"Kurtz\" abs:\"weak lensing\""}',
    'json{"q": "author:\"Kurtz\" -abs:\"weak lensing\""}', 
    'q=pubdate:[2020-01+TO+9999-12]+author:("ellis,+g")',
    'q=++author:"^Roman"++year:2021',
    'q=pubdate:[2023-01+TO+2023-12]+-author:("^Asaki")',
    'q=body:SNOLAB+AND+year:2023'
]

#### Do Backtranslation

Observations:
* gpt-3.5 is robust to differences in the formatting of the solr query
* gpt-3.5 is robust to different url encodings and will correctly interpret a '+' as indicating a space and '++' as meaning the '+' character
* The model treats any searchs for papers in 2023 as being in the future

In [15]:
# configuration
template = template_string_11
temperature = 1.0
instruct = False
model = 'gpt-3.5-turbo-instruct' if instruct else 'gpt-4-1106-preview'
prompt_template = ChatPromptTemplate.from_template(template)

In [22]:
import time

def backtranslate_batch(prompt_template, queries):
    # instruct model
    llm = OpenAI(model=model)
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)

    # regular turbo model
    chat = ChatOpenAI(temperature=temperature, model=model)

    nl_queries = []
    for query in tqdm(queries):
        message = prompt_template.format_messages(query=query)
        response = llm_chain.run(query=query) if instruct else chat(message).content 
        nl_queries.append(response)
        time.sleep(0.5)
    return nl_queries

In [23]:
# Try backtranslation with examples
# backtranslate_batch(prompt_template=prompt_template, queries=sample_queries)

### Translate logged queries

In [4]:
df = pd.read_csv('../../data/queries/parsed_queries.csv')
ads_queries = ['q='+query.strip() for query in df['query']]

In [25]:
responses = backtranslate_batch(prompt_template=prompt_template, queries=ads_queries)

100%|██████████| 402/402 [09:02<00:00,  1.35s/it]


In [26]:
stripped_responses = [response.strip("\"") for response in responses]

In [27]:
df_translated = df.copy()
df_translated['translation'] = stripped_responses

In [36]:
queries_path = "../../data/queries/"
df_translated.to_csv(queries_path + "translated_queries.csv", index=True)

In [5]:
#ignore
df_translated = pd.read_csv('../../data/queries/translated_queries.csv')


In [6]:
df_translated

Unnamed: 0.1,Unnamed: 0,status,n_results_found,query,results_json,translation
0,0,0,67.0,ngc 1502,"{'responseHeader': {'status': 0, 'QTime': 304,...",Find papers on NGC 1502.
1,1,0,20079.0,"abs:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 173,...","Find papers with ""Hercules"" in the abstract."
2,2,0,5210.0,"title:""Hercules""","{'responseHeader': {'status': 0, 'QTime': 17, ...","Find papers with ""Hercules"" in the title."
3,3,0,1.0,"author:""^Ogilvie"" year:2004 property:refereed","{'responseHeader': {'status': 0, 'QTime': 52, ...",Find refereed papers from 2004 by the first au...
4,4,0,11.0,"author:""Gardiner, Emiko C.""","{'responseHeader': {'status': 0, 'QTime': 64, ...",Find papers by author Emiko C. Gardiner.
...,...,...,...,...,...,...
397,397,0,1.0,arxiv:2109.13273,"{'responseHeader': {'status': 0, 'QTime': 3, '...",Find the arXiv paper with ID 2109.13273.
398,398,0,15.0,"author:(""^dotto"") abs:(dart)","{'responseHeader': {'status': 0, 'QTime': 23, ...",Find papers by first author Dotto with 'dart' ...
399,399,0,1.0,arxiv:1911.03069,"{'responseHeader': {'status': 0, 'QTime': 13, ...",Find the arXiv paper with ID 1911.03069.
400,400,0,23.0,"author:""^da Cunha, E.""","{'responseHeader': {'status': 0, 'QTime': 24, ...",Find papers by first author E. da Cunha.


In [61]:
import json

def generate_tasks(df: pd.DataFrame, output_path: str):
    for index, row in df.iterrows():
        gen_text = row['translation']
        solr_query = row['query']
        out_dict = {'id': index,
                    'gen_text': gen_text, 
                    'solr_query': solr_query,}
        
        with open(output_path+str(index)+".json", 'w') as f:
            json.dump(out_dict, f)

In [62]:
output_path = '../../data/queries/tasks/'
generate_tasks(df=df_translated, output_path=output_path)


#### Upload to GCP bucket

In [1]:
# !gsutil -m rm -r gs://ads-chat-data/examples/
!gsutil -m cp -r ../../data/queries/tasks/* gs://ads-chat-data/data/unlabeled



Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update

If you experience problems with multiprocessing on MacOS, they might be related to https://bugs.python.org/issue33725. You can disable multiprocessing by editing your .boto config or by adding the following flag to your command: `-o "GSUtil:parallel_process_count=1"`. Note that multithreading is still available even if you disable multiprocessing.

Copying file://../../data/queries/tasks/0.json [Content-Type=application/json]...
Copying file://../../data/queries/tasks/1.json [Content-Type=application/json]...
Exception in thread Thread-5:iB]                                                
Traceback (most recent call last):
  File "/Users/tannermarsh/opt/miniconda3/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/tannermarsh/opt/miniconda3/lib/python3.9/multiprocessing

In [45]:
# !gsutil -m rm -r gs://ads-chat-data/labeled-data/

# 2️⃣ Simple Backtranslation - w/ Fields

### Create Prompt

In [14]:
def create_prompt(include_explanation: bool = False) -> ChatPromptTemplate:
    explanation = (
        "The AI should produce the output. The AI should also provide a step-by-step explanation of how it arrived at the output. It is important that the AI provides both the output and the explanation so a human can understand how it created that output."
        if include_explanation
        else "The AI should only output the answer and no additional information."
    )

    chain_of_thought = "Before providing the final answer, the AI should think through the problem step-by-step."

    query_schema = ResponseSchema(
        name="q",
        description="The structured query based on Human input to be sent to ADS",
    )
    response_schemas = [query_schema]
    output_parser = StructuredOutputParser(response_schemas=response_schemas)

    format_instructions = output_parser.get_format_instructions()

    fields = download_ads.get_fields_names()

    operators = download_ads.get_operator_names()

    fields_examples = str(download_ads.get_examples())

    operators_examples = str(
        download_ads.get_operators_info()["name_example_explanation"]
    )

    specific_examples = """
Human: ```json
{
"q": "author:\"Huchra, John\" year:1980-1990"
}
```
AI: finds articles published between 1980 and 1990 by John Huchra

Human: ```json
{
"q": "abs:\"neural networks\""
}
```
AI: What are papers that mention neural networks in the abstract?

Human: ```json
{
"q": "abs:\"neural networks\""
}
```
AI: Give me papers that mention neural networks in the title or keywords or abstract

Human: ```json
{
"q": "body:\"neural networks\""
}
```
AI: Papers with that contain neural networks in the full text

Human: ```json
{
"q": "year:2002-2008"
}
```
AI: Everything from 2002 to 2008

Human: ```json
{
"q": "author:\"Kurtz\" abs:\"weak lensing\""
}
```
AI: What papers by Kurtz, et al discuss weak lensing?
"""

    multi_query_paragraph = download_ads.get_multi_query_paragraph()

    template = """

INSTRUCTIONS: 

The following is a conversation between a human and an AI. The AI should answer the question based on the context, examples, and current conversation provided. If the AI does not know the answer to a question, it truthfully says it does not know. 

{chain_of_thought}


CONTEXT: 

The AI is an expert database search engineer. Specifically, the AI is trained to create interpret queries that are submitted to NASA Astrophysics Data System (ADS), a digital library portal for researchers in astronomy and physics. The ADS system accepts queries using the Apache Solr search syntax. The AI predicts how a human user of the ADS system would express their structured query in natural language.

Here are all available fields and operators in the ADS database, where each field is separated by a space in this list: {fields} {operators}

{multi_query_paragraph}


AVAILABLE FIELDS: 

Here is an example for each of the available fields in the ADS database. The formatting is a Python list of lists. The inner list corresponds to an available field, is five elements long, and each element starts and ends with a single quote e.g. '. The first element is keywords associated with the field, the second element is the query syntax, the third element is the example query, the fourth element is associated notes, and the fifth element is the field name: 
{fields_examples}

AVAILABLE OPERATORS:
Here is an example for each of the available operators in the ADS database. The formatting is a Python list of lists. The inner list corresponds to an available operator, is three elements long, and each element starts and ends with a single quote e.g. '. The first element is the operator name, the second element is the example query, and the third element is associated notes: 
{operators_examples}


EXAMPLES:

The examples below are references for a typical, singular Human and AI interaction that provides the correct answer to a Human question.

{specific_examples}

The AI should create a similar query based on the question from the user.

{explanation}

{format_instructions}

Current conversation:\n{history}\nHuman: {input}\nAI:
"""

    prompt_template = ChatPromptTemplate.from_template(
        template=template,
        partial_variables={
            "format_instructions": format_instructions,
            "fields": fields,
            "operators": operators,
            "fields_examples": fields_examples,
            "operators_examples": operators_examples,
            "specific_examples": specific_examples,  # could not pass this in directly to the PromptTemplate as I was getting a Pydantic ValidationError due to the brackets
            "explanation": explanation,
            "chain_of_thought": chain_of_thought,
            "multi_query_paragraph": multi_query_paragraph,
        },  # required to be either a string value or function that returns string values
        output_parser=output_parser,
    )

    return prompt_template

### Create Model

In [15]:
def create_model(
    include_explanation: bool = False, verbose: bool = False
) -> ConversationChain:
    prompt_template = create_prompt(include_explanation)

    llm = ChatOpenAI(temperature=0.0, openai_api_key=os.environ["OPENAI_API_KEY"])
    memory = ConversationBufferMemory()
    conversation = ConversationChain(
        llm=llm, memory=memory, prompt=prompt_template, verbose=verbose
    )

    return conversation

In [16]:
conversation = create_model()
conversation.predict(
    input='''json\n{\n\t"q": "author:\\"Huchra, John\\" year:1980-1990"\n}'''
)

'```json\n{\n\t"q": "author:\\"Huchra, John\\" year:1980-1990"\n}\n```'

# Few-shot Learning & Fields 