In [None]:
import os
from getpass import getpass

# Do not hardcode a key in the notebook. Reuse OPENAI_API_KEY when the
# environment already provides it; otherwise ask for it interactively so the
# key is not echoed and is not saved together with the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
from dokument import Dokument
from synthesis import Synthesis

import pickle
# Restore the pickled Synthesis object; it is expected to already carry .topic and .eligible_documents.
# NOTE: unpickling can execute code embedded in the file, so only load pickles you created yourself.
with open("synthesis_task.pkl", "rb") as pickle_file:
    synthesis_task = pickle.load(pickle_file)
print("Research topic to be synthesized: ", synthesis_task.topic)

In [None]:
# Prompt template for the coding step of the thematic analysis.
# {topic} is filled in below; {docs} is left as the placeholder for a document's text.
code_template = ("""
Consider the research topic:
{topic}

And the text input from a publication:
Text inputs:
{docs}

Please perform the following tasks of the coding step for a thematic analysis of the research topic:

Identify and Assign Codes:
For each segment of text that is relevant to the research topic, identify and assign a code.
Provide Code Details:
For each code, output the following information:

Code Name: A concise, descriptive name for the code.
Code Definition: A brief explanation of what the code represents.
Example: An illustrative quote or excerpt that exemplifies the code.

Example Output 1:
**Code Name**: Product Quality
**Code Definition**: Describes the perceived value and performance of a product.
**Example**: Many reviews highlighted the excellent build quality and durability of the product.

Example Output 2:
**Code Name**: User Feedback
**Code Definition**: Encompasses comments and suggestions provided by users to improve the product or service.
**Example**: Users suggested adding more customization options to enhance their experience.
""")

# The formatted text is parsed as a template a second time when the chat prompt is built,
# so literal braces in the topic are doubled first; otherwise they would be read as
# additional template variables.
escaped_topic = str(synthesis_task.topic).replace("{", "{{").replace("}", "}}")

# Insert the research topic and keep "{docs}" as the only remaining placeholder
code_template_formatted = code_template.format(topic=escaped_topic, docs="{docs}")

# Components needed to assemble the coding chain
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_openai import ChatOpenAI

# Chat prompt built from the topic-specific template; "{docs}" remains its placeholder
codes_prompt = ChatPromptTemplate.from_template(code_template_formatted)

# Language model configured with temperature 0
gpt4_model = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)

# Output parser applied to the model response
str_output_parser = StrOutputParser()

# Final chain for the coding step: prompt -> model -> output parser
codes_chain = codes_prompt | gpt4_model | str_output_parser

In [None]:
# Define an asynchronous function to process and assign codes using list of eligible documents input
async def codes_from_raw_data(dokument_list):
    """Run the coding chain on every document and store the result on it.

    Args:
        dokument_list: Sequence of documents exposing a ``raw_data`` attribute.
            Each document receives a ``codes`` attribute with the chain output.

    Returns:
        None. The documents are updated in place and their codes are printed.
    """
    # Name the prompt variable explicitly instead of relying on the prompt to
    # map a bare string onto its only input variable.
    chain_inputs = [{"docs": doc.raw_data} for doc in dokument_list]

    # Single batched asynchronous call; result i belongs to input i
    codes_list = await codes_chain.abatch(chain_inputs)

    # Save the coded data on each document and print it, numbering from 1
    for number, (doc, codes) in enumerate(zip(dokument_list, codes_list), start=1):
        doc.codes = codes
        print(f"Document {number}:")
        print("Codes:", doc.codes)
        print("-" * 30)  # Separator

# Usage: Run the function on the list of eligible documents.
# Top-level `await` works here because the notebook kernel already runs an event loop;
# the statement is not valid in a plain Python script.
await codes_from_raw_data(synthesis_task.eligible_documents)

In [None]:
# Prompt template for identifying themes from coded data that are relevant to the research topic.
# {topic} is filled in below; {docs} is left as the placeholder for one document's coded data.
themes_template = ("""
Consider the research topic: {topic}

You will receive input of coded data from a research document, which contains categorized segments of text.
For the process of a thematic analysis of the research topic,
identify overarching themes that encapsulate common patterns present across the coded segments. 
Only include themes that are relevant to the research topic. 

Coded data input:
{docs}

Example output 1:
**Theme Name**: Product Quality Perception

**Description**: Overall assessment of how customers perceive the quality of the product, 
encompassing both high quality and poor quality.

Example output 2:
**Theme Name**: Customer Experience

**Description**: Includes all aspects of the customer’s interaction with the product and service, 
such as satisfaction and dissatisfaction.
""")

# The formatted text is parsed as a template again by ChatPromptTemplate.from_template below,
# so literal braces in the topic are doubled first; otherwise they would be read as
# additional template variables.
escaped_topic = str(synthesis_task.topic).replace("{", "{{").replace("}", "}}")

# Insert the research topic and keep "{docs}" as the only remaining placeholder
themes_template_formatted = themes_template.format(topic=escaped_topic, docs="{docs}")

# Create a Prompt Template using the formatted template
themes_prompt = ChatPromptTemplate.from_template(themes_template_formatted)

# Final chain for identifying themes of the documents: prompt -> model -> output parser
themes_chain = themes_prompt | gpt4_model | str_output_parser

In [None]:
# Define asynchronous function to identify themes using coded data input
async def themes_from_codes(dokument_list):
    """Run the themes chain on every document's codes and store the result on it.

    Args:
        dokument_list: Sequence of documents exposing a ``codes`` attribute.
            Each document receives a ``themes`` attribute with the chain output.

    Returns:
        None. The documents are updated in place and their themes are printed.
    """
    # Name the prompt variable explicitly instead of relying on the prompt to
    # map a bare string onto its only input variable.
    chain_inputs = [{"docs": doc.codes} for doc in dokument_list]

    # Single batched asynchronous call; result i belongs to input i
    themes_list = await themes_chain.abatch(chain_inputs)

    # Save the themes on each document and print them, numbering from 1
    for number, (doc, themes) in enumerate(zip(dokument_list, themes_list), start=1):
        doc.themes = themes
        print(f"Document {number}:")
        print("Themes:", doc.themes)
        print("-" * 30)  # Separator

# Usage: Run the function on the list of eligible documents.
# Top-level `await` works here because the notebook kernel already runs an event loop;
# the statement is not valid in a plain Python script.
await themes_from_codes(synthesis_task.eligible_documents)

In [None]:
# Assemble the input text for the final synthesis chain: one numbered section per
# eligible document (numbering starts at 1), each closed by a 30-dash separator line
themes_output = "".join(
    f"Document {number}:\nThemes: {doc.themes}\n{'-' * 30}\n"
    for number, doc in enumerate(synthesis_task.eligible_documents, start=1)
)

# Token count of the assembled text as reported by the model object
num_tokens = gpt4_model.get_num_tokens(themes_output)

# 128000 is the token limit checked here: show the text when it fits, otherwise warn
if num_tokens <= 128000:
    print(themes_output)
else:
    print("The output is too long for this model.")

In [None]:
# Prompt template for synthesizing overarching themes based on the themes identified per document.
# {topic} is filled in below; {themes} is left as the placeholder for the collected themes text.
synthesis_template = ("""
Consider the research topic: {topic}

For the process of thematic analysis regarding the research topic,
You will receive the input: A list of themes derived from coded segments from various 
related research documents.
Review the list of identified themes and synthesize a final list of overarching themes that 
address the research topic. 
Keep track of the context, document your rationale, and 
include references to the document numbers where each theme is found.

List of themes input:
{themes}

""")

# The formatted text is parsed as a template again by ChatPromptTemplate.from_template below,
# so literal braces in the topic are doubled first; otherwise they would be read as
# additional template variables.
escaped_topic = str(synthesis_task.topic).replace("{", "{{").replace("}", "}}")

# Insert the research topic and keep "{themes}" as the only remaining placeholder
synthesis_template_formatted = synthesis_template.format(topic=escaped_topic, themes="{themes}")

# Create a Prompt Template using the formatted template
synthesis_prompt = ChatPromptTemplate.from_template(synthesis_template_formatted)

# Final chain for synthesizing the themes of the documents: prompt -> model -> output parser
synthesis_chain = synthesis_prompt | gpt4_model | str_output_parser

In [None]:
# Invoke the chain to synthesize a result based on the themes.
# The input is passed as a mapping that names the prompt variable, rather than
# as a bare string that the prompt would have to map onto its only variable.
synthesis_result = synthesis_chain.invoke({"themes": themes_output})
print(synthesis_result)

In [None]:
# Attach the synthesis to the task object, then persist the whole object
# under a new file name carrying the "_finished" suffix
synthesis_task.synthesis_result = synthesis_result
with open("synthesis_task_finished.pkl", "wb") as finished_file:
    pickle.dump(synthesis_task, finished_file)

Optional: Once `synthesis_task_finished.pkl` has been saved, the synthesis result, along with the codes and themes of the individual documents, can be viewed by running the notebook from this point onward, without having to re-run the preceding cells.

In [None]:
import pickle
from dokument import Dokument
from synthesis import Synthesis

# Reload the finished Synthesis object; .topic and .eligible_documents are read by the cells below.
# NOTE: unpickling can execute code embedded in the file, so only load pickles you created yourself.
with open("synthesis_task_finished.pkl", "rb") as finished_file:
    synthesis_task_result = pickle.load(finished_file)
print("Research topic is:", synthesis_task_result.topic)

In [None]:
# Zero-based position in eligible_documents of the document whose codes to view
index = 1
selected_document = synthesis_task_result.eligible_documents[index]

# The processing cells label documents starting at 1 ("Document {i+1}"), and those labels are
# what the synthesis input refers to, so the same 1-based number is reported here.
print("Codes of Document", index + 1, "with DOI", selected_document.DOI, ":")
print(selected_document.codes)

In [None]:
# Zero-based position in eligible_documents of the document whose themes to view
index = 0
selected_document = synthesis_task_result.eligible_documents[index]

# The processing cells label documents starting at 1 ("Document {i+1}"), and those labels are
# what the synthesis input refers to, so the same 1-based number is reported here.
print("Themes of Document", index + 1, "with DOI", selected_document.DOI, ":")
print(selected_document.themes)

In [None]:
# Headline stating how many documents were processed, followed by the stored synthesis text
document_count = len(synthesis_task_result.eligible_documents)
print(f"Synthesis result from processing {document_count} documents:")
print(synthesis_task_result.synthesis_result)