In [None]:
import os
# Keep a key that is already exported in the environment; the placeholder below is only a
# fallback and must be replaced (or the variable exported) before any OpenAI call is made.
# Do not commit a real key inside the notebook.
os.environ.setdefault("OPENAI_API_KEY", "Your_GPT_key_here")

In [None]:
from dokument import Dokument
from synthesis import Synthesis

import pickle
# Restore the pickled Synthesis object; its .topic and .eligible_documents attributes are used below.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open("synthesis_task.pkl", "rb") as pkl_file:
    synthesis_task = pickle.load(pkl_file)
print("Research topic to be synthesized: ", synthesis_task.topic)

In [None]:
# Prompt template for the coding step of the thematic analysis.
# Placeholders: {topic} is the research topic, {docs} is the text of one publication.
# Both few-shot examples use the same plain "**Field**: value" layout (no stray quotes),
# so the model sees one consistent output format.
code_template = ("""
Consider the research topic:
{topic}

And the text input from a publication:
Text inputs:
{docs}

Please perform the following tasks of the coding step for a thematic analysis of the research topic:

Identify and Assign Codes:
For each segment of text that is relevant to the research topic, identify and assign a code.
Provide Code Details:
For each code, output the following information:

Code Name: A concise, descriptive name for the code.
Code Definition: A brief explanation of what the code represents.
Example: A quote or excerpt that exemplifies the code name and definition.

Example Output 1:
**Code Name**: Product Quality
**Code Definition**: Describes the perceived value and performance of a product.
**Example**: Many reviews highlighted the excellent build quality and durability of the product.

Example Output 2:
**Code Name**: User Feedback
**Code Definition**: Encompasses comments and suggestions provided by users to improve the product or service.
**Example**: Users suggested adding more customization options to enhance their experience.
""")

# Escape literal braces in the topic: the formatted string is parsed again below as an
# f-string style template, where an unescaped "{" or "}" would be read as an input variable.
_code_topic = str(synthesis_task.topic).replace("{", "{{").replace("}", "}}")

# Fill in the research topic now and keep "{docs}" as the only remaining template variable
code_template_formatted = code_template.format(topic=_code_topic, docs="{docs}")

# Create a Prompt Template using the formatted code template
from langchain.prompts import ChatPromptTemplate
codes_prompt = ChatPromptTemplate.from_template(code_template_formatted)

# Define the language model with 0 temperature
from langchain_openai import ChatOpenAI
gpt4_model = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")

# Define output parser to handle output
from langchain.schema.output_parser import StrOutputParser
str_output_parser = StrOutputParser()

# Final chain for coding a document: prompt -> model -> output parser
codes_chain = codes_prompt | gpt4_model | str_output_parser

In [None]:
# Asynchronously code every document of a list with the codes chain
async def codes_from_raw_data(dokument_list):
    """Run the codes chain on each document's raw text and store the result on the document.

    Args:
        dokument_list: Documents exposing a ``raw_data`` attribute. Each one gets a
            ``codes`` attribute set to the chain output for its text.

    Returns:
        None. The results are attached to the documents and printed.
    """
    # A single batched chain call, fed with the raw text of every document in list order
    codes_list = await codes_chain.abatch([dokument.raw_data for dokument in dokument_list])

    # Walk the documents with a 1-based counter; results are matched to documents by position
    for number, dokument in enumerate(dokument_list, start=1):
        dokument.codes = codes_list[number - 1]
        print(f"Document {number}:")
        print("Codes:", dokument.codes)
        print("-" * 30)  # Separator

# Usage: code every eligible document (top-level await works inside a notebook cell).
# The chain output ends up on each document's .codes attribute and is printed per document.
await codes_from_raw_data(synthesis_task.eligible_documents)

In [None]:
# Assemble the single string that is fed to the theme-identification step: one numbered
# section per eligible document (numbering starts at 1), each closed by a separator line
full_codes_output = "".join(
    f"Document {number}:\n" + f"Codes: {dokument.codes}\n" + "-" * 30 + "\n"
    for number, dokument in enumerate(synthesis_task.eligible_documents, start=1)
)

# Measure the assembled codes in tokens, as counted by the model wrapper
num_tokens = gpt4_model.get_num_tokens(full_codes_output)

# Show the codes when they fit within the 128000-token limit, otherwise print a debug message
if num_tokens <= 128000:
    print(full_codes_output)
else:
    print("The output is too long for this model.")

In [None]:
# Prompt template for identifying themes from the coded data of all documents.
# Placeholders: {topic} is the research topic, {docs} is the combined coded data.
# Each few-shot example shows all three fields (Theme, Description, Source), so the
# model is not taught to leave the description empty.
themes_template = ("""
You will receive input of all coded data from a list of research documents, which contains coded 
segments of text, and a research topic. For the process of a thematic analysis of the research topic,
identify overarching themes that encapsulate common patterns present across the coded segments related to
the research topic, properly describe the theme as a synthesis of the data that formed the theme and
keep track of the source of the theme.

Refer to the examples to form your answers
Example research topic: How do customers feel about various aspects of our restaurant chain?

Example output 1:
**Theme 1**: Customer has mixed feelings about the quality of the product
**Description**: Product quality is unclear, people seem to have mixed opinion which is something we really
should prioritize improving
**Source**: Document 1,5,6,12,15,30....

Example output 2:
**Theme 2**: Price is good, service is bad
**Description**: Customers consider the prices fair but are frequently dissatisfied with the service they receive
**Source**: Document 2,5,10,25,30...

** Example ends here ** 
You will now receive the research topic and the coded data input:
Consider the research topic: {topic}
Coded data input:
{docs}


""")

# Escape literal braces in the topic: the formatted string is parsed again below as an
# f-string style template, where an unescaped "{" or "}" would be read as an input variable.
_themes_topic = str(synthesis_task.topic).replace("{", "{{").replace("}", "}}")

# Fill in the research topic now and keep "{docs}" as the placeholder for the coded data input
themes_template_formatted = themes_template.format(topic=_themes_topic, docs="{docs}")

# Create a Prompt Template using the formatted template
themes_prompt = ChatPromptTemplate.from_template(themes_template_formatted)

# Final chain for identifying themes of the documents: prompt -> model -> output parser
themes_chain = themes_prompt | gpt4_model | str_output_parser

In [None]:
# Invoke the themes chain on the combined codes string and keep the returned themes
# on the Synthesis object, so they are included when the object is pickled afterwards
synthesis_task.themes = themes_chain.invoke(full_codes_output)
print(synthesis_task.themes)

In [None]:
# Persist the Synthesis object (now carrying the codes and themes) to a separate
# pickle whose file name has "_finished" appended, leaving the input file untouched
with open("synthesis_task_finished.pkl", "wb") as finished_file:
    pickle.dump(synthesis_task, finished_file)

Optional: The codes of individual documents and the list of themes can now be viewed by running the notebook from this point onward, without having to re-run it from the beginning.

In [None]:
import pickle
from dokument import Dokument
from synthesis import Synthesis

# Reload the finished Synthesis object; the cells below read its .topic,
# .eligible_documents (with their codes) and .themes attributes.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open("synthesis_task_finished.pkl", "rb") as finished_file:
    synthesis_task_result = pickle.load(finished_file)
print("Research topic is:", synthesis_task_result.topic)

In [None]:
# Zero-based position in eligible_documents of the document whose codes to view
index = 20
eligible_documents = synthesis_task_result.eligible_documents
selected_document = eligible_documents[index]  # raises IndexError when index is out of range
# The coding output and the theme sources number documents from 1 ("Document {i+1}"),
# so report that same number here; the modulo maps negative indices to their 1-based position
document_number = index % len(eligible_documents) + 1
print("Codes of Document", document_number, "with DOI", selected_document.DOI, ":")
print(selected_document.codes)

In [None]:
# The themes were synthesized once from the codes of all eligible documents, so they do
# not belong to a single document or DOI; label them accordingly
print("Themes synthesized from all eligible documents:")
print(synthesis_task_result.themes)