In [None]:
pip install faiss-cpu

In [None]:
pip install pypdf

In [None]:
pip install dotenv

In [None]:
# Mount Google Drive inside the Colab VM (this cell only works when run in Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# Switch to the project folder on Drive: the relative paths used further down
# ("./RAG documents", the JSON prediction/result files) are resolved from here.
%cd /content/drive/MyDrive/"Master semester project"

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `utils` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import faiss
import pypdf
import json
import os
import numpy as np
import matplotlib.pyplot as plt

from openai import OpenAI
from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download
from io import BytesIO
from utils import ChunkLoader, RAGEncoder, retrieve_relevant_chunks, get_all_chunks, show_chunks_statistics, get_index_and_chunks, get_formatted_question_gpt_3_5_Turbo_gpt_4o, get_formatted_instruction_gpt_3_5_Turbo_gpt_4o, get_formatted_prompt_gpt_2, generate_answer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, CrossEncoder

In [None]:
# Prefer the GPU when CUDA is available; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Running on: ", device)

# Read the API credentials from the environment; load_dotenv() first merges in
# the variables defined in a local .env file.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

In [None]:
# Chunking parameters handed to get_all_chunks together with the encoder's tokenizer
# (presumably measured in tokens - TODO confirm in utils). Overlap is 20% of the chunk size.
MAX_CHUNK_SIZE = 256
OVERLAP = int(MAX_CHUNK_SIZE * 0.2)

# Alternative set-up (disabled): separate facebook DPR encoders for chunks and questions.
# chunk_encoder_tokenizer = AutoTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# chunk_encoder_model = AutoModel.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# chunk_encoder = RAGEncoder(chunk_encoder_tokenizer, chunk_encoder_model, device)
# question_encoder_tokenizer = AutoTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# question_encoder_model = AutoModel.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# question_encoder = RAGEncoder(question_encoder_tokenizer, question_encoder_model, device)

# Active set-up: one sentence-transformer model embeds both the chunks and the questions.
# (Plain-transformers loading of the same checkpoint, disabled:)
# rag_encoder_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# rag_encoder_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
rag_encoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
rag_encoder = RAGEncoder(rag_encoder_model.tokenizer, rag_encoder_model, device)

# Both roles share the same encoder instance.
chunk_encoder = question_encoder = rag_encoder

# RAG preparation: chunking, embedding and uploading to HF hub

In [None]:
# Root folder of the RAG corpus. It is split into sub-folders, one per time span;
# each sub-folder is chunked, embedded and uploaded separately by the function below.
PARENT_FOLDER = "./RAG documents"

def rag_chunk_embed_and_upload(parent_folder, rag_encoder, max_chunk_size, overlap, prefix, hugging_face_write_token, show_statistics, repo_id="ziedM/rag_dataset"):
  """Chunk, embed and upload every sub-folder of ``parent_folder`` to a HF dataset repo.

  For each sub-folder two files are uploaded:
    * ``<prefix>_chunks_<sub_folder>.jsonl``   - one ``{"chunk": ...}`` JSON object per line
    * ``<prefix>_embeddings_<sub_folder>.npy`` - the stacked chunk vectors saved with ``np.save``

  Args:
    parent_folder: path of the folder whose sub-folders hold the RAG documents.
    rag_encoder: object exposing ``tokenizer`` and ``encode_text(chunk)``; the
      returned vectors must be stackable with ``np.vstack`` (exact shape is
      defined in utils - TODO confirm).
    max_chunk_size: forwarded to ``get_all_chunks``.
    overlap: forwarded to ``get_all_chunks``.
    prefix: file-name prefix identifying the encoder used.
    hugging_face_write_token: Hugging Face token with write access to ``repo_id``.
    show_statistics: when truthy, call ``show_chunks_statistics`` for every sub-folder.
    repo_id: target dataset repository (defaults to the repository used by this project).

  Entries of ``parent_folder`` that are not directories are ignored, and a
  sub-folder that yields no chunks is skipped before anything is uploaded.
  """
  api = HfApi(token=hugging_face_write_token)

  # os.listdir also returns stray files; only directories are corpus sub-folders.
  # Sorting makes the processing order deterministic.
  sub_folders = sorted(
      name for name in os.listdir(parent_folder)
      if os.path.isdir(os.path.join(parent_folder, name))
  )

  for sub_folder in tqdm(sub_folders, desc="reading parent folder " + parent_folder + " ..."):
    # chunk the documents of the current sub-folder
    all_chunks = get_all_chunks(os.path.join(parent_folder, sub_folder), rag_encoder.tokenizer, max_chunk_size, overlap)

    # np.vstack raises on an empty list, so bail out before uploading a chunk
    # file that would have no matching embeddings.
    if len(all_chunks) == 0:
      print("skipping " + sub_folder + ": no chunks were produced")
      continue

    if show_statistics:
      show_chunks_statistics(all_chunks, rag_encoder.tokenizer)

    # Serialize the chunks as JSON lines in memory (BytesIO avoids a local temp file).
    all_chunks_jsonl_content = "\n".join(json.dumps({"chunk": chunk}) for chunk in all_chunks)
    all_chunks_jsonl_file = BytesIO(all_chunks_jsonl_content.encode("utf-8"))

    # Embed the chunks and serialize the matrix in memory as well. NumPy format is
    # used because the vectors are later loaded into a faiss index.
    all_vectors_npy_content = np.vstack([rag_encoder.encode_text(chunk) for chunk in tqdm(all_chunks, desc="vectorizing the chunks ...")])
    all_vectors_npy_file = BytesIO()
    np.save(all_vectors_npy_file, all_vectors_npy_content)
    all_vectors_npy_file.seek(0)  # rewind so the upload reads from the start of the stream

    # Upload only once both payloads exist, so a failure while embedding cannot
    # leave the repository with chunks that have no embeddings.
    api.upload_file(
        path_or_fileobj=all_chunks_jsonl_file,
        path_in_repo=prefix + "_chunks_" + sub_folder + ".jsonl",
        repo_id=repo_id,
        repo_type="dataset",
    )
    api.upload_file(
        path_or_fileobj=all_vectors_npy_file,
        path_in_repo=prefix + "_embeddings_" + sub_folder + ".npy",
        repo_id=repo_id,
        repo_type="dataset",
    )

# Build and upload the sentence-transformer variant of the RAG dataset.
rag_chunk_embed_and_upload(
    parent_folder=PARENT_FOLDER,
    rag_encoder=chunk_encoder,
    max_chunk_size=MAX_CHUNK_SIZE,
    overlap=OVERLAP,
    prefix="sentence_transformer",
    hugging_face_write_token=HUGGING_FACE_TOKEN,
    show_statistics=True,
)

# Retrieval Augmented Generation Setup

In [None]:
# Three generators are used: GPT-2 (run locally), GPT-3.5 Turbo and GPT-4o (OpenAI API).

# GPT-2 XL checkpoint from the Hugging Face hub (knowledge cutoff: November 2019).
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-xl")
model = model.to(device)

# GPT-3.5 Turbo (knowledge cutoff: September 2021) and GPT-4o (knowledge cutoff:
# October 2023) are not publicly downloadable, so they are queried through
# OpenAI's platform with this client.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Number of retrieved chunks placed in the context of each model:
# first = GPT-2, second = GPT-3.5 Turbo, third = GPT-4o.
TOP_K_FIRST_MODEL = 3
TOP_K_SECOND_MODEL = 5
TOP_K_THIRD_MODEL = 7
# Folder receiving the results_idea*.json files written by the pipeline cells.
RESULT_DIR = "result_sentence_transformer" # alternative: result_facebook_dpr

# Persona prepended to the instructions.
role = "You are an expert in deep technologies."
# Instructions without retrieved context (no RAG): 1_0 is the plain form used in
# the GPT-2 prompt, 1_1 adds a length limit and is used for the chat models.
instruction_1_0 = "Answer the question below."
instruction_1_1 = "Answer the question below. Keep your response concise (maximum 15 lines)."
# Assessment ("critical thinking") instruction. The two placeholders are filled
# with the target year and the prediction to assess.
# Earlier, disabled wording of this instruction:
# instruction_1_2 = """Based on your current knowledge, provide a short analysis of how accurate
# the following prediction about deep technologies in {} is, citing
# relevant developments if possible. Specifically, assess whether the
# technologies mentioned gained significant adoption or traction, were
# delayed, overestimated, or abandoned.
#
# Prediction: {}
#
# Assessment: """
instruction_1_2 = """Based on your current knowledge, assess whether the
technologies mentioned in the following prediction are ranked correctly:
from most to least likely to become emergent in the year {}.

Prediction: {}

Assessment: """
# Instructions with retrieved context (RAG): 2_0 is used in the GPT-2 prompt,
# 2_1 adds a length limit and is used for the chat models.
instruction_2_0 = "Using the provided context, answer the question below."
instruction_2_1 = "Using the provided context, answer the question below. Keep your response concise (maximum 15 lines)."

# Question asked to GPT-2 (target year 2021).
# Disabled open-ended variant ("top 7" instead of ranking a fixed list):
# question_gpt_2 = """What are the top 7 deep technologies that are likely to be the
# most impactful in 2021? Rank from most to least likely to become impactful."""
question_gpt_2 = """Rank the following 7 deep technologies in descending order of emergence:
from most to least likely to become impactful in 2021. The 7 deep technologies are:
AI, Quantum Computing, AR & VR, Robotics, Biotech, New Materials, Electronics & Photonics."""
# Question template shared by GPT-3.5 Turbo and GPT-4o; the placeholder is the target year.
# Disabled open-ended variant:
# question_gpt_3_5_turbo_gpt_4o = """What are the top 7 deep technologies that are likely to be the
# most impactful in {}? Rank from most to least likely to become impactful.
# For each: name the tech and briefly state why it's emerging."""
question_gpt_3_5_turbo_gpt_4o = """Rank the following 7 deep technologies in descending order of emergence:
from most to least likely to become impactful in {}. The 7 deep technologies are:
AI, Quantum Computing, AR & VR, Robotics, Biotech, New Materials, Electronics & Photonics.
For each: briefly justify your chosen ranking."""
# GPT-3.5 Turbo is asked about 2023 and GPT-4o about 2025.
question_gpt_3_5_turbo = question_gpt_3_5_turbo_gpt_4o.format(2023)
question_gpt_4o = question_gpt_3_5_turbo_gpt_4o.format(2025)

# Getting GPT-2 RAG contexts and predictions
### Generating GPT-2 predictions with the HF model is slow on a CPU, so we use Colab's T4 GPU for faster generation. The resulting predictions are saved to a JSON file (`gpt-2-predictions-subapproach*.json`).
### However, to obtain the contexts (when doing RAG), we have to use faiss-cpu, as Colab doesn't support faiss-gpu. The contexts were therefore computed first on the CPU, and the predictions were then generated on the GPU. The contexts are saved to a JSON file (`gpt-2-contexts-subapproach*.json`). This is why this step is done separately from the pipelines in ideas 0 to 4.

In [None]:
# Variant A (disabled) - context for RAG without reranking:
# index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2020.jsonl", "sentence_transformer_embeddings_past_2020.npy")
# relevant_chunks = retrieve_relevant_chunks(question_gpt_2, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
# context = "\n".join(relevant_chunks)

# Variant B (active) - context for RAG with reranking: fetch 25 candidates, score
# every (question, chunk) pair with a cross-encoder and keep the best TOP_K_FIRST_MODEL.
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
index, chunks = get_index_and_chunks(
    "sentence_transformer_chunks_past_2020.jsonl",
    "sentence_transformer_embeddings_past_2020.npy",
)
relevant_chunks = retrieve_relevant_chunks(
    question_gpt_2, index, chunks, question_encoder, topk=25, normalize=True
)
question_chunk_pairs = [(question_gpt_2, chunk) for chunk in relevant_chunks]
scores = cross_encoder_model.predict(question_chunk_pairs)
# argsort is ascending, so reverse it to put the highest-scoring chunks first
best_first = np.argsort(scores)[::-1]
context = "\n".join(np.array(relevant_chunks)[best_first][:TOP_K_FIRST_MODEL])

In [None]:
# Inspect the context produced by the retrieval cell before snapshotting it.
print(context)

In [None]:
# Snapshot of `context` for the 2019-2020 corpus without reranking.
# NOTE(review): this is only true if the retrieval cell was last run with the
# matching files and variant; on a straight "Run All" every context snapshot
# receives the same value.
context_gpt_2_2019_2020 = context

In [None]:
# Snapshot of `context` for the past-2020 corpus without reranking.
# NOTE(review): this is only true if the retrieval cell was last run with the
# matching files and variant; on a straight "Run All" every context snapshot
# receives the same value.
context_gpt_2_past_2020 = context

In [None]:
# Snapshot of `context` for the 2019-2020 corpus with cross-encoder reranking.
# NOTE(review): this is only true if the retrieval cell was last run with the
# matching files and variant; on a straight "Run All" every context snapshot
# receives the same value.
context_gpt_2_2019_2020_reranked = context

In [None]:
# Snapshot of `context` for the past-2020 corpus with cross-encoder reranking
# (the configuration the retrieval cell is currently written for).
context_gpt_2_past_2020_reranked = context

In [None]:
# Collect the four context snapshots under their variable names ...
contexts = dict(
    context_gpt_2_2019_2020=context_gpt_2_2019_2020,
    context_gpt_2_past_2020=context_gpt_2_past_2020,
    context_gpt_2_2019_2020_reranked=context_gpt_2_2019_2020_reranked,
    context_gpt_2_past_2020_reranked=context_gpt_2_past_2020_reranked,
)

# ... and persist them so the GPU session can reuse them without faiss.
with open("gpt-2-contexts-subapproach2.json", "w") as f:
    f.write(json.dumps(contexts))

In [None]:
# Reload the saved contexts and pick the one used for the next GPT-2 generation.
with open("gpt-2-contexts-subapproach2.json", "r") as f:
    context_data = json.loads(f.read())

context = context_data["context_gpt_2_past_2020_reranked"]

In [None]:
# Variant without RAG (disabled):
# prompt_1 = get_formatted_prompt_gpt_2(role + " " + instruction_1_0, context=None, question=question_gpt_2, RAG=False)
# prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

# Variant with RAG (active): the retrieved context is embedded in the GPT-2 prompt.
gpt_2_instruction = role + " " + instruction_2_0
prompt_1 = get_formatted_prompt_gpt_2(gpt_2_instruction, context, question_gpt_2, RAG=True)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# Keep only the text following the first "Answer: " marker of the generated output.
answer_start = prediction_1.index("Answer: ") + len("Answer: ")
prediction_1 = prediction_1[answer_start:].strip(' ')

In [None]:
# Inspect the GPT-2 answer before snapshotting it.
print(prediction_1)

In [None]:
# Snapshot of `prediction_1` as the no-RAG GPT-2 result.
# NOTE(review): this is only true if the generation cell was last run with the
# no-RAG variant (disabled as written); on a straight "Run All" every prediction
# snapshot receives the same value.
prediction_1_gpt_2_no_rag = prediction_1

In [None]:
# Snapshot of `prediction_1` generated with the 2019-2020 context (no reranking).
# NOTE(review): this is only true if the generation cell was last run with that
# context; on a straight "Run All" every prediction snapshot receives the same value.
prediction_1_gpt_2_rag_2019_2020 = prediction_1

In [None]:
# Snapshot of `prediction_1` generated with the past-2020 context (no reranking).
# NOTE(review): this is only true if the generation cell was last run with that
# context; on a straight "Run All" every prediction snapshot receives the same value.
prediction_1_gpt_2_rag_past_2020 = prediction_1

In [None]:
# Snapshot of `prediction_1` generated with the reranked 2019-2020 context.
# NOTE(review): this is only true if the generation cell was last run with that
# context; on a straight "Run All" every prediction snapshot receives the same value.
prediction_1_gpt_2_rag_2019_2020_reranked = prediction_1

In [None]:
# Snapshot of `prediction_1` generated with the reranked past-2020 context
# (the context the load cell above currently selects).
prediction_1_gpt_2_rag_past_2020_reranked = prediction_1

In [None]:
# Collect the five GPT-2 prediction snapshots under their variable names ...
predictions = dict(
    prediction_1_gpt_2_no_rag=prediction_1_gpt_2_no_rag,
    prediction_1_gpt_2_rag_2019_2020=prediction_1_gpt_2_rag_2019_2020,
    prediction_1_gpt_2_rag_past_2020=prediction_1_gpt_2_rag_past_2020,
    prediction_1_gpt_2_rag_2019_2020_reranked=prediction_1_gpt_2_rag_2019_2020_reranked,
    prediction_1_gpt_2_rag_past_2020_reranked=prediction_1_gpt_2_rag_past_2020_reranked,
)

# ... and persist them for the idea pipelines.
with open("gpt-2-predictions-subapproach2.json", "w") as f:
    f.write(json.dumps(predictions))

# Idea 0

In [None]:
# Idea 0: no retrieval at any stage.
#   prompt 1 -> GPT-2         : prediction 1 (pre-computed, loaded from JSON below)
#   prompt 2 -> GPT-3.5 Turbo : assessment 1 (of prediction 1)
#   prompt 3 -> GPT-3.5 Turbo : prediction 2
#   prompt 4 -> GPT-4o        : assessment 2 (of prediction 2)
#   prompt 5 -> GPT-4o        : prediction 3

with open("gpt-2-predictions-subapproach1.json", "r") as f:
    gpt_2_predictions = json.load(f)

prediction_1 = gpt_2_predictions["prediction_1_gpt_2_no_rag"]

# Opening system message, identical for every request of this cell (built once).
system_message = {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o(role, instruction_1_1, None, False)}
# Second system message, inserted just before the prediction question.
followup_system_message = {"role": "system", "content": instruction_1_1 + "\n\n"}

# --- GPT-3.5 Turbo: assess the GPT-2 prediction for 2021, then predict 2023 ---
prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1_messages = [system_message, {"role": "user", "content": prompt_2}]
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages,
).choices[0].message.content

prompt_3 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_3_5_turbo)
# The assessment exchange is replayed as conversation history before the question.
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages + [
        {"role": "assistant", "content": assessment_1},
        followup_system_message,
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

# --- GPT-4o: assess the GPT-3.5 Turbo prediction for 2023, then predict 2025 ---
prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2_messages = [system_message, {"role": "user", "content": prompt_4}]
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages,
).choices[0].message.content

prompt_5 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_4o)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages + [
        {"role": "assistant", "content": assessment_2},
        followup_system_message,
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

# Create the output folder if needed so the API results above are not lost
# to a FileNotFoundError on a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)
with open(RESULT_DIR+"/"+"results_idea0_subapproach1.json", "w") as f:
    json.dump(result, f)

# Idea 1


In [None]:
# Idea 1: RAG without reranking, each model retrieving from its own period's corpus
# (GPT-2: 2019_2020 prediction loaded from JSON, GPT-3.5 Turbo: 2021_2022, GPT-4o: 2023_2024).
#   prompt 1 -> GPT-2         : prediction 1 (pre-computed, loaded from JSON below)
#   prompt 2 -> GPT-3.5 Turbo : assessment 1 (of prediction 1)
#   prompt 3 -> GPT-3.5 Turbo : prediction 2
#   prompt 4 -> GPT-4o        : assessment 2 (of prediction 2)
#   prompt 5 -> GPT-4o        : prediction 3

with open("gpt-2-predictions-subapproach1.json", "r") as f:
    gpt_2_predictions = json.load(f)

prediction_1 = gpt_2_predictions["prediction_1_gpt_2_rag_2019_2020"]

# Opening system message, identical for every request of this cell (built once).
system_message = {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o(role, instruction_1_1, None, False)}

# --- GPT-3.5 Turbo: assess the GPT-2 prediction for 2021, then predict 2023 ---
prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1_messages = [system_message, {"role": "user", "content": prompt_2}]
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2021_2022.jsonl", "sentence_transformer_embeddings_2021_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_3_5_turbo, index, chunks, question_encoder, topk=TOP_K_SECOND_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_3_5_turbo)
# The assessment exchange is replayed as history; the retrieved context is
# delivered through a second system message just before the question.
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages + [
        {"role": "assistant", "content": assessment_1},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

# --- GPT-4o: assess the GPT-3.5 Turbo prediction for 2023, then predict 2025 ---
prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2_messages = [system_message, {"role": "user", "content": prompt_4}]
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2023_2024.jsonl", "sentence_transformer_embeddings_2023_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_4o, index, chunks, question_encoder, topk=TOP_K_THIRD_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_4o)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages + [
        {"role": "assistant", "content": assessment_2},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

# Create the output folder if needed so the API results above are not lost
# to a FileNotFoundError on a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)
with open(RESULT_DIR+"/"+"results_idea1_subapproach1.json", "w") as f:
    json.dump(result, f)

# Idea 2

In [None]:
# Idea 2: RAG without reranking, each model retrieving from a "past_<year>" corpus
# (GPT-2: past_2020 prediction loaded from JSON, GPT-3.5 Turbo: past_2022, GPT-4o: past_2024).
#   prompt 1 -> GPT-2         : prediction 1 (pre-computed, loaded from JSON below)
#   prompt 2 -> GPT-3.5 Turbo : assessment 1 (of prediction 1)
#   prompt 3 -> GPT-3.5 Turbo : prediction 2
#   prompt 4 -> GPT-4o        : assessment 2 (of prediction 2)
#   prompt 5 -> GPT-4o        : prediction 3

with open("gpt-2-predictions-subapproach1.json", "r") as f:
    gpt_2_predictions = json.load(f)

prediction_1 = gpt_2_predictions["prediction_1_gpt_2_rag_past_2020"]

# Opening system message, identical for every request of this cell (built once).
system_message = {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o(role, instruction_1_1, None, False)}

# --- GPT-3.5 Turbo: assess the GPT-2 prediction for 2021, then predict 2023 ---
prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1_messages = [system_message, {"role": "user", "content": prompt_2}]
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2022.jsonl", "sentence_transformer_embeddings_past_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_3_5_turbo, index, chunks, question_encoder, topk=TOP_K_SECOND_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_3_5_turbo)
# The assessment exchange is replayed as history; the retrieved context is
# delivered through a second system message just before the question.
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages + [
        {"role": "assistant", "content": assessment_1},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

# --- GPT-4o: assess the GPT-3.5 Turbo prediction for 2023, then predict 2025 ---
prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2_messages = [system_message, {"role": "user", "content": prompt_4}]
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2024.jsonl", "sentence_transformer_embeddings_past_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_4o, index, chunks, question_encoder, topk=TOP_K_THIRD_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_4o)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages + [
        {"role": "assistant", "content": assessment_2},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

# Create the output folder if needed so the API results above are not lost
# to a FileNotFoundError on a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)
with open(RESULT_DIR+"/"+"results_idea2_subapproach1.json", "w") as f:
    json.dump(result, f)

# Idea 3

In [None]:
# Idea 3: RAG with cross-encoder reranking, each model retrieving from its own period's corpus
# (GPT-2: reranked 2019_2020 prediction loaded from JSON, GPT-3.5 Turbo: 2021_2022, GPT-4o: 2023_2024).
#   prompt 1 -> GPT-2         : prediction 1 (pre-computed, loaded from JSON below)
#   prompt 2 -> GPT-3.5 Turbo : assessment 1 (of prediction 1)
#   prompt 3 -> GPT-3.5 Turbo : prediction 2
#   prompt 4 -> GPT-4o        : assessment 2 (of prediction 2)
#   prompt 5 -> GPT-4o        : prediction 3

with open("gpt-2-predictions-subapproach1.json", "r") as f:
    gpt_2_predictions = json.load(f)

prediction_1 = gpt_2_predictions["prediction_1_gpt_2_rag_2019_2020_reranked"]

# Opening system message, identical for every request of this cell (built once).
system_message = {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o(role, instruction_1_1, None, False)}

# --- GPT-3.5 Turbo: assess the GPT-2 prediction for 2021, then predict 2023 ---
prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1_messages = [system_message, {"role": "user", "content": prompt_2}]
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages,
).choices[0].message.content

# The cross-encoder is loaded once and reused for both reranking passes of this cell.
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2021_2022.jsonl", "sentence_transformer_embeddings_2021_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_3_5_turbo, index, chunks, question_encoder, topk=25, normalize=True)
scores = cross_encoder_model.predict([(question_gpt_3_5_turbo, chunk) for chunk in relevant_chunks])
# Rerank: argsort is ascending, so reverse it and keep the TOP_K_SECOND_MODEL best-scoring chunks.
context = "\n".join(np.array(relevant_chunks)[np.argsort(scores)[::-1]][:TOP_K_SECOND_MODEL])
prompt_3 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_3_5_turbo)
# The assessment exchange is replayed as history; the retrieved context is
# delivered through a second system message just before the question.
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages + [
        {"role": "assistant", "content": assessment_1},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

# --- GPT-4o: assess the GPT-3.5 Turbo prediction for 2023, then predict 2025 ---
prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2_messages = [system_message, {"role": "user", "content": prompt_4}]
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2023_2024.jsonl", "sentence_transformer_embeddings_2023_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_4o, index, chunks, question_encoder, topk=25, normalize=True)
scores = cross_encoder_model.predict([(question_gpt_4o, chunk) for chunk in relevant_chunks])
# Rerank: keep the TOP_K_THIRD_MODEL best-scoring chunks, best first.
context = "\n".join(np.array(relevant_chunks)[np.argsort(scores)[::-1]][:TOP_K_THIRD_MODEL])
prompt_5 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_4o)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages + [
        {"role": "assistant", "content": assessment_2},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

# Create the output folder if needed so the API results above are not lost
# to a FileNotFoundError on a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)
with open(RESULT_DIR+"/"+"results_idea3_subapproach1.json", "w") as f:
    json.dump(result, f)

# Idea 4

In [None]:
# Idea 4: RAG with cross-encoder reranking, each model retrieving from a "past_<year>" corpus
# (GPT-2: reranked past_2020 prediction loaded from JSON, GPT-3.5 Turbo: past_2022, GPT-4o: past_2024).
#   prompt 1 -> GPT-2         : prediction 1 (pre-computed, loaded from JSON below)
#   prompt 2 -> GPT-3.5 Turbo : assessment 1 (of prediction 1)
#   prompt 3 -> GPT-3.5 Turbo : prediction 2
#   prompt 4 -> GPT-4o        : assessment 2 (of prediction 2)
#   prompt 5 -> GPT-4o        : prediction 3

with open("gpt-2-predictions-subapproach1.json", "r") as f:
    gpt_2_predictions = json.load(f)

prediction_1 = gpt_2_predictions["prediction_1_gpt_2_rag_past_2020_reranked"]

# Opening system message, identical for every request of this cell (built once).
system_message = {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o(role, instruction_1_1, None, False)}

# --- GPT-3.5 Turbo: assess the GPT-2 prediction for 2021, then predict 2023 ---
prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1_messages = [system_message, {"role": "user", "content": prompt_2}]
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages,
).choices[0].message.content

# The cross-encoder is loaded once and reused for both reranking passes of this cell.
cross_encoder_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2022.jsonl", "sentence_transformer_embeddings_past_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_3_5_turbo, index, chunks, question_encoder, topk=25, normalize=True)
scores = cross_encoder_model.predict([(question_gpt_3_5_turbo, chunk) for chunk in relevant_chunks])
# Rerank: argsort is ascending, so reverse it and keep the TOP_K_SECOND_MODEL best-scoring chunks.
context = "\n".join(np.array(relevant_chunks)[np.argsort(scores)[::-1]][:TOP_K_SECOND_MODEL])
prompt_3 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_3_5_turbo)
# The assessment exchange is replayed as history; the retrieved context is
# delivered through a second system message just before the question.
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=assessment_1_messages + [
        {"role": "assistant", "content": assessment_1},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

# --- GPT-4o: assess the GPT-3.5 Turbo prediction for 2023, then predict 2025 ---
prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2_messages = [system_message, {"role": "user", "content": prompt_4}]
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages,
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2024.jsonl", "sentence_transformer_embeddings_past_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_gpt_4o, index, chunks, question_encoder, topk=25, normalize=True)
scores = cross_encoder_model.predict([(question_gpt_4o, chunk) for chunk in relevant_chunks])
# Rerank: keep the TOP_K_THIRD_MODEL best-scoring chunks, best first.
context = "\n".join(np.array(relevant_chunks)[np.argsort(scores)[::-1]][:TOP_K_THIRD_MODEL])
prompt_5 = get_formatted_question_gpt_3_5_Turbo_gpt_4o(question_gpt_4o)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=assessment_2_messages + [
        {"role": "assistant", "content": assessment_2},
        {"role": "system", "content": get_formatted_instruction_gpt_3_5_Turbo_gpt_4o("", instruction_2_1, context, True)},
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

# Create the output folder if needed so the API results above are not lost
# to a FileNotFoundError on a fresh checkout.
os.makedirs(RESULT_DIR, exist_ok=True)
with open(RESULT_DIR+"/"+"results_idea4_subapproach1.json", "w") as f:
    json.dump(result, f)

# TF-IDF

In [None]:
# Reload the saved idea-4 results and display the three model predictions.
with open(f"{RESULT_DIR}/results_idea4_subapproach1.json", "r") as f:
    predictions = json.loads(f.read())

print("Prediction of GPT-2:\n {} \n".format(predictions['prediction_1']))
print("Prediction of GPT-3.5 Turbo:\n {} \n".format(predictions['prediction_2']))
print("Prediction of GPT-4o:\n {}".format(predictions['prediction_3']))

In [None]:
# Sub-approach 1: rankings of the 7 fixed deep technologies, most likely first,
# one list per idea (0-4) and per model. Presumably transcribed by hand from the
# saved results_idea*_subapproach1.json answers - verify against those files.

# Idea 0
idea_0_gpt_2_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
idea_0_gpt_3_5_Turbo_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Biotech", "Robotics", "New Materials", "Electronics & Photonics"]
idea_0_gpt_4o_predictions_processed_subapproach1 = ["AI", "Biotech", "AR & VR", "Quantum Computing", "Robotics", "New Materials", "Electronics & Photonics"]

# Idea 1
idea_1_gpt_2_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
idea_1_gpt_3_5_Turbo_predictions_processed_subapproach1 = ["AI", "Biotech", "Quantum Computing", "Robotics", "AR & VR", "Electronics & Photonics", "New Materials"]
idea_1_gpt_4o_predictions_processed_subapproach1 = ["AI", "Biotech", "Robotics", "Quantum Computing", "AR & VR", "Electronics & Photonics", "New Materials"]

# Idea 2
idea_2_gpt_2_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
idea_2_gpt_3_5_Turbo_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "Robotics", "Biotech", "AR & VR", "New Materials", "Electronics & Photonics"]
idea_2_gpt_4o_predictions_processed_subapproach1 = ["AI", "Robotics", "Biotech", "AR & VR", "Quantum Computing", "New Materials", "Electronics & Photonics"]

# Idea 3
idea_3_gpt_2_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
idea_3_gpt_3_5_Turbo_predictions_processed_subapproach1 = ["AI", "Robotics", "Quantum Computing", "Biotech", "AR & VR", "New Materials", "Electronics & Photonics"]
idea_3_gpt_4o_predictions_processed_subapproach1 = ["AI", "Biotech", "Quantum Computing", "Robotics", "AR & VR", "Electronics & Photonics", "New Materials"]

# Idea 4
idea_4_gpt_2_predictions_processed_subapproach1 = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
idea_4_gpt_3_5_Turbo_predictions_processed_subapproach1 = ["AI", "Robotics", "Biotech", "Quantum Computing", "AR & VR", "Electronics & Photonics", "New Materials"]
idea_4_gpt_4o_predictions_processed_subapproach1 = ["AI", "Biotech", "Robotics", "Quantum Computing", "AR & VR", "Electronics & Photonics", "New Materials"]

In [None]:
# Sub-approach 2: technology names as listed by each model, in the model's own
# order, one list per idea (4 down to 0) and per model. Presumably transcribed by
# hand from the model answers - verify against the saved result files.
idea_4_gpt_2_predictions_subapproach2 = ["Artificial intelligence", "Deep learning", "Augmented reality", "Internet of Things", "Cloud computing", "Virtual reality", "Machine learning"]
idea_4_gpt_3_5_Turbo_predictions_subapproach2 = ["Human Augmentation", "Neural Interfaces", "Swarm Intelligence", "Citizen Data Science", "Autonomous Everything", "Biochips", "Self-Healing Systems"]
idea_4_gpt_4o_predictions_subapproach2 = ["Artificial Intelligence", "Quantum Computing", "5G and Advanced Connectivity", "Internet of Things", "Blockchain", "Augmented and Virtual Reality", "Biotechnology and Genomics"]

idea_3_gpt_2_predictions_subapproach2 = ["Deep learning", "Augmented reality", "Artificial intelligence", "Virtual reality", "Deep learning", "Augmented reality", "Artificial intelligence"]
idea_3_gpt_3_5_Turbo_predictions_subapproach2 = ["Artificial Intelligence", "Quantum Computing", "Internet of Things", "Augmented Reality", "Blockchain", "5G Technology", "Biotechnology"]
# Named "..._predictions_..." like every sibling list; the original singular
# spelling is kept below as an alias so existing references keep working.
idea_3_gpt_4o_predictions_subapproach2 = ["Autonomous AI", "Pervasive Cloud", "Generative AI", "Human-Centric Security and Privacy", "5G Technology", "Quantum Computing", "IoT"]
idea_3_gpt_4o_prediction_subapproach2 = idea_3_gpt_4o_predictions_subapproach2

idea_2_gpt_2_predictions_subapproach2 = ["Artificial Intelligence", "Deep Learning", "Quantum Computing", "Virtual reality", "Augmented Reality", "Internet of Things", "Machine Learning"]
idea_2_gpt_3_5_Turbo_predictions_subapproach2 = ["Extended Reality", "Artificial Intelligence", "Quantum Computing", "Internet of Things", "Blockchain", "Edge Computing", "Robotics and Automation"]
idea_2_gpt_4o_predictions_subapproach2 = ["Artificial Intelligence", "Internet of Things", "Quantum Computing", "Blockchain", "Edge Computing", "Extended Reality", "Robotics and Automation"]

idea_1_gpt_2_predictions_subapproach2 = ["3D Vision", "Artificial Intelligence", "Deep Learning", "Big Data", "Machine Learning", "Augmented Reality", "Blockchain"]
idea_1_gpt_3_5_Turbo_predictions_subapproach2 = ["Quantum Computing", "Edge AI", "Synthetic Biology", "Robotics Process Automation", "Explainable AI", "Federated Learning", "Neuromorphic Computing"]
idea_1_gpt_4o_predictions_subapproach2 = ["Autonomous AI", "Generative AI", "Edge AI", "Federated Learning", "Synthetic Biology", "Human-Centric Security and Privacy", "Robotic Process Automation"]

idea_0_gpt_2_predictions_subapproach2 = ["Artificial Intelligence", "Deep Learning", "Blockchain", "Quantum Computing", "3D Printing", "Virtual Reality", "Augmented Reality"]
idea_0_gpt_3_5_Turbo_predictions_subapproach2 = ["Artificial Intelligence", "Quantum Computing", "Internet of Things", "Edge Computing", "Robotics", "Augmented Reality", "Biotechnology"]
idea_0_gpt_4o_predictions_subapproach2 = ["Artificial Intelligence", "Quantum Computing", "Biotechnology", "Internet of Things", "Robotics", "Edge Computing", "Augmented Reality"]

In [None]:
# Sub-approach 2 predictions expressed with the canonical technology labels
# (e.g. "Augmented reality" / "Virtual reality" -> "AR & VR"), one list per
# (idea, model) pair. A list can be shorter than its raw counterpart because
# several raw answers share one label; raw answers without a matching label
# are presumably dropped -- TODO confirm the mapping procedure.
idea_4_gpt_2_predictions_processed_subapproach2 = ["AI", "AR & VR", "IoT", "Cloud Computing"]
idea_4_gpt_3_5_Turbo_predictions_processed_subapproach2 = ["Biotech", "AI", "Robotics", "New Materials"]
idea_4_gpt_4o_predictions_processed_subapproach2 = ["AI", "Quantum Computing", "5G", "IoT", "Blockchain", "AR & VR", "Biotech"]

idea_3_gpt_2_predictions_processed_subapproach2 = ["AI", "AR & VR"]
idea_3_gpt_3_5_Turbo_predictions_processed_subapproach2 = ["AI", "Quantum Computing", "IoT", "AR & VR", "Blockchain", "5G", "Biotech"]
idea_3_gpt_4o_predictions_processed_subapproach2 = ["AI", "Cloud Computing", "Cybersecurity", "5G", "Quantum Computing", "IoT"]

idea_2_gpt_2_predictions_processed_subapproach2 = ["AI", "Quantum Computing", "AR & VR", "IoT"]
idea_2_gpt_3_5_Turbo_predictions_processed_subapproach2 = ["AR & VR", "AI", "Quantum Computing", "IoT", "Blockchain", "Edge Computing", "Robotics"]
idea_2_gpt_4o_predictions_processed_subapproach2 = ["AI", "IoT", "Quantum Computing", "Blockchain", "Edge Computing", "AR & VR", "Robotics"]

idea_1_gpt_2_predictions_processed_subapproach2 = ["AI", "Cloud Computing", "AR & VR", "Blockchain"]
idea_1_gpt_3_5_Turbo_predictions_processed_subapproach2 = ["Quantum Computing", "AI", "Biotech", "Robotics"]
idea_1_gpt_4o_predictions_processed_subapproach2 = ["AI", "Biotech", "Cybersecurity", "Robotics"]

idea_0_gpt_2_predictions_processed_subapproach2 = ["AI", "Blockchain", "Quantum Computing", "Additive Manufacturing", "AR & VR"]
idea_0_gpt_3_5_Turbo_predictions_processed_subapproach2 = ["AI", "Quantum Computing", "IoT", "Edge Computing", "Robotics", "AR & VR", "Biotech"]
idea_0_gpt_4o_predictions_processed_subapproach2 = ["AI", "Quantum Computing", "Biotech", "IoT", "Robotics", "Edge Computing", "AR & VR"]

In [None]:
# Canonical technology label -> lowercase sub-terms whose occurrences are
# counted for that technology. Keeping label and sub-terms side by side (rather
# than in two parallel lists zipped by position) makes a mis-pairing impossible.
technology_to_subterm = {
    'AI': ["ai", "artificial intelligence", "machine learning", "deep learning", "genai", "natural language processing", "intelligent systems"],
    'Quantum Computing': ["quantum", "qubit", "qubits"],
    'AR & VR': ["ar", "vr", "xr", "mr", "augmented reality", "virtual reality", "extended reality", "mixed reality", "metaverse"],
    'Robotics': ["robotics", "robotic", "robots", "robot", "cobots", "cobot", "rpa"],
    'Biotech': ["biopharma", "biometrics", "biotechnology", "biotech", "bio", "biomolecules", "biosystems", "biomachines", "biocomputing", "omics", "genetics", "neuroscience", "life science", "genetic engineering", "genomics", "biology", "bioinformatics"],
    # "photonic crystal" used to be listed twice (so it was counted twice);
    # the first entry is now the plural, like the neighbouring plural/singular pairs.
    'New Materials': ["nanomaterials", "nanomaterial", "materials", "material", "metamaterials", "metamaterial", "alloy", "alloys", "superconductors", "superconductor", "conductive polymers", "conductive polymer", "photonic crystals", "photonic crystal", "biodegradable plastics", "biodegradable plastic"],
    'Electronics & Photonics': ["electronics", "electronic", "photonics", "photonic", "integrated circuit", "transistor", "transistors", "microprocessor", "microprocessors", "microcontroller", "microcontrollers", "spectroscopy", "lidar", "led", "plasmonics", "holography", "optics", "optical"],
    'Blockchain': ["blockchain", "distributed ledger", "decentralization", "cryptography", "smart contracts", "consensus mechanisms"],
    'Cloud Computing': ["cloud", "infrastructure as a service", "iaas", "platform as a service", "paas", "software as a service", "saas", "serverless computing"],
    'Cybersecurity': ["cybersecurity", "network security", "endpoint protection", "encryption", "firewall", "malware detection", "identity and access management", "iam", "zero trust", "phishing prevention", "data breach", "vulnerability management"],
    'Edge Computing': ["edge computing", "edge devices", "decentralized computing", "fog computing", "edge analytics", "micro data centers"],
    'IoT': ["iot", "sensors", "remote monitoring"],
    '5G': ["5g", "massive mimo", "beamforming", "millimeter wave", "mmwave", "network slicing", "enhanced mobile broadband", "embb"],
    'Additive Manufacturing': ['additive manufacturing', "3d printing", "stereolithography", "sla", "selective laser sintering", "sls", "fused deposition modeling", "fdm", "layer-by-layer fabrication", "lbl", "powder bed fusion", "pbf"],
}

# Parallel views kept for the cells that index technologies / sub-terms by position
# (dicts preserve insertion order, so both lists follow the order above).
unique_technologies = list(technology_to_subterm)
all_subterms = list(technology_to_subterm.values())

# A sub-term repeated inside one list would be summed once per repetition.
assert all(len(sub_terms) == len(set(sub_terms)) for sub_terms in all_subterms), "duplicated sub-term within a technology"

In [None]:
import nltk
import pandas as pd
import seaborn as sns
import math

from scipy.sparse import diags
from utils import read_pdf
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

nltk.download('stopwords')

def compute_tf_df_scores(validation_folder_path, terms, tf_normalize):
  """Score each group of sub-terms by average term frequency times a smoothed IDF.

  Every file found in ``validation_folder_path`` is read with ``read_pdf``,
  lower-cased and vectorized into raw counts of 1- to 4-grams with the NLTK
  English stop words removed.

  Args:
    validation_folder_path: Folder whose files are all passed to ``read_pdf``.
    terms: List of sub-term lists. Each inner list holds the lowercase
      spellings that together count as one term (technology).
    tf_normalize: If true, the counts of each document are divided by the
      number of distinct unigrams found in that document.

  Returns:
    A list with one score per entry of ``terms``:
    ``tf * (log10((N + 1) / (n + 1)) + 1)`` where ``N`` is the number of
    documents, ``n`` the number of documents containing at least one of the
    sub-terms, and ``tf`` the summed (optionally normalized) count of the
    sub-terms over the corpus divided by ``n``. The score is 0 when no
    document contains the term.

  Raises:
    ValueError: If the folder contains no files.
  """
  corpus = []
  # sorted() only fixes the document order so repeated runs sum in the same order.
  for file_name in tqdm(sorted(os.listdir(validation_folder_path)), desc="reading validation files " + validation_folder_path + " ..."):
    file_content = read_pdf(os.path.join(validation_folder_path, file_name)).lower()
    corpus.append(file_content)

  total_number_of_documents = len(corpus)
  if total_number_of_documents == 0:
    raise ValueError("no validation files found in " + validation_folder_path)

  # use_idf=False and norm=None make the vectorizer return plain n-gram counts.
  vectorizer = TfidfVectorizer(ngram_range=(1, 4), stop_words=stopwords.words('english'), norm=None, use_idf=False)
  X = vectorizer.fit_transform(corpus).tocsr()
  X.eliminate_zeros()  # stored entries == present n-grams, so indptr gives per-row counts
  vocabulary = vectorizer.vocabulary_  # feature string -> column index, O(1) lookups

  if tf_normalize:
    feature_names = vectorizer.get_feature_names_out()
    unigram_columns = np.flatnonzero([len(name.split()) == 1 for name in feature_names])
    # NOTE(review): this is the number of *distinct* unigrams per document, not the
    # total token count -- kept as in the original implementation; confirm intent.
    distinct_unigrams_per_document = np.diff(X[:, unigram_columns].tocsr().indptr)
    inverse_number_of_unigrams_in_docs = np.zeros(total_number_of_documents)
    has_unigrams = distinct_unigrams_per_document > 0
    # Documents without any unigram (e.g. no extractable text) keep a factor of 0
    # instead of raising ZeroDivisionError; their rows are empty anyway.
    inverse_number_of_unigrams_in_docs[has_unigrams] = 1.0 / distinct_unigrams_per_document[has_unigrams]
    X = (diags(inverse_number_of_unigrams_in_docs) @ X).tocsr()

  preprocess = vectorizer.build_preprocessor()
  tokenize = vectorizer.build_tokenizer()
  stop_words = vectorizer.get_stop_words()

  def feature_columns(sub_terms):
    """Return the sorted, de-duplicated vocabulary columns matching ``sub_terms``."""
    columns = set()
    for sub_term in sub_terms:
      # Put the sub-term through the same preprocessing/tokenizing/stop-word steps as the
      # documents: "software as a service" is stored as the feature "software service",
      # so the raw string would never be found. Sub-terms that already are valid
      # features are left unchanged by this step.
      feature = " ".join(token for token in tokenize(preprocess(sub_term)) if token not in stop_words)
      if feature in vocabulary:
        columns.add(vocabulary[feature])
    return sorted(columns)

  tf_idf_scores = []
  for sub_terms in terms:
    columns = feature_columns(sub_terms)
    if columns:
      sub_term_counts = X[:, columns].tocsr()
      # a document contains the term when at least one of its sub-terms occurs in it
      number_of_documents = int(np.count_nonzero(np.diff(sub_term_counts.indptr)))
      summed_frequency = sub_term_counts.sum()
    else:
      number_of_documents = 0
      summed_frequency = 0.0

    inverse_document_frequency = math.log10((total_number_of_documents+1)/(number_of_documents+1))+1
    # Average over the documents that contain the term (not over the whole corpus).
    term_frequency = summed_frequency/number_of_documents if number_of_documents else 0
    tf_idf_scores.append(term_frequency*inverse_document_frequency)

  return tf_idf_scores

# Technologies tracked across all validation years, and their sub-term lists.
fixed_technologies = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
terms = [technology_to_subterm[technology] for technology in fixed_technologies]

# Alternative configuration (score one prediction list for a single year):
# technologies = idea_4_gpt_4o_predictions_processed_subapproach1
# terms = [technology_to_subterm[technology] for technology in technologies]
tf_normalize = True

validation_folder_path_2019 = "./validation/2019"
validation_folder_path_2021 = "./validation/2021"
validation_folder_path_2023 = "./validation/2023"
validation_folder_path_2025 = "./validation/2025"

# 2019 is only used as the baseline of the 2019->2021 growth rate; it is not plotted.
tf_df_scores_2019 = compute_tf_df_scores(validation_folder_path_2019, terms, tf_normalize)
tf_df_scores_2021 = compute_tf_df_scores(validation_folder_path_2021, terms, tf_normalize)
tf_df_scores_2023 = compute_tf_df_scores(validation_folder_path_2023, terms, tf_normalize)
tf_df_scores_2025 = compute_tf_df_scores(validation_folder_path_2025, terms, tf_normalize)


def _replace_zero_scores(scores):
  """Return ``scores`` as an array with zeros replaced by a tenth of the mean score.

  Growth rates divide by the earlier year's score, so a zero score needs a
  small positive stand-in.
  """
  fallback = np.mean(scores)/10
  return np.array([elem if elem else fallback for elem in scores])


# Inputs of the growth-rate cells below.
tf_df_scores_2019_no_zeros = _replace_zero_scores(tf_df_scores_2019)
tf_df_scores_2021_no_zeros = _replace_zero_scores(tf_df_scores_2021)
tf_df_scores_2023_no_zeros = _replace_zero_scores(tf_df_scores_2023)
tf_df_scores_2025_no_zeros = _replace_zero_scores(tf_df_scores_2025)

# l1_normalized_tf_df_scores_2025 = np.array(tf_df_scores_2025)/np.sum(tf_df_scores_2025)

# Long-format table: one row per (year, technology).
# years = [2025] * len(technologies)
years = [2021] * len(fixed_technologies) + [2023] * len(fixed_technologies) + [2025] * len(fixed_technologies)
tf_idf_scores = tf_df_scores_2021 + tf_df_scores_2023 + tf_df_scores_2025
# tf_idf_scores = l1_normalized_tf_df_scores_2025
tf_idf_df = pd.DataFrame({"Year": years, "Term": fixed_technologies*3, "TF-IDF": tf_idf_scores})
# tf_idf_df = pd.DataFrame({"Year": years, "Term": technologies, "TF-IDF": tf_idf_scores})

In [None]:
def _relative_change(later_scores, earlier_scores):
  """Element-wise relative change from ``earlier_scores`` to ``later_scores``."""
  return (later_scores-earlier_scores)/earlier_scores


# Two-year growth of each technology's score, computed on the zero-free score arrays.
growth_rates_2019_2021 = _relative_change(tf_df_scores_2021_no_zeros, tf_df_scores_2019_no_zeros)
growth_rates_2021_2023 = _relative_change(tf_df_scores_2023_no_zeros, tf_df_scores_2021_no_zeros)
growth_rates_2023_2025 = _relative_change(tf_df_scores_2025_no_zeros, tf_df_scores_2023_no_zeros)

In [None]:
from matplotlib import colormaps

# One fixed color per known technology so every figure uses the same color for it.
cmap = colormaps['tab20']
colors = cmap.colors[:len(unique_technologies)]

technology_to_color = dict(zip(unique_technologies, colors))

# Take the hue levels from the table that is actually plotted (first-appearance
# order) instead of a separately maintained list: the palette then always lines
# up with the bars, whichever technology list tf_idf_df was built from.
plotted_technologies = list(dict.fromkeys(tf_idf_df["Term"]))
technologies_colors = [technology_to_color[technology] for technology in plotted_technologies]

sns.barplot(tf_idf_df, x="Year", y="TF-IDF", hue="Term", hue_order=plotted_technologies, palette=technologies_colors)
plt.xticks(rotation=0, ha='center')
plt.ylim(0,1)
plt.tight_layout()
# plt.savefig("tf_idf_normalized_tf_xaxis_year.png")
# NOTE(review): the file name refers to one specific prediction list; adjust it
# when tf_idf_df holds a different set of technologies.
plt.savefig("idea_4_gpt_4o_predictions_subapproach1.png")
plt.show()

In [None]:
# `technologies` is the predicted ordering under evaluation (best first).
# Look its scores up by name in the most recent year of tf_idf_df rather than
# relying on tf_idf_scores being positionally aligned with `technologies`
# (tf_idf_scores holds one entry per year and technology when several years are scored).
latest_year_rows = tf_idf_df[tf_idf_df["Year"] == tf_idf_df["Year"].max()]
score_by_technology = dict(zip(latest_year_rows["Term"], latest_year_rows["TF-IDF"]))
technologies_scores = [score_by_technology[technology] for technology in technologies]

# Ground truth: technologies sorted by decreasing score, ranked 1..n.
ground_truth_ordering_string = np.array(technologies)[np.flip(np.argsort(technologies_scores))]
ground_truth_ordering_numerical = np.arange(1, len(technologies)+1)
ground_truth_ordering_dict = dict(zip(ground_truth_ordering_string, ground_truth_ordering_numerical))
# Ground-truth rank of each technology, listed in the predicted order.
predicted_ordering_numerical = np.array([ground_truth_ordering_dict[tech] for tech in technologies])

In [None]:
# Ground truth: the fixed technologies sorted by decreasing 2023->2025 growth rate, ranked 1..n.
growth_rate_descending_indices = np.argsort(growth_rates_2023_2025)[::-1]
ground_truth_ordering_string_growth_rate = np.array(fixed_technologies)[growth_rate_descending_indices]
ground_truth_ordering_numerical_growth_rate = np.arange(1, len(fixed_technologies)+1)
ground_truth_ordering_dict_growth_rate = dict(
    zip(ground_truth_ordering_string_growth_rate, ground_truth_ordering_numerical_growth_rate)
)
# Ground-truth rank of each technology, listed in the order given by `technologies`.
predicted_ordering_numerical_growth_rate = np.array(
    [ground_truth_ordering_dict_growth_rate[tech] for tech in technologies]
)

In [None]:
from scipy import stats

# Kendall's tau between the ground-truth ranks and the ranks in predicted order (TF-IDF based).
res = stats.kendalltau(ground_truth_ordering_numerical, predicted_ordering_numerical)
tau, p_value = res.statistic, res.pvalue
print(tau, p_value)

In [None]:
# Kendall's tau between the ground-truth ranks and the ranks in predicted order (growth-rate based).
res = stats.kendalltau(ground_truth_ordering_numerical_growth_rate, predicted_ordering_numerical_growth_rate)
tau_growth_rate, p_value_growth_rate = res.statistic, res.pvalue
print(tau_growth_rate, p_value_growth_rate)

In [None]:
plt.figure(figsize=(12, 8))
ax = sns.barplot(tf_idf_df, x="Year", y="TF-IDF", hue="Term")
plt.xticks(rotation=0, ha='center')
plt.tight_layout()
# The loops below treat each entry of ax.containers as the bars of one
# technology (hue level), one bar per year.
technologies_bars_heights = [[bar.get_height() for bar in bars] for bars in ax.containers]
technologies_bars_centers = [[bar.get_x() + bar.get_width()/2 for bar in bars] for bars in ax.containers]
technologies_bars_colors = [[bar.get_facecolor() for bar in bars] for bars in ax.containers]

# Growth rates used only for the labels of this figure: zero scores are replaced
# by 1 so the division is defined. They live under their own names so the
# notebook-wide tf_df_scores_* lists and growth_rates_* arrays are not overwritten.
annotation_scores_2021 = np.array([elem if elem else 1 for elem in tf_df_scores_2021])
annotation_scores_2023 = np.array([elem if elem else 1 for elem in tf_df_scores_2023])
annotation_scores_2025 = np.array([elem if elem else 1 for elem in tf_df_scores_2025])

annotation_growth_rates_2021_2023 = (annotation_scores_2023-annotation_scores_2021)/annotation_scores_2021
annotation_growth_rates_2023_2025 = (annotation_scores_2025-annotation_scores_2023)/annotation_scores_2023

# Connect the bar tops of each technology and label every segment with its growth rate.
for technology_bars_heights, technology_bars_centers, technology_bars_colors, growth_rate_1, growth_rate_2 in zip(technologies_bars_heights, technologies_bars_centers, technologies_bars_colors, annotation_growth_rates_2021_2023, annotation_growth_rates_2023_2025):
    curve_color = technology_bars_colors[0]
    bar_tops = list(zip(technology_bars_centers, technology_bars_heights))
    segments = list(zip(bar_tops[:-1], bar_tops[1:]))
    for (start_x, start_y), (end_x, end_y) in segments:
        plt.plot([start_x, end_x], [start_y, end_y], color=curve_color, linewidth=0.5)
    for ((start_x, start_y), (end_x, end_y)), growth_rate in zip(segments, (growth_rate_1, growth_rate_2)):
        ax.text(
            (start_x+end_x)/2,        # x: midpoint between the two bar centers
            (start_y+end_y)/2,        # y: midpoint of the connecting segment
            f'{growth_rate:.2f}',     # growth rate with two decimals
            ha='center',
            va='bottom'
        )
# plt.savefig("tf_df_raw_tf_xaxis_year_with_growth_rates.png")
plt.show()

In [None]:
# plt.figure(figsize=(12, 8))
from matplotlib import colormaps

# One fixed color per known technology so every figure uses the same color for it.
cmap = colormaps['tab20']
colors = cmap.colors[:len(unique_technologies)]

technology_to_color = dict(zip(unique_technologies, colors))

fixed_technologies = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]

# Each point is the growth rate over the two-year period that ends in the plotted year.
plot_years = [2021, 2023, 2025]
for technology, growth_2021, growth_2023, growth_2025 in zip(fixed_technologies, growth_rates_2019_2021, growth_rates_2021_2023, growth_rates_2023_2025):
    plt.plot(plot_years, [growth_2021, growth_2023, growth_2025], marker='o', label=technology, color=technology_to_color[technology])

plt.xticks(plot_years)
plt.xlabel('Year')
plt.ylabel('Growth Rate')
# plt.title('Growth Rate of Technologies from 2021 to 2025')
plt.grid(True)
# A single legend call: a second plt.legend() would replace the first one and drop its title.
plt.legend(title='Technology', loc='upper right')
plt.tight_layout()
plt.savefig("growth_rates_2021_2023_2025.png")
plt.show()

In [None]:
from matplotlib import colormaps

# One fixed color per known technology so every figure uses the same color for it.
cmap = colormaps['tab20']
colors = cmap.colors[:len(unique_technologies)]
technology_to_color = dict(zip(unique_technologies, colors))

fixed_technologies = ["AI", "Quantum Computing", "AR & VR", "Robotics", "Biotech", "New Materials", "Electronics & Photonics"]
tech_colors = [technology_to_color[technology] for technology in fixed_technologies]

plt.figure(figsize=(12, 6))
ax = sns.barplot(tf_idf_df, x="Term", y="TF-IDF", hue="Year", legend=False)
# The legend is disabled, so the years are spelled out above every technology name.
new_xticklabels = ["2021 2023 2025\n" + xticklabel.get_text() for xticklabel in ax.get_xticklabels()]
ax.set_xticklabels(new_xticklabels, rotation=0, ha='center')

plt.tight_layout()

plot_years = [2021, 2023, 2025]
# ax.containers is indexed as [year][technology]; regroup the bars per technology.
bars_per_technology = [
    [ax.containers[year_index][technology_index] for year_index in range(len(plot_years))]
    for technology_index in range(len(fixed_technologies))
]
technologies_bars_heights = [[bar.get_height() for bar in bars] for bars in bars_per_technology]
technologies_bars_centers = [[bar.get_x() + bar.get_width()/2 for bar in bars] for bars in bars_per_technology]
technologies_bars_colors = [[bar.get_facecolor() for bar in bars] for bars in bars_per_technology]

# growth_rates_2019_2021 / growth_rates_2021_2023 / growth_rates_2023_2025 are not
# computed here; they must already exist when this cell runs.
growth_rates_per_technology = zip(growth_rates_2019_2021, growth_rates_2021_2023, growth_rates_2023_2025)

for technology_bars, bar_heights, bar_centers, color, technology_growth_rates in zip(bars_per_technology, technologies_bars_heights, technologies_bars_centers, tech_colors, growth_rates_per_technology):
    # recolor the bars so each technology keeps the color it has in the other figures
    for bar in technology_bars:
        bar.set_facecolor(color)

    # thin line joining the tops of the 2021, 2023 and 2025 bars
    for left in range(len(plot_years) - 1):
        right = left + 1
        plt.plot([bar_centers[left], bar_centers[right]], [bar_heights[left], bar_heights[right]], color=color, linewidth=0.5)

    # above each bar: growth rate over the two-year period ending in that bar's year
    for bar_x, bar_y, growth_rate in zip(bar_centers, bar_heights, technology_growth_rates):
        ax.text(
            bar_x,                    # horizontally centered on the bar
            bar_y,                    # anchored at the top of the bar
            f'{growth_rate:.2f}',     # two decimals
            ha='center',
            va='bottom',
            fontsize=9
        )
plt.savefig("tf_idf_normalized_tf_xaxis_term_with_growth_rates.png")
plt.show()

In [None]:
# Grouped bars per year, one bar per technology, colored with the palette in `tech_colors`.
sns.barplot(
    data=tf_idf_df,
    x="Year",
    y="TF-IDF",
    hue="Term",
    palette=tech_colors,
)
plt.xticks(ha='center', rotation=0)
plt.tight_layout()
plt.savefig("tf_idf_normalized_tf_xaxis_year.png")
plt.show()