In [None]:
pip install faiss-cpu

In [None]:
pip install pypdf

In [None]:
pip install dotenv

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder, so that the relative paths used
# later in the notebook (e.g. "./RAG documents" and the result folders) resolve from there.
%cd /content/drive/MyDrive/"MA4 semester project"

In [None]:
import torch
import faiss
import pypdf
import json
import os
import numpy as np
import matplotlib.pyplot as plt

from openai import OpenAI
from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download
from io import BytesIO
from utils import ChunkLoader, RAGEncoder, retrieve_relevant_chunks, get_all_chunks, show_chunks_statistics, get_index_and_chunks, get_formatted_prompt, generate_answer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU whenever torch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Running on: ", device)

# Credentials are read from the environment, after loading the variables declared in a .env file.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

In [None]:
# Chunking hyper-parameters: maximum size of a chunk and the overlap between
# consecutive chunks (20% of the maximum chunk size, truncated to an integer).
MAX_CHUNK_SIZE = 256
OVERLAP = int(0.2*MAX_CHUNK_SIZE)

# Alternative setup (disabled): separate facebook DPR encoders for chunks and questions.
# # Setting the LLM used for chunk embedding
# chunk_encoder_tokenizer = AutoTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# chunk_encoder_model = AutoModel.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# chunk_encoder = RAGEncoder(chunk_encoder_tokenizer, chunk_encoder_model, device)

# # Setting the LLM used for question embedding
# question_encoder_tokenizer = AutoTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# question_encoder_model = AutoModel.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
# question_encoder = RAGEncoder(question_encoder_tokenizer, question_encoder_model, device)

# Active setup: one sentence-transformers model embeds both the chunks and the questions.
rag_encoder_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# rag_encoder_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# rag_encoder_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
rag_encoder = RAGEncoder(rag_encoder_model.tokenizer, rag_encoder_model, device)

# Both roles point at the very same encoder object.
chunk_encoder = question_encoder = rag_encoder

# RAG preparation: chunking, embedding and uploading to HF hub

In [None]:
# Path of the folder whose sub-folders contain the RAG documents
# (the parent folder is divided into sub-folders covering different timespans).
PARENT_FOLDER = "./RAG documents"

def rag_chunk_embed_and_upload(parent_folder, rag_encoder, max_chunk_size, overlap, prefix, hugging_face_write_token, show_statistics, repo_id="ziedM/rag_dataset"):
  """Chunk, embed and upload the documents of every sub-folder of `parent_folder`.

  For each sub-folder two files are uploaded to the Hugging Face dataset repo `repo_id`:
    - `<prefix>_chunks_<sub_folder>.jsonl`: one JSON object `{"chunk": ...}` per line,
    - `<prefix>_embeddings_<sub_folder>.npy`: the vertically stacked chunk embeddings.
  Both files are built in memory (BytesIO), nothing is written to the local disk.

  Args:
    parent_folder: path of the folder containing one sub-folder per document collection.
    rag_encoder: encoder exposing `.tokenizer` and `.encode_text(chunk)`.
    max_chunk_size: maximum chunk size forwarded to `get_all_chunks`.
    overlap: overlap between chunks forwarded to `get_all_chunks`.
    prefix: prefix of the uploaded file names.
    hugging_face_write_token: Hugging Face token used to create the `HfApi` client.
    show_statistics: when truthy, `show_chunks_statistics` is called for each sub-folder.
    repo_id: target dataset repository (defaults to the repo previously hard-coded here).
  """
  api = HfApi(token=hugging_face_write_token)

  # os.listdir also returns plain files and hidden folders (e.g. ".ipynb_checkpoints"),
  # which are not document collections; sorting makes the processing order deterministic.
  sub_folders = sorted(
      entry for entry in os.listdir(parent_folder)
      if not entry.startswith(".") and os.path.isdir(os.path.join(parent_folder, entry))
  )

  for sub_folder in tqdm(sub_folders, desc="reading parent folder " + parent_folder + " ..."):
    # chunking the documents of the current sub_folder
    all_chunks = get_all_chunks(os.path.join(parent_folder, sub_folder), rag_encoder.tokenizer, max_chunk_size, overlap)

    # np.vstack raises on an empty list, and an empty dataset is useless anyway:
    # skip the sub-folder before anything is uploaded for it.
    if len(all_chunks) == 0:
      print("No chunks found in " + sub_folder + ", skipping it.")
      continue

    if show_statistics:
      show_chunks_statistics(all_chunks, rag_encoder.tokenizer)

    # transform the chunks list into a jsonl file (we use BytesIO to avoid having to store the file locally)
    all_chunks_jsonl_content = "\n".join(json.dumps({"chunk": chunk}) for chunk in all_chunks)
    all_chunks_jsonl_file = BytesIO(all_chunks_jsonl_content.encode("utf-8"))

    # upload the jsonl file
    api.upload_file(
        path_or_fileobj=all_chunks_jsonl_file,
        path_in_repo=prefix + "_chunks_" + sub_folder + ".jsonl",
        repo_id=repo_id,
        repo_type="dataset",
    )

    # vectorize the chunks (again we use BytesIO for the same reason as above)
    all_vectors_npy_file = BytesIO()
    all_vectors_npy_content = np.vstack([rag_encoder.encode_text(chunk) for chunk in tqdm(all_chunks, desc="vectorizing the chunks ...")])
    np.save(all_vectors_npy_file, all_vectors_npy_content)
    all_vectors_npy_file.seek(0)  # set the offset back to the beginning of the stream

    # upload the embeddings/vectors (we choose numpy as we're using faiss vector DB for similarity search later in the RAG pipeline)
    api.upload_file(
        path_or_fileobj=all_vectors_npy_file,
        path_in_repo=prefix + "_embeddings_" + sub_folder + ".npy",
        repo_id=repo_id,
        repo_type="dataset",
    )

# Chunk, embed and upload every sub-folder of PARENT_FOLDER with the sentence-transformer encoder.
rag_chunk_embed_and_upload(
    parent_folder=PARENT_FOLDER,
    rag_encoder=chunk_encoder,
    max_chunk_size=MAX_CHUNK_SIZE,
    overlap=OVERLAP,
    prefix="sentence_transformer",
    hugging_face_write_token=HUGGING_FACE_TOKEN,
    show_statistics=True,
)

# Retrieval Augmented Generation

In [None]:
# Generation models: three LLMs are used (GPT2, GPT-3.5 Turbo and GPT-4o).

# GPT2 is loaded from the HF hub (knowledge cutoff: November 2019).
GPT2_CHECKPOINT = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)
model = model.to(device)

# Register a dedicated padding token, resize the embedding matrix to the new
# vocabulary size and tell the model which id stands for padding.
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
embedding_layer = model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# A few facts about the loaded model (the embedding size for GPT2 small is 768).
print("The vocabulary length for the GPT2 small model is:", len(tokenizer))
print("The number of parameters for the GPT2 small model is:", sum(p.numel() for p in model.parameters()))
print("The max sentence length for this model is {}".format(tokenizer.model_max_length))
for message, token_id in (
    ("The beginning of sequence token {} token has the id {}", tokenizer.bos_token_id),
    ("The end of sequence token {} has the id {}", tokenizer.eos_token_id),
    ("The pad token {} has the id {}", tokenizer.pad_token_id),
):
  print(message.format(tokenizer.convert_ids_to_tokens(token_id), token_id))

# GPT-3.5 Turbo (knowledge cutoff: September 2021) and GPT-4o (knowledge cutoff: October 2023)
# are not publicly available, so they are queried through OpenAI's platform with this client.
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Number of retrieved chunks (top-k) given as context to the first (gpt-2),
# second (gpt-3.5 turbo) and third (gpt-4o) model respectively.
TOP_K_FIRST_MODEL = 3
TOP_K_SECOND_MODEL = 5
TOP_K_THIRD_MODEL = 7
# Folder in which the result JSON files of the experiments below are written.
RESULT_DIR = "result_sentence_transformer" # or result_facebook_dpr

# Prompt building blocks:
# - instruction 1.0 is used to get the prediction without any retrieved context,
# - instruction 1.1 is used to get the prediction from the retrieved context,
# - instruction 1.2 is used to get the assessment; its two placeholders are filled
#   with the year and the prediction to assess.
role = "You are an expert in deep technologies."
instruction_1_0 = "Answer the question below. Keep your response concise (maximum 15 lines)."
instruction_1_1 = "Using the provided context, answer the question below. Keep your response concise (maximum 15 lines)."
instruction_1_2 = "Based on your current knowledge, evaluate the accuracy of the following prediction about deep technologies in {}. Assess whether the technologies mentioned: - Had the predicted economic or societal impact - Gained significant adoption or traction - Were delayed, overestimated, or abandoned. Provide a short analysis of how accurate the prediction was, citing relevant developments if possible. Prediction: {}.\nAssessment: "

In [None]:
# Question template asked to every model. It is a format string: the `{period}` placeholder
# must be filled with the target year through `question.format(period=...)` before use.
question = """Answer the following in three parts:\n\nTop 5 deep technologies by economic impact (for the year {period}).\nRank from highest to lowest impact.\nFor each: name the tech, estimate the percentage of economic impact (e.g., cost savings, productivity gains) the technology will have, with 100% indicating a strong effect and 0% indicating no effect, explain briefly.\n\nTop 5 deep technologies by societal impact (same period).\nRank from highest to lowest.\nFor each: name the tech, describe key social changes (e.g., health, privacy, ethics), explain briefly.\n\nTop 5 emerging deep technologies.\nRank from most to least likely to become impactful in the year {period}.\nFor each: name the tech and briefly state why it’s emerging.\n\nOverlap between lists is allowed."""
# Preview of the question once formatted for a given year.
print(question.format(period=2021))

# Idea 0

In [None]:
# Idea 0 (no retrieval): gpt-2 predicts for 2021, gpt-3.5 turbo for 2023 and gpt-4o for 2025;
# each of the first two predictions is then assessed by the next model of the chain.
# prompt 1 is for gpt-2 to get prediction 1.
# prompt 2 is for gpt-3.5 turbo to get assessment 1.
# prompt 3 is for gpt-3.5 turbo to get prediction 2.
# prompt 4 is for gpt-4o to get assessment 2.
# prompt 5 is for gpt-4o to get prediction 3.

# Make sure the output folder exists before any (paid) API call is made, so that the
# final write cannot fail with FileNotFoundError once all the answers were generated.
os.makedirs(RESULT_DIR, exist_ok=True)

prompt_1 = get_formatted_prompt(role + " " + instruction_1_0, context=None, question=question.format(period=2021), RAG=False)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# keep only what follows the first "Answer: " marker (8 characters) of the returned text
prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_2},
    ],
).choices[0].message.content

prompt_3 = get_formatted_prompt(role + " " + instruction_1_0, context=None, question=question.format(period=2023), RAG=False)
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_4},
    ],
).choices[0].message.content

prompt_5 = get_formatted_prompt(role + " " + instruction_1_0, context=None, question=question.format(period=2025), RAG=False)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

# gather the three predictions and the two assessments in a single JSON file
result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

with open(os.path.join(RESULT_DIR, "results_idea0.json"), "w") as f:
  json.dump(result, f)

# Idea 1


In [None]:
# Idea 1 (RAG on per-period document sets): the context of each prediction is retrieved
# from the 2019_2020, 2021_2022 and 2023_2024 files for the years 2021, 2023 and 2025.
# prompt 1 is for gpt-2 to get prediction 1.
# prompt 2 is for gpt-3.5 turbo to get assessment 1.
# prompt 3 is for gpt-3.5 turbo to get prediction 2.
# prompt 4 is for gpt-4o to get assessment 2.
# prompt 5 is for gpt-4o to get prediction 3.

# Make sure the output folder exists before any (paid) API call is made, so that the
# final write cannot fail with FileNotFoundError once all the answers were generated.
os.makedirs(RESULT_DIR, exist_ok=True)

# The question is formatted BEFORE retrieval: querying with the raw template would embed
# the literal "{period}" placeholder instead of the year the prediction is about.
question_2021 = question.format(period=2021)
question_2023 = question.format(period=2023)
question_2025 = question.format(period=2025)

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2019_2020.jsonl", "sentence_transformer_embeddings_2019_2020.npy")
relevant_chunks = retrieve_relevant_chunks(question_2021, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_1 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2021, RAG=True)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# keep only what follows the first "Answer: " marker (8 characters) of the returned text
prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_2},
    ],
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2021_2022.jsonl", "sentence_transformer_embeddings_2021_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_2023, index, chunks, question_encoder, topk=TOP_K_SECOND_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2023, RAG=True)
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_4},
    ],
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_2023_2024.jsonl", "sentence_transformer_embeddings_2023_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_THIRD_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

# gather the three predictions and the two assessments in a single JSON file
result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

with open(os.path.join(RESULT_DIR, "results_idea1.json"), "w") as f:
  json.dump(result, f)

# Idea 2

In [None]:
# Idea 2 (RAG on the "past_<year>" document sets): the context of each prediction is retrieved
# from the past_2020, past_2022 and past_2024 files for the years 2021, 2023 and 2025.
# prompt 1 is for gpt-2 to get prediction 1.
# prompt 2 is for gpt-3.5 turbo to get assessment 1.
# prompt 3 is for gpt-3.5 turbo to get prediction 2.
# prompt 4 is for gpt-4o to get assessment 2.
# prompt 5 is for gpt-4o to get prediction 3.

# Make sure the output folder exists before any (paid) API call is made, so that the
# final write cannot fail with FileNotFoundError once all the answers were generated.
os.makedirs(RESULT_DIR, exist_ok=True)

# The question is formatted BEFORE retrieval: querying with the raw template would embed
# the literal "{period}" placeholder instead of the year the prediction is about.
question_2021 = question.format(period=2021)
question_2023 = question.format(period=2023)
question_2025 = question.format(period=2025)

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2020.jsonl", "sentence_transformer_embeddings_past_2020.npy")
relevant_chunks = retrieve_relevant_chunks(question_2021, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_1 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2021, RAG=True)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# keep only what follows the first "Answer: " marker (8 characters) of the returned text
prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_2},
    ],
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2022.jsonl", "sentence_transformer_embeddings_past_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_2023, index, chunks, question_encoder, topk=TOP_K_SECOND_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2023, RAG=True)
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_4},
    ],
).choices[0].message.content

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2024.jsonl", "sentence_transformer_embeddings_past_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_THIRD_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

# gather the three predictions and the two assessments in a single JSON file
result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

with open(os.path.join(RESULT_DIR, "results_idea2.json"), "w") as f:
  json.dump(result, f)

# Idea 3

In [None]:
# Idea 3 (same target year for every model): the three models all predict for 2025
# with context retrieved from the past_2024 files; only the number of chunks differs.
# prompt 1 is for gpt-2 to get prediction 1.
# prompt 2 is for gpt-3.5 turbo to get assessment 1.
# prompt 3 is for gpt-3.5 turbo to get prediction 2.
# prompt 4 is for gpt-4o to get assessment 2.
# prompt 5 is for gpt-4o to get prediction 3.

# Make sure the output folder exists before any (paid) API call is made, so that the
# final write cannot fail with FileNotFoundError once all the answers were generated.
os.makedirs(RESULT_DIR, exist_ok=True)

# The question is formatted BEFORE retrieval: querying with the raw template would embed
# the literal "{period}" placeholder instead of the year the prediction is about.
question_2025 = question.format(period=2025)

# The same index/chunks files serve the three retrievals, so they are loaded only once.
# NOTE(review): assumes retrieve_relevant_chunks does not modify `index` or `chunks` - confirm in utils.
index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2024.jsonl", "sentence_transformer_embeddings_past_2024.npy")

relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_1 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# keep only what follows the first "Answer: " marker (8 characters) of the returned text
prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

prompt_2 = instruction_1_2.format(2025, prediction_1)
assessment_1 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_2},
    ],
).choices[0].message.content

relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_SECOND_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_2 = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": prompt_3},
    ],
).choices[0].message.content

prompt_4 = instruction_1_2.format(2025, prediction_2)
assessment_2 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": role},
        {"role": "user", "content": prompt_4},
    ],
).choices[0].message.content

relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_THIRD_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_3 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt_5},
    ],
).choices[0].message.content

# gather the three predictions and the two assessments in a single JSON file
result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

with open(os.path.join(RESULT_DIR, "results_idea3.json"), "w") as f:
  json.dump(result, f)

# Idea 4

In [None]:
# Idea 4: only gpt-2 is used for all predictions and assessments.
# Predictions use context retrieved from the past_2020, past_2022 and past_2024 files
# for the years 2021, 2023 and 2025, always with TOP_K_FIRST_MODEL chunks.

# Make sure the output folder exists up front, so that the final write cannot fail
# with FileNotFoundError once all the answers were generated.
os.makedirs(RESULT_DIR, exist_ok=True)

# The question is formatted BEFORE retrieval: querying with the raw template would embed
# the literal "{period}" placeholder instead of the year the prediction is about.
question_2021 = question.format(period=2021)
question_2023 = question.format(period=2023)
question_2025 = question.format(period=2025)

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2020.jsonl", "sentence_transformer_embeddings_past_2020.npy")
relevant_chunks = retrieve_relevant_chunks(question_2021, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_1 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2021, RAG=True)
prediction_1 = generate_answer(prompt_1, tokenizer, model, device)
# keep only what follows the first "Answer: " marker (8 characters) of the returned text
prediction_1 = prediction_1[prediction_1.index("Answer: ")+8:].strip(' ')

prompt_2 = instruction_1_2.format(2021, prediction_1)
assessment_1 = generate_answer(role + "\n" + prompt_2, tokenizer, model, device)
# keep only what follows the first "Assessment: " marker (12 characters) of the returned text
assessment_1 = assessment_1[assessment_1.index("Assessment: ")+12:].strip(' ')

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2022.jsonl", "sentence_transformer_embeddings_past_2022.npy")
relevant_chunks = retrieve_relevant_chunks(question_2023, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_3 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2023, RAG=True)
prediction_2 = generate_answer(prompt_3, tokenizer, model, device)
prediction_2 = prediction_2[prediction_2.index("Answer: ")+8:].strip(' ')

prompt_4 = instruction_1_2.format(2023, prediction_2)
assessment_2 = generate_answer(role + "\n" + prompt_4, tokenizer, model, device)
assessment_2 = assessment_2[assessment_2.index("Assessment: ")+12:].strip(' ')

index, chunks = get_index_and_chunks("sentence_transformer_chunks_past_2024.jsonl", "sentence_transformer_embeddings_past_2024.npy")
relevant_chunks = retrieve_relevant_chunks(question_2025, index, chunks, question_encoder, topk=TOP_K_FIRST_MODEL, normalize=True)
context = "\n".join(relevant_chunks)
prompt_5 = get_formatted_prompt(role + " " + instruction_1_1, context, question_2025, RAG=True)
prediction_3 = generate_answer(prompt_5, tokenizer, model, device)
prediction_3 = prediction_3[prediction_3.index("Answer: ")+8:].strip(' ')

# gather the three predictions and the two assessments in a single JSON file
result = {
    "prediction_1": prediction_1,
    "assessment_1": assessment_1,
    "prediction_2": prediction_2,
    "assessment_2": assessment_2,
    "prediction_3": prediction_3
}

with open(os.path.join(RESULT_DIR, "results_idea4.json"), "w") as f:
  json.dump(result, f)