Assignment 3

# Data Preparation - Embedding

* Arxiv AI papers 


## Task : Embedding on Arxiv AI papers

#### Setup Azure OpenAI - "2023-09-15-preview" This version does not support ChatCompletion, but for Embedding it suports

In [48]:
import os
import openai
from dotenv import load_dotenv

# Set up Azure OpenAI
load_dotenv()

# openai.api_type = "azure"
# openai.api_base = "" # Api base is the 'Endpoint' which can be found in Azure Portal where Azure OpenAI is created. It looks like https://xxxxxx.openai.azure.com/
# openai.api_version = "2023-07-01-preview"
# openai.api_key = "" # Or os.getenv("OPENAI_API_KEY") using local .env file. For more details, please see https://github.com/theskumar/python-dotenv

openai.api_type = "azure"
# openai.api_version = "2023-07-01-preview"
openai.api_version = "2023-09-15-preview"
API_KEY = os.getenv("OPENAI_API_KEY","").strip()
assert API_KEY, "ERROR: Azure OpenAI Key is missing"
openai.api_key = API_KEY
RESOURCE_ENDPOINT = os.getenv("OPENAI_API_ENDPOINT","").strip()
assert RESOURCE_ENDPOINT, "ERROR: Azure OpenAI Endpoint is missing"
assert "openai.azure.com" in RESOURCE_ENDPOINT.lower(), "ERROR: Azure OpenAI Endpoint should be in the form: \n\n\t<your unique endpoint identifier>.openai.azure.com"
openai.api_base = RESOURCE_ENDPOINT
# Deployment for embedding
DEPLOYMENT_NAME_EMBEDDING = os.getenv('DEPLOYMENT_NAME_EMBEDDING')

#### Embedding

In [49]:
def add_embedding(df):
    df['embedding'] = ''

    for i in range(len(df)):
        try:
            # Do embedding on page_content
            embedding = openai.Embedding().create(input=df['page_content'][i], deployment_id=DEPLOYMENT_NAME_EMBEDDING)
            # Assign embedding result to the 'embedding' column in the DataFrame
            embedding_value = embedding['data'][0]['embedding']
            if not isinstance(embedding_value, str):
                # Handle the case if the value is not a string, convert to string explicitly
                embedding_value = str(embedding_value)
            df.loc[i, 'embedding'] = embedding_value
        except Exception as err:
            print(f"Error: Index={i} {err=}, {type(err)=}")
    return df


#### Load AI documents which are previously processed from PDF to JSON

In [50]:
import pandas as pd

# Ask the user to provide the folder path for JSON files
# json_folder_path = input('Enter the path to the folder containing JSON files: ')

json_folder_path = './data_source/arxiv.org/AI/formrecognized_output'

# Check if the provided path exists
if not os.path.exists(json_folder_path):
    print(f'The folder path "{json_folder_path}" does not exist.')
else:
    # Get the parent directory of the JSON folder
    parent_folder = os.path.dirname(json_folder_path)

    # Create a new folder path one level above the parent folder
    embedding_folder_path = os.path.join(parent_folder, 'embedding_output')

    # Check if the Embedding folder path exists
    if not os.path.exists(embedding_folder_path):
        # If it doesn't exist, create the new folder
        os.makedirs(embedding_folder_path)
        print(f'Embedding folder "{embedding_folder_path}" created.')
    else:
        # If it exists, print a message
        print(f'Embedding folder "{embedding_folder_path}" already exists.')

    # Loop through the files in the folder
    for filename in os.listdir(json_folder_path):
        # Check if the file is a JSON file
        if filename.lower().endswith('.json'):
            # If it's a JSON file, print the file name and read its content
            file_path = os.path.join(json_folder_path, filename)
            print(f'Found JSON file: {filename}, Full Path: {file_path}')

            df_orig = pd.read_json(file_path, orient='records')
            df = df_orig.copy()

            # 1. Add embedding into 
            df_with_embedding = add_embedding(df)
            print(df_with_embedding)

            # 2. Save the data to a .csv file
            # Find the position of '_formrecognized'
            formrecognized_index = filename.find('_formrecognized')
            # Extract the substring before '_formrecognized'
            embedding_filename = filename[:formrecognized_index] + '_embedding.csv'
            embedding_file_full_path = os.path.join(embedding_folder_path, embedding_filename)
            # Write the DataFrame to a CSV file
            df.to_csv(embedding_file_full_path, sep='\t', index=False)
            print(f"DataFrame data has been saved to {embedding_file_full_path}")
                
        # Break out of the loop after processing the first PDF file - testing only
        # break
    print('Finished processing JSON files.')
print("Summarization batch process is done! <<<")

Embedding folder "./data_source/arxiv.org/AI\embedding_output" already exists.
Found JSON file: 2311.05227.pdf_formrecognized.json, Full Path: ./data_source/arxiv.org/AI/formrecognized_output\2311.05227.pdf_formrecognized.json
         filename  page_number  \
0  2311.05227.pdf            1   
1  2311.05227.pdf            2   
2  2311.05227.pdf            3   
3  2311.05227.pdf            4   
4  2311.05227.pdf            5   
5  2311.05227.pdf            6   
6  2311.05227.pdf            7   
7  2311.05227.pdf            8   
8  2311.05227.pdf            9   

                                        page_content  \
0  Kantian Deontology Meets AI Alignment: Towards...   
1  In the AI alignment field, the challenge relie...   
2  The second formulation, Formula of Humanity, h...   
3  157 Korsgaard (2018)). Moreover, there is a pr...   
4  Definition 3.4. (Equal Treatment (ET)). A mode...   
5  4.2 Utilitarian AI Fairness Metrics In this se...   
6  With the Kantian approach there is a 

In [51]:
# Show first 5 rows with embeddings
df.head(50)

Unnamed: 0,filename,page_number,page_content,embedding
0,2311.05227.pdf,1,Kantian Deontology Meets AI Alignment: Towards...,"[-0.012198702432215214, -0.031920600682497025,..."
1,2311.05227.pdf,2,"In the AI alignment field, the challenge relie...","[-0.004445330705493689, -0.02816370688378811, ..."
2,2311.05227.pdf,3,"The second formulation, Formula of Humanity, h...","[0.0024883956648409367, -3.8212587242014706e-0..."
3,2311.05227.pdf,4,"157 Korsgaard (2018)). Moreover, there is a pr...","[0.0017101240810006857, -0.010083627887070179,..."
4,2311.05227.pdf,5,Definition 3.4. (Equal Treatment (ET)). A mode...,"[-0.011129945516586304, -0.03496641293168068, ..."
5,2311.05227.pdf,6,4.2 Utilitarian AI Fairness Metrics In this se...,"[-0.004276075400412083, -0.017503313720226288,..."
6,2311.05227.pdf,7,With the Kantian approach there is a set of pr...,"[-0.005923197604715824, -0.014440690167248249,..."
7,2311.05227.pdf,8,"Fazelpour, S. and Lipton, Z. C. (2020). Algori...","[-0.01414605975151062, -0.02144012227654457, 0..."
8,2311.05227.pdf,9,"Posner, R. A. (1979). Utilitarianism, economic...","[0.004520314745604992, -0.03532026335597038, 0..."
