In [13]:
import os
import json
from typing import List

import shutil
# Load environment
from dotenv import load_dotenv
# OpenAI model
import openai
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chat_models import init_chat_model
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language
from langchain_openai import OpenAIEmbeddings  
# Embedding model
from langchain.document_loaders import TextLoader
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [14]:
# Directory holding the Android permission JSON files (one file per permission).
# NOTE(review): absolute, machine-specific Windows path — it must exist on the
# machine running this notebook.
android_permission_directory = r"C:\Users\ASUS\anaconda3-project-code\student-rag-example\android-permission"
# Directory where the FAISS vector DB built from the permission files is stored.
android_permission_vectordb_directory = r"C:\Users\ASUS\anaconda3-project-code\student-rag-example\android-permission-vectorDB"

In [15]:
# Directory for the Android data-safety source files.
# NOTE(review): "andoird" looks like a misspelling of "android" in both the
# variable names and the folder names; presumably the folders on disk use the
# same spelling — confirm before renaming anything.
andoird_datasafety_directory = r"C:\Users\ASUS\anaconda3-project-code\student-rag-example\andoird-datasafety"
# Directory for the Android data-safety FAISS vector DB.
andoird_datasafety_vectordb_directory = r"C:\Users\ASUS\anaconda3-project-code\student-rag-example\andoird-datasafety-vectorDB"

In [16]:
# Model setup: pull settings from the .env file into the process environment,
# then build the chat model from the OpenAI key found there.
load_dotenv()

api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail fast here instead of erroring on the first model call.
    raise ValueError("OPENAI_API_KEY is not set in the environment variables")

# Chat model instance; the key is handed over explicitly.
llm = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=api_key)

In [17]:
def read_json_documents_from_directory(directory_path: str) -> List[Document]:
    """
    Load every ``.json`` file directly inside ``directory_path`` as a LangChain Document.

    Files are visited in sorted filename order so the returned list (and any
    index built from it) is the same on every run; ``os.listdir`` on its own
    makes no ordering guarantee.

    Each Document has:
      - page_content: the parsed JSON re-serialized as an indented string, with
        non-ASCII characters kept readable instead of being escaped to ASCII
      - metadata: ``{"source": <directory_path joined with the filename>}``

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        One Document per JSON file that could be read and parsed.

    Raises:
        OSError: If ``directory_path`` cannot be listed (e.g. it does not exist).

    A file that fails to load is reported with ``print`` and skipped, so one
    bad file does not abort the rest of the load.
    """
    documents = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            # ensure_ascii=False keeps the original text (accents, CJK, ...)
            # rather than backslash-u escapes, which would otherwise end up in
            # the text that later gets chunked and embedded.
            content = json.dumps(json_data, indent=2, ensure_ascii=False)
            documents.append(
                Document(page_content=content, metadata={"source": file_path})
            )
        except Exception as e:
            # Deliberate best-effort: report the problem file and keep going.
            print(f"Error reading {file_path}: {e}")
    return documents
def index_json_documents_to_faiss(documents: List[Document],faiss_index_path: str,index_name: str = "index"):
    """
    Embed LangChain documents and store them in an on-disk FAISS index.

    The documents are split into chunks (chunk_size=1000, chunk_overlap=100)
    and embedded with ``OpenAIEmbeddings``. If ``<index_name>.faiss`` already
    exists in ``faiss_index_path`` the index is loaded and only chunks that are
    not already stored are appended; otherwise a new index is created.

    A chunk counts as "already stored" when an existing entry has the same
    ``metadata["source"]`` and the same ``page_content``. This keeps re-running
    the same call from duplicating entries (and from re-embedding them). Chunks
    left over from a file whose content has since changed are not removed.

    Args:
        documents: Documents to index. An empty list is a no-op.
        faiss_index_path: Folder the index is loaded from / saved to.
        index_name: Base name of the index files inside that folder.

    Returns:
        None. Progress and the final entry count are reported with ``print``.
    """
    if not documents:
        print("No documents to index.")
        return

    # Split documents into smaller chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(documents)

    if len(chunks) == 0:
        print("No valid chunks generated.")
        return

    def chunk_key(doc):
        # Identity used to recognise a chunk that is already in the index.
        return (doc.metadata.get("source"), doc.page_content)

    embeddings = OpenAIEmbeddings()

    faiss_file = os.path.join(faiss_index_path, f"{index_name}.faiss")
    if os.path.exists(faiss_file):
        # NOTE(security): load_local unpickles the stored docstore, which can
        # execute arbitrary code. Only load index folders written by this
        # function; never point this at an untrusted directory.
        vectorstore = FAISS.load_local(
            folder_path=faiss_index_path,
            index_name=index_name,
            embeddings=embeddings,
            allow_dangerous_deserialization=True
        )
        already_indexed = {chunk_key(doc) for doc in vectorstore.docstore._dict.values()}
        new_chunks = [chunk for chunk in chunks if chunk_key(chunk) not in already_indexed]
        if new_chunks:
            vectorstore.add_documents(new_chunks)
            print("Appended documents to existing FAISS index.")
        else:
            print("No new documents to append; FAISS index unchanged.")
    else:
        vectorstore = FAISS.from_documents(chunks, embeddings)
        new_chunks = chunks
        print("Created new FAISS index.")

    # Nothing was added on a pure re-run, so there is nothing to write back.
    if new_chunks:
        vectorstore.save_local(faiss_index_path, index_name=index_name)
    print(f"Total documents in index: {len(vectorstore.docstore._dict)}")

In [18]:
# Load every Android permission JSON file from disk as a Document.
docs = read_json_documents_from_directory(
    directory_path=android_permission_directory
)
# NOTE(review): this prints the full repr of every loaded document, which makes
# the cell output very long; a count or a short preview may be enough.
print(docs)

[Document(metadata={'source': 'C:\\Users\\ASUS\\anaconda3-project-code\\student-rag-example\\android-permission\\ACCESS_COARSE_LOCATION.json'}, page_content='{\n  "permission_name": "ACCESS_COARSE_LOCATION",\n  "permission_describe": "Allows an app to access approximate location",\n  "permission_level": "dangerous",\n  "constant Value": "android.permission.ACCESS_COARSE_LOCATION"\n}'), Document(metadata={'source': 'C:\\Users\\ASUS\\anaconda3-project-code\\student-rag-example\\android-permission\\ACCESS_FINE_LOCATION.json'}, page_content='{\n  "permission_name": "ACCESS_FINE_LOCATION",\n  "permission_describe": "Allows an app to access precise location. Alternatively, you might want ACCESS_COARSE_LOCATION",\n  "permission_level": "dangerous",\n  "constant Value": "android.permission.ACCESS_FINE_LOCATION"\n}'), Document(metadata={'source': 'C:\\Users\\ASUS\\anaconda3-project-code\\student-rag-example\\android-permission\\ACCESS_MEDIA_LOCATION.json'}, page_content='{\n  "permission_name":

In [19]:
# Embed the loaded permission documents and store them in the FAISS index
# named "permissions" inside the permission vector-DB directory.
index_json_documents_to_faiss(
    documents=docs,
    faiss_index_path=android_permission_vectordb_directory,
    index_name="permissions",
)

Created new FAISS index.
Total documents in index: 3
