In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import json
import re

import openai

import langchain

import langchain.document_loaders

from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil

from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [3]:
# read from config.ini file

import configparser

config = configparser.ConfigParser()

config.read('../../config/config.ini')

SECRETS = config['SECRETS']

# set openai api key

os.environ['OPENAI_API_KEY'] = SECRETS['openapi_key']

In [4]:
def get_chunks(file_path):
    
    loader = PyPDFLoader(file_path)
    
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)

    return chunks

In [5]:
def get_vectordb(chunks, CHROMA_PATH):

    CHROMA_PATH = f"../../data/chroma/{CHROMA_PATH}"

    if os.path.exists(CHROMA_PATH):
        db = Chroma(persist_directory=CHROMA_PATH, embedding_function=OpenAIEmbeddings())

    else:
        db = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
        )

        db.persist()

        print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    
    return db

In [6]:
def gen_sample(chunk, db):

    PROMPT_TEMPLATE = """
    Answer the question based only on the following context:

    {context}

    ---

    Answer the question based on the above context: {question}
    """

    query_text = f"""

    Classify whether the given chunk involves a decision that will effect the story or not.

    A decision is defined as when the character goes about making a choice between two or more options. 
    The decision should be significant enough to affect the story in a major way.
    It doesn't really involve emotions, feelings or thoughts, but what the character does, or what happens to them.
    This involes interactions between characters, or the character and the environment.
    What isn't a decision is chunks describing the setting, or the character's thoughts or feelings.

    Generate response in a JSON with the following keys: ["decision", "text", "description"]

    decision: "yes"/"no"
    text: the chunk being passed in
    description: what the decision is

    ```{chunk.page_content}```

    """

    results = db.similarity_search_with_relevance_scores(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    return eval(response_text)


In [7]:
base_path = "../../data/pdf"

samples = []

pdfs = os.listdir(base_path)

pdf_path = f"{base_path}/{pdfs[0]}"

from pathlib import Path

Path(pdf_path).stem

'dune'

In [8]:
from tqdm import tqdm

import random

base_path = "../../data/pdf"

samples = []

pdfs = os.listdir(base_path)

for pdf in pdfs:

    print(f"Processing {pdf}... {pdfs.index(pdf)}/{len(pdfs)}")
    
    pdf_path = f"{base_path}/{pdf}"

    CHROMA_PATH = pdf_path.split("/")[-1].split(".")[0]

    chunks = get_chunks(pdf_path)

    # print(f"Found {len(chunks)} chunks in {pdf_path}.")

    db = get_vectordb(chunks, CHROMA_PATH)

    try:

        chunks = random.sample(chunks, 1000)
    
    except:
        chunks = chunks

    for chunk in tqdm(chunks):

        try:

            sample = gen_sample(chunk, db)

            sample['source'] = Path(pdf_path).stem

            samples.append(sample)
        except:
            continue
    
    print(f"\n{'-----'*50}\n")

Processing dune.pdf... 0/10


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
100%|██████████| 715/715 [1:14:18<00:00,  6.24s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing eg.pdf... 1/10


100%|██████████| 1000/1000 [1:51:53<00:00,  6.71s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing gr.pdf... 2/10


100%|██████████| 1000/1000 [2:13:51<00:00,  8.03s/it] 



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing in_mb.pdf... 3/10


100%|██████████| 1000/1000 [1:48:51<00:00,  6.53s/it] 



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing in_rm.pdf... 4/10


100%|██████████| 1000/1000 [1:56:37<00:00,  7.00s/it] 



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing in_rm_2.pdf... 5/10


100%|██████████| 1000/1000 [1:23:40<00:00,  5.02s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing kgf.pdf... 6/10


100%|██████████| 15/15 [01:52<00:00,  7.49s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing khaleja.pdf... 7/10


100%|██████████| 397/397 [33:16<00:00,  5.03s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing nr.pdf... 8/10


100%|██████████| 1000/1000 [1:30:23<00:00,  5.42s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Processing pokiri.pdf... 9/10


100%|██████████| 297/297 [24:06<00:00,  4.87s/it]



----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



In [12]:
len(samples)

6272

In [13]:
import pandas as pd

dec_df = pd.DataFrame(samples)

In [14]:
dec_df.head()

Unnamed: 0,decision,text,description,source
0,no,"massive mining vehicle, a HARVESTER, kicking u...",This chunk describes the setting and actions o...,dune
1,no,being hauled by a powerful CARRYALL. ON THE GR...,This chunk describes the setting and actions o...,dune
2,no,SPICE VISION: EXT. ARRAKIS - DESERT - DAY 95C ...,This chunk describes a vision of the future an...,dune
3,no,"strange MISSLE LAUNCHER, one of multiple cloth...",This chunk describes the setting and the chara...,dune
4,no,Flickering Fremen PLASMA LASERS lance up at th...,This chunk describes action scenes involving t...,dune


In [15]:
dec_df['decision'].value_counts()

decision
no     5132
yes    1140
Name: count, dtype: int64

In [16]:
dec_df.to_csv("../../data/output/decisions_new.csv", index=False)