In [122]:
#!pip install langchain
#!pip install wikipedia
#!pip install llama-index
#!pip install openai
#!pip install neo4j
#!pip install PyPDFLoader
#!pip install pypdf
#!pip install spacy-llm
#!pip install --upgrade jupyter ipywidgets
#!pip install -q llama-index google-generativeai
#!pip install spacy-llm
#!python -m spacy download en_core_web_md


In [124]:
import openai
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Resolve the OpenAI API key without hardcoding it in the notebook.
# An OPENAI_API_KEY already present in the environment is used as-is;
# otherwise the key is requested interactively (input is not echoed).
from getpass import getpass

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Expose the key both to the `openai` client and to anything reading the env var.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY

In [126]:
# Load the Wikipedia pages matching the query "Ordenamiento territorial"
# NOTE(review): no `lang` argument is passed although the query is Spanish, so the
# loader's default language edition is searched - confirm whether lang="es" is intended.
raw_documents = WikipediaLoader(query="Ordenamiento territorial").load()

# Define chunking strategy (chunk size and overlap are measured with the tiktoken encoder)
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=20
)

# Chunk the documents
documents = text_splitter.split_documents(raw_documents)

# Drop the "summary" entry from every chunk's metadata.
# pop() with a default (instead of `del`) does not raise KeyError when a
# document has no "summary" key.
for d in documents:
    d.metadata.pop("summary", None)


In [127]:
# Directory containing your PDF files
directory_path = './data'

# One PyPDFLoader per PDF file in the directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the document order (and everything derived from it) reproducible across runs.
# The extension check is case-insensitive so "X.PDF" is picked up as well.
loaders = [
    PyPDFLoader(os.path.join(directory_path, f))
    for f in sorted(os.listdir(directory_path))
    if f.lower().endswith('.pdf')
]

# Load documents from PDFs into one flat list
news_docs = []
for loader in loaders:
    news_docs.extend(loader.load())

In [128]:
# Re-wrap every entry of news_docs as a new Document that carries the same
# page content and a single metadata field: the source path with the leading
# './data/' stripped.
news_articles_data = []
for doc in news_docs:
    trimmed_source = doc.metadata['source'].removeprefix('./data/')
    news_articles_data.append(
        Document(
            page_content=doc.page_content,
            metadata={"source": trimmed_source},
        )
    )

In [None]:
# Initialize the text splitter
# NOTE(review): `rtext_splitter` is not used anywhere in this cell; kept in case another cell needs it.
rtext_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Initialize LLM
#model = Gemini(temperature=0.2, model="gemini-pro")
# The API key is read from the environment so this cell does not depend on a
# notebook-level variable being defined elsewhere.
model = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo-0125",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
)

# Define the map prompt template (applied to each chunk individually)
map_template = """The following is a set of documents
{all_text_data}
Basado en la lista de documentos, por favor realiza resúmenes concisos mientras
extraes las relaciones esenciales para el análisis de relaciones posterior. Es importante
incluir las fechas de acciones o eventos, ya que son cruciales para el análisis de la
línea de tiempo posterior. Ejemplo: 'La junta de planificación territorial de Mendoza
aprueba la primera versión del Plan Provincial de Ordenamiento Territorial el 15/03/2014 (15 de marzo, sábado)',
lo que no solo muestra la relación entre la junta de planificación y el Plan Provincial de Ordenamiento Territorial,
sino también cuándo sucede."""


map_prompt = PromptTemplate.from_template(map_template)
print(map_prompt)
# Define the map_chain
map_chain = LLMChain(llm=model, prompt=map_prompt)

# Documents to summarise
all_data = news_articles_data
# Plain-text view of the same documents (for inspection only; the chain below
# consumes the Document objects, not these strings)
all_text_data = [doc.page_content for doc in all_data]

# Reduce prompt template (applied to the collected map outputs)
reduce_template = """The following is set of summaries:
{all_text_data}
Take these and distill them into concise summaries, capturing the essence of the conflict, key arguments, socio-technical controversies, and the relationship between various territorial projects discussed between 2012 and 2017. Include significant events, actors involved, and their impacts on the territorial planning policy. Example: "During the development of the Provincial Territorial Planning Plan from 2012 to 2017, socio-technical controversies emerged, notably around water resource management, involving diverse actors like local communities, agricultural sectors, and mining companies, leading to a reevaluation of the territory's development model by 2017."
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Reduce chain
reduce_chain = LLMChain(llm=model, prompt=reduce_prompt)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="all_text_data"  # This should match the variable name in reduce_prompt
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="all_text_data",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
# split_documents() operates on Document objects, so it is given `all_data`
# rather than the list of raw strings.
split_docs = text_splitter.split_documents(all_data)

# Run the MapReduce Chain
summarization_results = map_reduce_chain.run(split_docs)

In [None]:
# Persist the summary to disk. The encoding is fixed to UTF-8 so that non-ASCII
# characters are written the same way regardless of the platform's default encoding.
with open('summary.txt', 'w', encoding='utf-8') as file:
    file.write(str(summarization_results))

In [None]:
# Sentence segmentation with a regular spaCy pipeline
def split_document_sent(text, model="en_core_web_sm"):
    """Split ``text`` into a list of stripped, non-empty sentence strings.

    Args:
        text: The text to segment. ``None``, empty or whitespace-only input
            yields an empty list without loading any model.
        model: Name of the spaCy pipeline passed to ``spacy.load``. Defaults to
            ``"en_core_web_sm"``, the name this function has always loaded.
            NOTE(review): the notebook's setup cell downloads ``en_core_web_md``;
            confirm which package is actually installed.

    Returns:
        list[str]: One entry per sentence found by the pipeline, with
        surrounding whitespace removed and empty results dropped.
    """
    # Nothing to segment: avoid loading the model and avoid emitting empty sentences.
    if not text or not text.strip():
        return []

    nlp = spacy.load(model)
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    # Skip sentences that are empty after stripping so callers never process "".
    return [sentence for sentence in stripped if sentence]

# spacy-llm relationship extraction
def process_text(nlp, text, verbose=False):
    """Apply ``nlp`` to ``text`` and return the result.

    When ``verbose`` is true, the text, its entities and its relations
    (read from ``doc._.rel``) are also reported through ``msg.text``.
    """
    doc = nlp(text)

    # Quiet mode: nothing to report, hand the result straight back.
    if not verbose:
        return doc

    msg.text(f"Text: {doc.text}")
    entity_pairs = [(entity.text, entity.label_) for entity in doc.ents]
    msg.text(f"Entities: {entity_pairs}")
    msg.text("Relations:")
    for relation in doc._.rel:
        # dep / dest are indices into doc.ents
        head = doc.ents[relation.dep]
        tail = doc.ents[relation.dest]
        msg.text(f"  - {head} [{relation.relation}] {tail}")

    return doc

def run_pipeline(config_path, examples_path=None, verbose=False, text=None,
                 output_path='processed_data.json'):
    """Extract entities and relations sentence by sentence and export them as JSON.

    Args:
        config_path: Config file handed to ``assemble`` to build the pipeline.
        examples_path: Optional examples file; when given it is passed to
            ``assemble`` as the ``paths.examples`` override.
        verbose: Forwarded to ``process_text`` to log each processed sentence.
        text: Text to analyse. When ``None`` (the default) the notebook-level
            ``summarization_results`` variable is used, as before.
        output_path: Destination of the JSON export. Defaults to
            ``'processed_data.json'``.

    Returns:
        list[dict]: One dict per sentence with the keys ``'text'``,
        ``'entities'`` (``(text, label)`` tuples) and ``'relations'``
        (``(source_text, relation, target_text)`` tuples). The same data is
        written to ``output_path``.

    Raises:
        NameError: If ``text`` is ``None`` and ``summarization_results`` has not
            been defined in the notebook.

    If the ``OPENAI_API_KEY`` environment variable is missing or empty,
    ``msg.fail`` is called with ``exits=1``.
    """
    if not os.getenv("OPENAI_API_KEY"):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    # Resolve the input before building the pipeline so a missing summary fails fast.
    if text is None:
        text = summarization_results

    overrides = {} if examples_path is None else {"paths.examples": str(examples_path)}
    nlp = assemble(config_path, overrides=overrides)

    # Initialize counters and storage
    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for sent in split_document_sent(text):
        doc = process_text(nlp, sent, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        # r.dep / r.dest index into doc.ents
        relations = [(doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text) for r in doc._.rel]

        # Store processed data
        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        # Update counters: entities are tallied by label, relations by relation name
        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    # Hand the data back so callers do not have to re-read the JSON file.
    return processed_data

# Set your configuration paths and flags
config_path = Path("zeroshot.cfg")
examples_path = None  # path to a few-shot examples file, or None if not using few-shot
verbose = True

# Run the pipeline; `examples_path` is passed through so that changing the
# setting above actually takes effect.
file = run_pipeline(config_path, examples_path, verbose)