In [7]:
import re
import os
import fitz
from docx import Document


def clean_text(text):
    """Normalise a string for vectorisation.

    The input is lowercased, then every character that is not an ASCII
    letter, an ASCII digit, or whitespace is deleted (punctuation,
    underscores and accented letters are all dropped). Whitespace,
    including newlines, is left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with the disallowed characters removed.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", "", lowered)

def read_file(doc_path):
    """Return the text content of a ``.txt``, ``.pdf`` or ``.docx`` file.

    The reader is chosen from the file extension, compared
    case-insensitively.

    Args:
        doc_path: Path to the document on disk.

    Returns:
        The extracted text as a single string. For PDFs the text of each
        page is concatenated in page order; for DOCX files the paragraph
        texts are joined with newlines.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = os.path.splitext(doc_path)[1].lower()

    # Plain text: read the whole file, decoded as UTF-8.
    if ext == ".txt":
        with open(doc_path, "r", encoding="utf-8") as handle:
            return handle.read()

    # PDF: gather the text of every page while the document is open.
    if ext == ".pdf":
        with fitz.open(doc_path) as pdf:
            return "".join(page.get_text() for page in pdf)

    # DOCX: one line per paragraph.
    if ext == ".docx":
        paragraphs = Document(doc_path).paragraphs
        return "\n".join(para.text for para in paragraphs)

    raise ValueError(f"Unsupported file type: {ext}")
    

# Document to classify; the relative path is resolved against the kernel's
# current working directory.
doc_path = "../data/sample_pdf.pdf"

# Extract the raw text; read_file picks the reader from the file extension.
raw_text = read_file(doc_path)

# Normalise with clean_text (lowercase, non-alphanumeric characters removed).
# Later cells read `new_text_clean` and `doc_path`.
new_text_clean = clean_text(raw_text)
# Last expression of the cell: preview the first 100 characters.
new_text_clean[:100]

'over the past year multinational corporations have undergone major \nstrategic transformations to ada'

In [8]:
import csv
import os

import joblib
from IPython.display import FileLink

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# artifacts that come from a trusted source.
# Presumably a fitted TF-IDF vectorizer (per the file name) — it turns the
# cleaned document into the feature row the classifiers consume.
vectorizer = joblib.load("tfidf_vectorizer.joblib")
vec_new_text = vectorizer.transform([new_text_clean])

# Display name -> persisted classifier.
models = {
    "Logistic Regression": joblib.load('../models/logistic_regression.pkl'),
    "SVM": joblib.load('../models/svm.pkl'),
    "Random Forest": joblib.load('../models/random_forest.pkl')
}

# Display name -> (predicted label, probability of that label).
results = {}

for name, model in models.items():
    pred = model.predict(vec_new_text)[0]
    class_probs = model.predict_proba(vec_new_text)[0]
    # Report the probability of the label predict() actually returned rather
    # than the maximum probability: for some estimators (e.g. SVC with
    # probability=True) the two can disagree, and the "confidence" would then
    # describe a different class than the one shown.
    prob = class_probs[list(model.classes_).index(pred)]
    results[name] = (pred, prob)


# Model whose prediction is used as the headline result and by later cells.
chosen_model = "SVM"
predicted_label, probability = results[chosen_model]

print(f"📄 Document: {doc_path}")
print(f"🧠 Predicted Label ({chosen_model}): {predicted_label} (Confidence: {probability:.2f})")

output_path = '../reports/classification_results.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Use the csv module so labels containing commas or quotes are escaped
# correctly; fix the encoding and line terminator so the report is identical
# across platforms.
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Model", "Predicted Label", "Confidence"])
    for name, (label, prob) in results.items():
        writer.writerow([name, label, f"{prob:.4f}"])

# Last expression of the cell: render a clickable link to the report.
FileLink(output_path)



📄 Document: ../data/sample_pdf.pdf
🧠 Predicted Label (SVM): business (Confidence: 0.88)


In [None]:
import getpass
import os

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate


# 🔑 Get API key: reuse OPENAI_API_KEY when it is already set in the
# environment, otherwise prompt for it once. The key is never written into
# the notebook, and an existing key is no longer overwritten by a placeholder.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Together API key: ")
# Point the OpenAI-compatible client at Together's endpoint.
os.environ["OPENAI_API_BASE"] = "https://api.together.xyz/v1"

# Setup LangChain LLM (using LLaMA 3 model on Together)
llm = ChatOpenAI(
    model="meta-llama/Llama-3-8b-chat-hf",
    temperature=0.7,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ["OPENAI_API_BASE"]
)
# Create prompt
prompt = PromptTemplate.from_template("""
You are a document analysis assistant.

Document:
{text}

Predicted Category: {category}

1. Generate a suitable, engaging title for this document.
2. Explain in 1-2 sentences **why this is a good title**.
3. Explain in 1-2 sentences **why this document fits the category "{category}"**.
""")

# Generate response
# Only the first 2000 characters of the cleaned text are sent, to bound the
# prompt size. `predicted_label` comes from the classification cell.
# NOTE(review): `new_text_clean` is lowercased with punctuation stripped;
# `raw_text` might give the model better material — confirm which is intended.
response = llm.invoke(prompt.format(text=new_text_clean[:2000], category=predicted_label))  # limit if very long

print("🎯 LangChain Output:\n")
print(response.content)


🎯 LangChain Output:

**Title:** "Adapting to the New Normal: The Evolution of Business Strategy in a Post-Pandemic World"

**Why this is a good title:** This title effectively captures the essence of the document, highlighting the theme of adaptation and transformation in the business world. It's also engaging and attention-grabbing, making it suitable for a business audience.

**Why this document fits the category "business":** This document is categorized as business because it discusses the strategic transformations and trends in the global business landscape, including topics such as supply chain disruptions, digital transformation, and sustainable practices. The language used is technical and industry-specific, indicating that the document is intended for a professional or academic audience within the business sector.
