<a href="https://colab.research.google.com/github/tirandagan/colabs/blob/main/AI_Lab_3_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading a file

This code will download a file into the sample_data folder


In [None]:
import os
import requests
from urllib.parse import urlparse

def download_file(url, folder="sample_data", timeout=30):
    """Download ``url`` into ``folder`` and return the path of the saved file.

    The file name is taken from the last component of the URL path.

    Args:
        url: Address of the file to download.
        folder: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The path of the downloaded file (``folder`` joined with the file
        name).

    Raises:
        ValueError: If the URL path does not end in a file name.
        requests.HTTPError: If the server answers with an error status code.
    """
    # Get the filename from the URL. Fail early -- before touching the disk
    # or the network -- when the URL has none (e.g. it ends with "/").
    filename = os.path.basename(urlparse(url).path)
    if not filename:
        raise ValueError(f"Cannot determine a file name from URL: {url!r}")

    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Full path for the file
    file_path = os.path.join(folder, filename)

    # Download the file. The context manager releases the connection even if
    # streaming fails part-way; the timeout keeps a dead server from hanging
    # the call forever.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Raise an exception for bad status codes

        # Write the file in chunks so large downloads never sit in memory
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print(f"File downloaded successfully: {file_path}")
    return file_path

# Example usage: when this cell/script is run directly, download the sample
# PDF manual using download_file's default target folder.
if __name__ == "__main__":
    file_url = "https://watertechcorp.com/wp-content/uploads/pdf/10000AB.OP21.7L.pdf"
    download_file(file_url)

File downloaded successfully: sample_data/10000AB.OP21.7L.pdf


# Importing a PDF
1.	Import a PDF with pdfplumber using the code below


In [6]:
pip install pdfplumber



In [None]:
import pdfplumber as pp

# Print the extracted text of every page, numbering pages from 1.
with pp.open('sample_data/10000AB.OP21.7L.pdf') as book:
    for page_no, page in enumerate(book.pages, start=1):
        print(f'{page_no = }')
        # extract_text() may give back None for a page without a text layer
        # (seen with older pdfplumber releases); fall back to "" so that
        # strip() cannot fail.
        data = page.extract_text() or ""
        print(data.strip())
        print('-'*45)

# Language Detection
Detecting language using langdetect

In [8]:
pip install langdetect



In [9]:
from langdetect import detect

def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Returns the string "Unable to detect language" when langdetect raises its
    own ``LangDetectException`` for the input.
    """
    # Imported locally so the function only relies on ``detect`` being
    # imported at the top of the cell.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Swallow only langdetect's own failure; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return "Unable to detect language"

# Example usage: two Italian snippets and one German snippet.
text1 = "1. Coperchio testa 4. Compartimento batteria"
text2 = "Non gettare nei cassonetti dei rifiuti indifferenziati. Per gli elementi contenenti batterie rimovibili, rimuovere le batterie"
# The umlaut was mis-encoded ("Ã¤"); restored to the intended "ä".
text3 = "1. Lesen und verstehen Sie alle Sicherheitswarnungen vor dem Betrieb. Andernfalls kann es zu schweren Personenschäden"

print(detect_language(text1))
print(detect_language(text2))
print(detect_language(text3))

it
it
de


3.	Combine #1 and #2 to detect the language of each page read, and print only the English-language text


In [None]:
import pdfplumber as pp
from langdetect import detect

def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Returns the string "Unable to detect language" when langdetect raises its
    own ``LangDetectException`` for the input.
    """
    # Imported locally so the function only relies on ``detect`` being
    # imported at the top of the cell.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Swallow only langdetect's own failure; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return "Unable to detect language"
# Print only the pages whose overall detected language is English; for every
# other page just report the language that was detected.
with pp.open('sample_data/10000AB.OP21.7L.pdf') as book:
    for page_no, page in enumerate(book.pages, start=1):
        print(f'{page_no = }')
        # Guard against a page with no extractable text (None) so strip()
        # cannot fail.
        data = page.extract_text() or ""
        lang = detect_language(data.strip())
        if lang == "en":
            # f-strings interpolate with {name}; the former "${name}" form
            # printed a stray literal "$" in front of every value.
            print(f"{data} * {lang}")
            print('-'*45)
        else:
            print(f"ignoring [{lang}]")

Notice the problem? We have pages whose lines are in mixed languages. Let's prompt an LLM to modify our code:



> This code provides me with output that only includes English-language text extracted from a page. There are pages that have mixed sentences. Please modify the code so it parses each line separately for language detection and then combines them into a single string representing that page.



```
import pdfplumber as pp
from langdetect import detect

def detect_language(text):
    try:
        return detect(text)
    except:
        return "Unable to detect language"
with pp.open('test_data/10000AB.OP21.7L.pdf') as book:
    for page_no, page in enumerate(book.pages, start=1):
        print(f'{page_no = }')
        data = page.extract_text()
        lang=detect_language(data.strip())
        if lang=="en":
            print(f"${data} * ${lang}")
            print('-'*45)
        else:
            print (f"ignoring [${lang}]")
```





Here's the code that Claude/ChatGPT gave me:


In [None]:
import pdfplumber as pp
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(text):
    """Return langdetect's language code for ``text``, or "unknown".

    "unknown" is returned when langdetect raises ``LangDetectException``.
    """
    try:
        code = detect(text)
    except LangDetectException:
        code = "unknown"
    return code

def process_page(page):
    """Tag every non-blank line of a PDF page with its detected language.

    Args:
        page: Object exposing ``extract_text()`` (here, a pdfplumber page).

    Returns:
        The page's non-blank lines, each stripped and prefixed with
        ``[<lang>] ``, joined by newlines. An empty string when the page
        yields no text.
    """
    # A page without extractable text may give None instead of a string;
    # treat that as an empty page rather than crashing on .split().
    data = page.extract_text() or ""

    processed_lines = []
    for raw_line in data.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines carry nothing to classify.
            continue
        lang = detect_language(line)
        processed_lines.append(f"[{lang}] {line}")

    return '\n'.join(processed_lines)

# Walk the PDF page by page and show each page's language-tagged lines,
# followed by a separator rule.
with pp.open('sample_data/10000AB.OP21.7L.pdf') as book:
    for page_no, page in enumerate(book.pages, start=1):
        print(f'Page {page_no}:')
        print(process_page(page))
        print('-' * 45)

Now ask the LLM to modify your code so that it accepts the filename as an argument, plus another argument specifying which language codes you would like in the output, for example:


```
python your_script.py sample_data/10000AB.OP21.7L.pdf -l en,de
```

This should ingest the pdf file, and only output lines that are in English or German (de)

In [15]:
import pdfplumber as pp
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import argparse

def detect_language(text):
    """Return langdetect's language code for ``text``, or "unknown".

    "unknown" is returned when langdetect raises ``LangDetectException``.
    """
    try:
        code = detect(text)
    except LangDetectException:
        code = "unknown"
    return code

def process_page(page, target_languages):
    """Tag a page's non-blank lines with their language, keeping wanted ones.

    Args:
        page: Object exposing ``extract_text()`` (here, a pdfplumber page).
        target_languages: Collection of language codes to keep. An empty
            collection (or None) disables filtering and keeps every line.

    Returns:
        The kept lines, each stripped and prefixed with ``[<lang>] ``, joined
        by newlines. An empty string when nothing on the page qualifies.
    """
    # A page without extractable text may give None instead of a string;
    # treat that as an empty page rather than crashing on .split().
    data = page.extract_text() or ""

    processed_lines = []
    for raw_line in data.split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines carry nothing to classify.
            continue
        lang = detect_language(line)
        if not target_languages or lang in target_languages:
            processed_lines.append(f"[{lang}] {line}")

    return '\n'.join(processed_lines)

def main(argv=None):
    """Command-line entry point: print a PDF's lines filtered by language.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Pass it
            explicitly when calling from a notebook, where the kernel's own
            arguments (such as ``-f``) would otherwise be parsed and
            rejected, e.g.
            ``main(["sample_data/10000AB.OP21.7L.pdf", "-l", "en,de"])``.
    """
    parser = argparse.ArgumentParser(description="Process a PDF file and filter content by language.")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument("-l", "--languages", help="Comma-separated list of language codes to include (e.g., 'en,fr,de')")
    args = parser.parse_args(argv)

    # Normalise the requested codes so that input such as "en, DE," still
    # matches: strip blanks, lower-case, and drop empty entries. An empty set
    # means "no filter".
    target_languages = {
        code.strip().lower()
        for code in (args.languages or "").split(',')
        if code.strip()
    }

    with pp.open(args.pdf_file) as book:
        for page_no, page in enumerate(book.pages, start=1):
            print(f'Page {page_no}:')
            processed_content = process_page(page, target_languages)
            if processed_content:
                print(processed_content)
            else:
                print("No content in specified language(s) on this page.")
            # The separator follows every page, whether or not it had output.
            print('-' * 45)


# When run as a script, parse the real command line. Inside a notebook, call
# main([...]) with an explicit argument list instead (see the docstring).
if __name__ == "__main__":
    main()

usage: colab_kernel_launcher.py [-h] [-l LANGUAGES] pdf_file
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
