In [1]:
pip install pandas openpyxl pdfplumber pytesseract pillow lxml




In [18]:
import os
import pandas as pd
import json
import xml.etree.ElementTree as ET
from google.colab import files
from PIL import Image
import pytesseract
import pdfplumber
import re
import nltk
from nltk.tokenize import word_tokenize
import cv2

In [19]:
# Download required NLTK resources
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [66]:
from google.colab import files
import os

def upload_and_assign_path(num_files):
    uploaded_files = []

    # Upload files based on user input
    for i in range(num_files):
        print(f"Uploading file {i + 1} of {num_files}...")
        uploaded = files.upload()

        # Ensure that a file is uploaded
        if uploaded:
            # Get the uploaded file name (this will be used for the file path)
            file_name = list(uploaded.keys())[0]
            # Define file path (The file is uploaded to the /content directory in Colab)
            file_path = '/content/' + file_name
            uploaded_files.append(file_path)
        else:
            print("No file uploaded. Please try again.")
            return None

    # Return list of file paths
    return uploaded_files

# Ask the user how many files they want to upload
num_files = int(input("How many files do you want to upload? "))

# Example usage
file_paths = upload_and_assign_path(num_files)
if file_paths:
    print(f"Files uploaded successfully. Paths: {file_paths}")
else:
    print("No files uploaded.")


How many files do you want to upload? 2
Uploading file 1 of 2...


Saving Anthropology.pdf to Anthropology (6).pdf
Uploading file 2 of 2...


Saving dataset2.png to dataset2 (4).png
Files uploaded successfully. Paths: ['/content/Anthropology (6).pdf', '/content/dataset2 (4).png']


In [68]:
# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
        return text

# Function to extract text from an image using OCR
def extract_text_from_image(file_path):
    image = Image.open(file_path)
    text = pytesseract.image_to_string(image)
    return text

# Function to clean extracted text
def clean_text(text):
    # Remove unwanted line breaks and extra spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Remove non-alphanumeric characters (optional)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)

    return text

# Function to tokenize text
def tokenize_text(text):
    # Tokenize the cleaned text
    tokens = word_tokenize(text)
    return tokens

# Function to preprocess image before OCR
def preprocess_image(file_path):
    # Open the image using OpenCV
    image = cv2.imread(file_path)

    # Convert to grayscale (important for OCR)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding (optional: can help make the text clearer)
    _, binary_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY)

    return binary_image

# Function to process PDF file: extract, clean, and tokenize text
def process_pdf(file_path):
    raw_text = extract_text_from_pdf(file_path)
    cleaned_text = clean_text(raw_text)
    tokens = tokenize_text(cleaned_text)
    return tokens

# Function to process Image file: preprocess, extract, clean, and tokenize text
def process_image(file_path):
    preprocessed_image = preprocess_image(file_path)
    raw_text = pytesseract.image_to_string(preprocessed_image)
    cleaned_text = clean_text(raw_text)
    tokens = tokenize_text(cleaned_text)
    return tokens

In [69]:
def load_file(file_path):
    """
    Load file based on extension and return a pandas DataFrame or raw text (for non-tabular formats).
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.csv':
        return pd.read_csv(file_path)
    elif file_extension == '.json':
        return pd.read_json(file_path)
    elif file_extension in ['.xls', '.xlsx']:
        return pd.read_excel(file_path)
    elif file_extension == '.pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension in ['.png', '.jpg', '.jpeg']:
        return extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file format. Please upload CSV, JSON, Excel, PDF, or image files.")


In [70]:
# Function to save processed data into a variety of formats
def save_as_csv(data, output_path):
    """
    Save data as CSV. If it's raw text, create a simple structure.
    """
    if isinstance(data, list):
        # Convert list of tokens to a DataFrame for CSV
        data_df = pd.DataFrame(data, columns=["Tokens"])
        data_df.to_csv(output_path, index=False)
    else:
        with open(output_path, 'w') as f:
            f.write(data)  # Writing raw text data for non-tabular formats
    print(f"CSV file saved at {output_path}")

In [71]:
def save_as_json(data, output_path):
    """
    Save data as JSON.
    """
    if isinstance(data, pd.DataFrame):
        data.to_json(output_path, orient='records', lines=True)
    else:
        # Save the raw text in a JSON format
        with open(output_path, 'w') as f:
            json.dump({"text": data}, f)
    print(f"JSON file saved at {output_path}")


In [80]:
def save_as_excel(data, output_path):
    """
    Save data as Excel.
    """
    if isinstance(data, pd.DataFrame):
        data.to_excel(output_path, index=False, engine='openpyxl')
    else:
        # If raw text, save as a single cell sheet
        # Clean the text data before saving to Excel to remove problematic characters
        cleaned_data = data.replace('\x0c', '') # Removing form feed character
        df = pd.DataFrame({"Text": [cleaned_data]})
        df.to_excel(output_path, index=False, engine='openpyxl')
    print(f"Excel file saved at {output_path}")

In [73]:
def save_as_xml(data, output_path):
    """
    Save data as XML.
    """
    root = ET.Element("root")
    if isinstance(data, pd.DataFrame):
        for _, row in data.iterrows():
            entry = ET.SubElement(root, "entry")
            for col in data.columns:
                child = ET.SubElement(entry, col)
                child.text = str(row[col])
    else:
        entry = ET.SubElement(root, "entry")
        child = ET.SubElement(entry, "text")
        child.text = data

    tree = ET.ElementTree(root)
    tree.write(output_path)
    print(f"XML file saved at {output_path}")


In [74]:
# Function to convert file and save the output
def convert_file(file_path, output_dir):
    """
    Convert the file to multiple formats (CSV, JSON, Excel, XML) and save them in the output directory.
    """
    # Load and process the file into tokens (list of words)
    #tokens = load_and_process_file(file_path)
    data = load_file(file_path)  # This line loads the data

    # Get the base name (without extension) for output file naming
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Define output paths
    csv_output = os.path.join(output_dir, f"{base_name}_output.csv")
    json_output = os.path.join(output_dir, f"{base_name}_output.json")
    excel_output = os.path.join(output_dir, f"{base_name}_output.xlsx")
    xml_output = os.path.join(output_dir, f"{base_name}_output.xml")

    # Save the processed data in different formats, using 'data' instead of 'tokens'
    save_as_csv(data, csv_output)
    save_as_json(data, json_output)
    save_as_excel(data, excel_output)
    save_as_xml(data, xml_output)

In [75]:
# Main Code Execution
output_dir = '/content/output_files3'  # Directory to save the output files

In [76]:
# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)



In [78]:
!apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,987 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 126209 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [81]:
# Step 3: Convert and save the datasets in different formats
if file_paths:
    for file_path in file_paths:
        convert_file(file_path, output_dir)
else:
    print("No files uploaded.")


CSV file saved at /content/output_files3/Anthropology (6)_output.csv
JSON file saved at /content/output_files3/Anthropology (6)_output.json
Excel file saved at /content/output_files3/Anthropology (6)_output.xlsx
XML file saved at /content/output_files3/Anthropology (6)_output.xml
CSV file saved at /content/output_files3/dataset2 (4)_output.csv
JSON file saved at /content/output_files3/dataset2 (4)_output.json
Excel file saved at /content/output_files3/dataset2 (4)_output.xlsx
XML file saved at /content/output_files3/dataset2 (4)_output.xml


In [96]:
import nltk

# Download 'punkt_tab' data package
nltk.download('punkt_tab')

# Rest of your existing code follows...

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [97]:
import os
import pandas as pd
import json
import xml.etree.ElementTree as ET
import pytesseract
from PIL import Image
import pdfplumber
import nltk
import re
from nltk.tokenize import sent_tokenize

# Download the NLTK tokenizer resources (needed for sentence tokenization)
nltk.download('punkt')

# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
        return text

# Function to extract text from an image using OCR (improving the OCR quality)
def extract_text_from_image(file_path):
    image = Image.open(file_path)
    text = pytesseract.image_to_string(image, lang='eng')  # Using English OCR language
    # Clean up OCR text (remove extra spaces, newlines, etc.)
    text = ' '.join(text.split())  # Remove excessive whitespace and newlines
    return text

# Function to load file based on extension
def load_file(file_path):
    """
    Load file based on extension and return raw text (for non-tabular formats).
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.csv':
        return pd.read_csv(file_path)
    elif file_extension == '.json':
        return pd.read_json(file_path)
    elif file_extension in ['.xls', '.xlsx']:
        return pd.read_excel(file_path)
    elif file_extension == '.pdf':
        return extract_text_from_pdf(file_path)
    elif file_extension in ['.png', '.jpg', '.jpeg']:
        return extract_text_from_image(file_path)
    else:
        raise ValueError("Unsupported file format. Please upload CSV, JSON, Excel, PDF, or image files.")

# Ensure nltk punkt is downloaded for tokenization
nltk.download('punkt')  # This ensures the 'punkt' tokenizer models are available

# Function to extract QA pairs from the raw text
def extract_qa_pairs(text):
    """
    Extracts question-answer pairs from the raw text.
    Assumes questions end with '?' and answers follow them.
    """
    qa_pairs = []

    # Tokenizing text using NLTK's sentence tokenizer for more accurate sentence splitting
    sentences = sent_tokenize(text)

    # Now extracting QA pairs by identifying questions and their subsequent answers
    for i in range(len(sentences) - 1):
        question = sentences[i].strip()
        if question.endswith('?'):  # Identifying the question
            answer = sentences[i + 1].strip()  # The next sentence as the answer
            qa_pairs.append((question, answer))

    return qa_pairs

# Function to extract and display QA pairs
def extract_qa_from_data(data):
    """
    Extracts QA pairs from the input data (raw text or DataFrame) and prints them.
    """
    if isinstance(data, str):  # If raw text is provided (e.g., from PDF or image)
        qa_pairs = extract_qa_pairs(data)

        # Printing the extracted QA pairs
        if qa_pairs:
            print("\nExtracted QA Pairs:")
            for i, (question, answer) in enumerate(qa_pairs, 1):
                print(f"Q{i}: {question}")
                print(f"A{i}: {answer}")
                print()
        else:
            print("No QA pairs found in the provided text.")
    elif isinstance(data, pd.DataFrame):  # If the data is a DataFrame (e.g., CSV or Excel)
        # Convert DataFrame to a string and then extract QA pairs
        text_data = data.to_string(index=False)
        qa_pairs = extract_qa_pairs(text_data)

        # Printing the extracted QA pairs
        if qa_pairs:
            print("\nExtracted QA Pairs from DataFrame:")
            for i, (question, answer) in enumerate(qa_pairs, 1):
                print(f"Q{i}: {question}")
                print(f"A{i}: {answer}")
                print()
        else:
            print("No QA pairs found in the DataFrame.")

# Function to save processed data into a variety of formats
def save_as_csv(data, output_path):
    """
    Save data as CSV. If it's raw text, create a simple structure.
    """
    if isinstance(data, pd.DataFrame):
        data.to_csv(output_path, index=False)
    else:
        with open(output_path, 'w') as f:
            f.write(data)  # Writing raw text data for non-tabular formats
    print(f"CSV file saved at {output_path}")

def save_as_json(data, output_path):
    """
    Save data as JSON.
    """
    if isinstance(data, pd.DataFrame):
        data.to_json(output_path, orient='records', lines=True)
    else:
        # Save the raw text in a JSON format
        with open(output_path, 'w') as f:
            json.dump({"text": data}, f)
    print(f"JSON file saved at {output_path}")

def save_as_excel(data, output_path):
    """
    Save data as Excel.
    """
    if isinstance(data, pd.DataFrame):
        data.to_excel(output_path, index=False, engine='openpyxl')
    else:
        # If raw text, save as a single cell sheet
        df = pd.DataFrame({"Text": [data]})
        df.to_excel(output_path, index=False, engine='openpyxl')
    print(f"Excel file saved at {output_path}")

def save_as_xml(data, output_path):
    """
    Save data as XML.
    """
    root = ET.Element("root")
    if isinstance(data, pd.DataFrame):
        for _, row in data.iterrows():
            entry = ET.SubElement(root, "entry")
            for col in data.columns:
                child = ET.SubElement(entry, col)
                child.text = str(row[col])
    else:
        entry = ET.SubElement(root, "entry")
        child = ET.SubElement(entry, "text")
        child.text = data

    tree = ET.ElementTree(root)
    tree.write(output_path)
    print(f"XML file saved at {output_path}")

# Example usage:
file_paths = ['/content/Anthropology (3).pdf', '/content/dataset2 (1).png']  # Example paths (update this as needed)
output_dir = '/content/output_files77'  # Directory to save the output files

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Process each file
for file_path in file_paths:
    data = load_file(file_path)

    # Extract and display QA pairs from the data
    extract_qa_from_data(data)

    # Save the data in different formats (CSV, JSON, Excel, XML)
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    save_as_csv(data, os.path.join(output_dir, f"{base_name}_output.csv"))
    save_as_json(data, os.path.join(output_dir, f"{base_name}_output.json"))
    save_as_excel(data, os.path.join(output_dir, f"{base_name}_output.xlsx"))
    save_as_xml(data, os.path.join(output_dir, f"{base_name}_output.xml"))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Extracted QA Pairs:
Q1: In this immensely readable book, Peter Metcalf makes large and
complex topics both accessible and enjoyable, arguing that the issues
anthropology deals with are all around us, in magazines and newspa-
pers and on television.He tackles questions such as:
• What is anthropology?
A1: • How can we distinguish cultural differences from physical ones?

Q2: • How can we distinguish cultural differences from physical ones?
A2: • What is culture,anyway?

Q3: • What is culture,anyway?
A3: • How do anthropologists study culture?

Q4: • How do anthropologists study culture?
A4: • What are the key theories and approaches used today?

Q5: • What are the key theories and approaches used today?
A5: • How has the discipline changed over time?

Q6: • How has the discipline changed over time?
A6: This volume provides students with an overview of the fundamental
principles of anthropology, and an accessible guide for anyone just
wanting to learn more about a fascinating subject.

