In [None]:
import gdown
import re
import PyPDF2
import os
import mysql.connector

# Download file from Google Drive
def download_file(work_folder, actual_download_link_url):
    """Download the question PDF from Google Drive into ``work_folder``.

    The file is always saved as ``Chemistry Questions.pdf`` inside
    ``work_folder``. The folder is created before the download starts, so a
    missing folder no longer costs a failed first attempt.

    Args:
        work_folder: Directory that will hold the downloaded PDF.
        actual_download_link_url: Direct-download URL handed to
            ``gdown.download``.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        Whatever ``gdown.download`` or ``os.makedirs`` raise; errors are not
        swallowed here.
    """
    save_file_path = os.path.join(work_folder, "Chemistry Questions.pdf")

    # An empty folder name means "current directory"; makedirs("") would fail.
    if work_folder:
        os.makedirs(work_folder, exist_ok=True)

    downloaded_path = gdown.download(actual_download_link_url, save_file_path, quiet=False)

    # NOTE(review): some gdown releases signal failure by returning None
    # instead of raising - confirm against the installed version.
    if downloaded_path is None:
        print(f"File download failed for url :{actual_download_link_url}")
        return

    print(f"File downloaded successfully at path :{save_file_path}")

# Connect to MySQL Database
def connect_to_mysql_database(host="localhost", user="priya", password="priya", database="mydatabase"):
    """Open a connection to the MySQL server.

    The defaults reproduce the values that used to be hard-coded, so existing
    zero-argument calls behave as before; callers can now supply their own
    settings instead of editing this function.

    Args:
        host: MySQL server host name.
        user: Account used to log in.
        password: Password for ``user``.
        database: Schema selected for the session.

    Returns:
        The open connection, or None when the connection could not be
        established (the reason is printed).
    """
    try:
        connection = mysql.connector.connect(
            host=host,
            user=user,
            password=password,
            database=database,
        )
        if connection.is_connected():
            print("Connection to MySQL DB is successful!")
            return connection

        # connect() returned without raising but the session is unusable:
        # say so and release the handle rather than returning None silently.
        print("Error while connecting to MySQL DB: connection is not active")
        connection.close()
    except Exception as e:
        print("Error while connecting to MySQL DB:", e)
    return None

# Check if DB exists or not
def create_db_if_not_exists(connection):
    """Issue ``CREATE DATABASE IF NOT EXISTS mydatabase`` on ``connection``.

    Args:
        connection: Open database connection exposing ``cursor()``.

    Returns:
        None. Failures are printed rather than raised.
    """
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase;")
    except Exception as e:
        print("Error while creating DB:", e)
    finally:
        # Release the cursor whether or not the statement succeeded.
        if cursor is not None:
            cursor.close()
    return None

# Create table if it doesn't exist
def create_table_if_not_exists(connection):
    """Create the ``QUESTIONS`` table when it is not already present.

    Args:
        connection: Open database connection exposing ``cursor()`` and
            ``commit()``.

    Returns:
        None.

    Raises:
        Any error raised by ``execute`` or ``commit`` propagates to the
        caller; the cursor is closed first.
    """
    create_table_query = """
    CREATE TABLE IF NOT EXISTS QUESTIONS (
        id INT AUTO_INCREMENT PRIMARY KEY,
        subject_name VARCHAR(100),
        question_text TEXT,
        answer_options TEXT,
        correct_answer VARCHAR(10),
        chapter_name VARCHAR(100)
    );
    """
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()
    finally:
        # Always release the cursor, even when the statement fails.
        cursor.close()

# A chapter heading such as "Chapter 3: Chemical Bonding"; group 1 is the
# title, i.e. everything after the first colon up to the end of that line.
_CHAPTER_HEADING_PATTERN = re.compile(r"Chapter \d+:([^\r\n]*)")

# One multiple-choice question: number, question text, the four labelled
# options A) to D), and the answer letter written as "Answer: X)".
_QUESTION_PATTERN = re.compile(
    r"(\d+\.)\s*([^\n]+?)\s*(A\))\s*([^\n]+?)\s*(B\))\s*([^\n]+?)\s*(C\))\s*([^\n]+?)\s*(D\))\s*([^\n]+?)\s*Answer:\s*([A-D])\)"
)


def _parse_questions_by_chapter(text):
    """Return the questions found in ``text``, each tagged with its chapter."""
    questions = []
    headings = list(_CHAPTER_HEADING_PATTERN.finditer(text))

    for index, heading in enumerate(headings):
        chapter_name = heading.group(1).strip()
        print("\nCurrent Chapter:", chapter_name)

        # A chapter's body runs from the end of its heading to the start of
        # the next heading (or the end of the text for the last chapter), so
        # a question is only ever attributed to the chapter it appears in.
        if index + 1 < len(headings):
            section_end = headings[index + 1].start()
        else:
            section_end = len(text)
        section = text[heading.end():section_end]

        for match in _QUESTION_PATTERN.findall(section):
            print(f"\nmatch : {match}")
            question_text = match[1].strip()
            # Groups 2..9 are the option labels and texts; the final group is
            # the answer letter and is stored separately, not with the options.
            options = ",".join(match[2:-1])
            print(f"\noptions : {options}")
            correct_answer = match[-1].strip()

            questions.append({
                'subject_name': "Chemistry",
                'question_text': question_text,
                'answer_options': options,
                'correct_answer': correct_answer,
                'chapter_name': chapter_name
            })

    return questions


# Extract questions and answers from PDF
def extract_questions_from_pdf(pdf_path):
    """Read a question PDF and return its multiple-choice questions.

    The text of every page is concatenated, split into chapters at
    "Chapter <n>: <title>" headings, and each chapter's questions are parsed.
    Text that precedes the first chapter heading is ignored.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts with the keys ``subject_name``, ``question_text``,
        ``answer_options``, ``correct_answer`` and ``chapter_name``. The list
        is empty when the file cannot be read, holds no extractable text, or
        contains no matching chapters/questions. Errors are printed, not
        raised.
    """
    questions = []

    try:
        # The context manager closes the file even if parsing fails.
        with open(pdf_path, "rb") as pdffile_handler:
            pdf_reader_data = PyPDF2.PdfReader(pdffile_handler)
            # Defensive: treat a page that yields no text as an empty string
            # so the concatenation cannot fail on a non-string result.
            text = "".join(page.extract_text() or "" for page in pdf_reader_data.pages)

        if not text:
            print("No text extracted from the PDF.")
            return []

        questions = _parse_questions_by_chapter(text)

    except Exception as e:
        print(f"Error while extracting questions: {e}")
        if not os.path.exists(pdf_path):
            print(f"File is not found in path: {pdf_path}")

    print(f"Length of questions: {len(questions)}")  # Print length of extracted questions
    return questions

# Insert questions into the database
def insert_questions_into_db(connection, questions):
    """Insert the extracted questions into the ``QUESTIONS`` table.

    All rows are written in one transaction: the batch is committed once at
    the end, or rolled back if any insert fails.

    Args:
        connection: Open database connection exposing ``cursor()``,
            ``commit()`` and ``rollback()``.
        questions: Iterable of dicts with the keys ``subject_name``,
            ``question_text``, ``answer_options``, ``correct_answer`` and
            ``chapter_name``.

    Returns:
        None. Insert errors are printed rather than raised.
    """
    insert_query = """
    INSERT INTO QUESTIONS (subject_name, question_text, answer_options, correct_answer, chapter_name)
    VALUES (%s, %s, %s, %s, %s);
    """
    cursor = connection.cursor()

    try:
        for question in questions:
            print(f"Inserting question: {question['question_text']}")
            cursor.execute(insert_query, (
                question['subject_name'],
                question['question_text'],
                question['answer_options'],
                question['correct_answer'],
                question['chapter_name']
            ))
        connection.commit()
        print(f"{len(questions)} questions inserted successfully.")
    except Exception as e:
        print("Error inserting data into database:", e)
        # Drop the partially written batch so a later commit on the same
        # connection cannot persist half of it.
        try:
            connection.rollback()
        except Exception as rollback_error:
            print("Error rolling back transaction:", rollback_error)
    finally:
        cursor.close()

def main():
    """Run the pipeline: prepare the database, parse the PDF, store the rows.

    Steps:
        1. Connect to MySQL and make sure the database and the QUESTIONS
           table exist.
        2. Extract the questions from ``./content/Chemistry Questions.pdf``.
        3. Insert whatever was extracted, then close the connection.

    The download step stays commented out; uncomment it to fetch the PDF
    from Google Drive before parsing.
    """
    # Connect to MySQL Database
    connection = connect_to_mysql_database()

    if not connection:
        # Without a connection the insert and close calls further down would
        # fail on None, so stop here.
        print("Database connection unavailable; nothing was imported.")
        return

    try:
        create_db_if_not_exists(connection)
        # Create table if it doesn't exist
        create_table_if_not_exists(connection)

        # The PDF is expected in the "./content" folder.
        download_link_url = "https://drive.google.com/file/d/1OVFr62PNO5-PcnWwFO1Frp0K_0UX6ENc/view?usp=drive_link"
        # In a ".../file/d/<id>/view" link the file id is the sixth "/"-separated part.
        google_download_file_id = download_link_url.split("/")[5]
        # Corrected URL format for downloading the file
        actual_download_link_url = f'https://drive.google.com/uc?id={google_download_file_id}'
        work_folder = "./content"

        # Uncomment to download the file
        # download_file(work_folder, actual_download_link_url)

        pdf_path = os.path.join(work_folder, "Chemistry Questions.pdf")
        questions = extract_questions_from_pdf(pdf_path)

        if questions:
            # Insert extracted questions into the database
            insert_questions_into_db(connection, questions)
    finally:
        # Close the database connection even if a step above raised.
        connection.close()

# Entry point: run the pipeline only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import gdown
import re
import PyPDF2
import os
import mysql.connector

# Download file from Google Drive
def download_file(work_folder, actual_download_link_url):
    """Download the question PDF from Google Drive into ``work_folder``.

    The file is always saved as ``Chemistry Questions.pdf`` inside
    ``work_folder``. The folder is created before the download starts, so a
    missing folder no longer costs a failed first attempt.

    Args:
        work_folder: Directory that will hold the downloaded PDF.
        actual_download_link_url: Direct-download URL handed to
            ``gdown.download``.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        Whatever ``gdown.download`` or ``os.makedirs`` raise; errors are not
        swallowed here.
    """
    save_file_path = os.path.join(work_folder, "Chemistry Questions.pdf")

    # An empty folder name means "current directory"; makedirs("") would fail.
    if work_folder:
        os.makedirs(work_folder, exist_ok=True)

    downloaded_path = gdown.download(actual_download_link_url, save_file_path, quiet=False)

    # NOTE(review): some gdown releases signal failure by returning None
    # instead of raising - confirm against the installed version.
    if downloaded_path is None:
        print(f"File download failed for url :{actual_download_link_url}")
        return

    print(f"File downloaded successfully at path :{save_file_path}")

# Connect to MySQL Database
def connect_to_mysql_database(host="localhost", user="priya", password="priya", database="mydatabase"):
    """Open a connection to the MySQL server.

    The defaults reproduce the values that used to be hard-coded, so existing
    zero-argument calls behave as before; callers can now supply their own
    settings instead of editing this function.

    Args:
        host: MySQL server host name.
        user: Account used to log in.
        password: Password for ``user``.
        database: Schema selected for the session.

    Returns:
        The open connection, or None when the connection could not be
        established (the reason is printed).
    """
    try:
        connection = mysql.connector.connect(
            host=host,
            user=user,
            password=password,
            database=database,
        )
        if connection.is_connected():
            print("Connection to MySQL DB is successful!")
            return connection

        # connect() returned without raising but the session is unusable:
        # say so and release the handle rather than returning None silently.
        print("Error while connecting to MySQL DB: connection is not active")
        connection.close()
    except Exception as e:
        print("Error while connecting to MySQL DB:", e)
    return None

# Check if DB exists or not
def create_db_if_not_exists(connection):
    """Issue ``CREATE DATABASE IF NOT EXISTS mydatabase`` on ``connection``.

    Args:
        connection: Open database connection exposing ``cursor()``.

    Returns:
        None. Failures are printed rather than raised.
    """
    cursor = None
    try:
        cursor = connection.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase;")
    except Exception as e:
        print("Error while creating DB:", e)
    finally:
        # Release the cursor whether or not the statement succeeded.
        if cursor is not None:
            cursor.close()
    return None

# Create table if it doesn't exist
def create_table_if_not_exists(connection):
    """Create the ``QUESTIONS`` table when it is not already present.

    Args:
        connection: Open database connection exposing ``cursor()`` and
            ``commit()``.

    Returns:
        None.

    Raises:
        Any error raised by ``execute`` or ``commit`` propagates to the
        caller; the cursor is closed first.
    """
    create_table_query = """
    CREATE TABLE IF NOT EXISTS QUESTIONS (
        id INT AUTO_INCREMENT PRIMARY KEY,
        subject_name VARCHAR(100),
        question_text TEXT,
        answer_options TEXT,
        correct_answer VARCHAR(10),
        chapter_name VARCHAR(100)
    );
    """
    cursor = connection.cursor()
    try:
        cursor.execute(create_table_query)
        connection.commit()
    finally:
        # Always release the cursor, even when the statement fails.
        cursor.close()

# A chapter heading such as "Chapter 3: Chemical Bonding"; group 1 is the
# title, i.e. everything after the first colon up to the end of that line.
_CHAPTER_HEADING_PATTERN = re.compile(r"Chapter \d+:([^\r\n]*)")

# One multiple-choice question: number, question text, the four labelled
# options A) to D), and the answer letter written as "Answer: X)".
_QUESTION_PATTERN = re.compile(
    r"(\d+\.)\s*([^\n]+?)\s*(A\))\s*([^\n]+?)\s*(B\))\s*([^\n]+?)\s*(C\))\s*([^\n]+?)\s*(D\))\s*([^\n]+?)\s*Answer:\s*([A-D])\)"
)


def _parse_questions_by_chapter(text):
    """Return the questions found in ``text``, each tagged with its chapter."""
    questions = []
    headings = list(_CHAPTER_HEADING_PATTERN.finditer(text))

    for index, heading in enumerate(headings):
        chapter_name = heading.group(1).strip()
        print("\nCurrent Chapter:", chapter_name)

        # A chapter's body runs from the end of its heading to the start of
        # the next heading (or the end of the text for the last chapter), so
        # a question is only ever attributed to the chapter it appears in.
        if index + 1 < len(headings):
            section_end = headings[index + 1].start()
        else:
            section_end = len(text)
        section = text[heading.end():section_end]

        for match in _QUESTION_PATTERN.findall(section):
            print(f"\nmatch : {match}")
            question_text = match[1].strip()
            # Groups 2..9 are the option labels and texts; the final group is
            # the answer letter and is stored separately, not with the options.
            options = ",".join(match[2:-1])
            print(f"\noptions : {options}")
            correct_answer = match[-1].strip()

            questions.append({
                'subject_name': "Chemistry",
                'question_text': question_text,
                'answer_options': options,
                'correct_answer': correct_answer,
                'chapter_name': chapter_name
            })

    return questions


# Extract questions and answers from PDF
def extract_questions_from_pdf(pdf_path):
    """Read a question PDF and return its multiple-choice questions.

    The text of every page is concatenated, split into chapters at
    "Chapter <n>: <title>" headings, and each chapter's questions are parsed.
    Text that precedes the first chapter heading is ignored.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts with the keys ``subject_name``, ``question_text``,
        ``answer_options``, ``correct_answer`` and ``chapter_name``. The list
        is empty when the file cannot be read, holds no extractable text, or
        contains no matching chapters/questions. Errors are printed, not
        raised.
    """
    questions = []

    try:
        # The context manager closes the file even if parsing fails.
        with open(pdf_path, "rb") as pdffile_handler:
            pdf_reader_data = PyPDF2.PdfReader(pdffile_handler)
            # Defensive: treat a page that yields no text as an empty string
            # so the concatenation cannot fail on a non-string result.
            text = "".join(page.extract_text() or "" for page in pdf_reader_data.pages)

        if not text:
            print("No text extracted from the PDF.")
            return []

        questions = _parse_questions_by_chapter(text)

    except Exception as e:
        print(f"Error while extracting questions: {e}")
        if not os.path.exists(pdf_path):
            print(f"File is not found in path: {pdf_path}")

    print(f"Length of questions: {len(questions)}")  # Print length of extracted questions
    return questions

# Insert questions into the database
def insert_questions_into_db(connection, questions):
    """Insert the extracted questions into the ``QUESTIONS`` table.

    All rows are written in one transaction: the batch is committed once at
    the end, or rolled back if any insert fails.

    Args:
        connection: Open database connection exposing ``cursor()``,
            ``commit()`` and ``rollback()``.
        questions: Iterable of dicts with the keys ``subject_name``,
            ``question_text``, ``answer_options``, ``correct_answer`` and
            ``chapter_name``.

    Returns:
        None. Insert errors are printed rather than raised.
    """
    insert_query = """
    INSERT INTO QUESTIONS (subject_name, question_text, answer_options, correct_answer, chapter_name)
    VALUES (%s, %s, %s, %s, %s);
    """
    cursor = connection.cursor()

    try:
        for question in questions:
            print(f"Inserting question: {question['question_text']}")
            cursor.execute(insert_query, (
                question['subject_name'],
                question['question_text'],
                question['answer_options'],
                question['correct_answer'],
                question['chapter_name']
            ))
        connection.commit()
        print(f"{len(questions)} questions inserted successfully.")
    except Exception as e:
        print("Error inserting data into database:", e)
        # Drop the partially written batch so a later commit on the same
        # connection cannot persist half of it.
        try:
            connection.rollback()
        except Exception as rollback_error:
            print("Error rolling back transaction:", rollback_error)
    finally:
        cursor.close()

def main():
    """Run the pipeline: prepare the database, parse the PDF, store the rows.

    Steps:
        1. Connect to MySQL and make sure the database and the QUESTIONS
           table exist.
        2. Extract the questions from ``./content/Chemistry Questions.pdf``.
        3. Insert whatever was extracted, then close the connection.

    The download step stays commented out; uncomment it to fetch the PDF
    from Google Drive before parsing.
    """
    # Connect to MySQL Database
    connection = connect_to_mysql_database()

    if not connection:
        # Without a connection the insert and close calls further down would
        # fail on None, so stop here.
        print("Database connection unavailable; nothing was imported.")
        return

    try:
        create_db_if_not_exists(connection)
        # Create table if it doesn't exist
        create_table_if_not_exists(connection)

        # The PDF is expected in the "./content" folder.
        download_link_url = "https://drive.google.com/file/d/1OVFr62PNO5-PcnWwFO1Frp0K_0UX6ENc/view?usp=drive_link"
        # In a ".../file/d/<id>/view" link the file id is the sixth "/"-separated part.
        google_download_file_id = download_link_url.split("/")[5]
        # Corrected URL format for downloading the file
        actual_download_link_url = f'https://drive.google.com/uc?id={google_download_file_id}'
        work_folder = "./content"

        # Uncomment to download the file
        # download_file(work_folder, actual_download_link_url)

        pdf_path = os.path.join(work_folder, "Chemistry Questions.pdf")
        questions = extract_questions_from_pdf(pdf_path)

        if questions:
            # Insert extracted questions into the database
            insert_questions_into_db(connection, questions)
    finally:
        # Close the database connection even if a step above raised.
        connection.close()

# Entry point: run the pipeline only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()
