In [36]:
import gdown
import re
import PyPDF2
import os
import mysql.connector

# Download file from Google Drive
def download_file(work_folder, actual_download_link_url):
    save_file_path = os.path.join(work_folder, "Chemistry Questions.pdf")
    
    try:
        gdown.download(actual_download_link_url, save_file_path, quiet=False)
        print(f"File downloaded successfully at path :{save_file_path}")
    except Exception as e:
        if not os.path.exists(work_folder):
            os.makedirs(work_folder)
        gdown.download(actual_download_link_url, save_file_path, quiet=True)
        print(f"File downloaded successfully at path :{save_file_path}")

# Connect to MySQL Database
def connect_to_mysql_database():
    try:
        connection = mysql.connector.connect(
            host="localhost", 
            user="priya", 
            password="priya", 
            database="mydatabase"
        )
        if connection.is_connected():
            print("Connection to MySQL DB is successful!")
            return connection
    except Exception as e:
        print("Error while connecting to MySQL DB:", e)
        return None

# Check if DB exists or not
def create_db_if_not_exists(connection):
    try:
        cursor = connection.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase;")
    except Exception as e:
        print("Error while creating DB:", e)
        return None

# Create table if it doesn't exist
def create_table_if_not_exists(connection):
    cursor = connection.cursor()
    create_table_query = """
    CREATE TABLE IF NOT EXISTS QUESTIONS (
        id INT AUTO_INCREMENT PRIMARY KEY,
        subject_name VARCHAR(100),
        question_text TEXT,
        answer_options TEXT,
        correct_answer VARCHAR(10),
        chapter_name VARCHAR(100)
    );
    """
    cursor.execute(create_table_query)
    connection.commit()

# Extract questions and answers from PDF
def extract_questions_from_pdf(pdf_path):
    questions = []
    current_chapter = None

    try:
        pdffile_handler = open(pdf_path, "rb")
        pdf_reader_data = PyPDF2.PdfReader(pdffile_handler)
        
        text = ""
        for page_index in range(len(pdf_reader_data.pages)):
            text += pdf_reader_data.pages[page_index].extract_text()

        # Check if text was extracted
        print("\nExtracted text:\n", text[:500])  # Print the first 500 characters to inspect

        # If text is empty or looks wrong, that’s likely the issue
        if not text:
            print("No text extracted from the PDF.")
            return []

        # Regular Expression to capture Chapter Names
        chapter_pattern = re.compile(r"Chapter (\d+):\s*(.*?)(?=\s*Chapter \d+:|$)", re.DOTALL)
        chapters = chapter_pattern.findall(text)

        print("\nchapters [0]:\n", chapters[0])
        print("\nchapters [1]:\n", chapters[1])
        print("\nchapters [2]:\n", chapters[2])
        print("\nchapters [3]:\n", chapters[3])
        for chapter in chapters:
            print("\nchapter :: ", chapter)
            chapter_number, chapter_title = chapter
            print("\nchapter_title :: ", chapter_title)
            current_chapter = chapter_title.split("\n")[0].strip()
            
            # Regular Expression to capture questions and options within a chapter
            question_pattern = re.compile(r"(\d+\.)\s*([^\n]+?)\s*(A\))\s*([^\n]+?)\s*(B\))\s*([^\n]+?)\s*(C\))\s*([^\n]+?)\s*(D\))\s*([^\n]+?)\s*Answer:\s*([A-D])\)", re.DOTALL)
            matches = question_pattern.findall(chapter_title)
            print("\nmatches :: ", matches)

            index = 0
            for match in matches:
                print(f"\nmatch : {match}")
                question_text = match[1].strip()
                options = ",".join(match[2:len(match)])  # Correct the way options are handled
                print(f"\noptions : {options}")
                correct_answer = match[len(match)-1].strip()

                subject_name = "Chemistry"
                temp_questions = {
                    'subject_name': subject_name,
                    'question_text': question_text,
                    'answer_options': options,
                    'correct_answer': correct_answer,
                    'chapter_name': current_chapter
                }
                index += 1
                print(f"index :: {index}")
                questions.append(temp_questions)

    except Exception as e:
        print(f"Error while extracting questions: {e}")
        if not os.path.exists(pdf_path):
            print(f"File is not found in path: {pdf_path}")

    return questions

# Insert questions into the database
def insert_questions_into_db(connection, questions):
    cursor = connection.cursor()
    insert_query = """
    INSERT INTO QUESTIONS (subject_name, question_text, answer_options, correct_answer, chapter_name)
    VALUES (%s, %s, %s, %s, %s);
    """
    
    try:
        for question in questions:
            print(f"Inserting question: {question['question_text']}")
            cursor.execute(insert_query, (
                question['subject_name'],
                question['question_text'],
                question['answer_options'],
                question['correct_answer'],
                question['chapter_name']
            ))
        connection.commit()
        print(f"{len(questions)} questions inserted successfully.")
    except Exception as e:
        print("Error inserting data into database:", e)

def main():
    # Connect to MySQL Database
    connection = connect_to_mysql_database()
    
    if connection:
        create_db_if_not_exists(connection)
        # Create table if it doesn't exist
        create_table_if_not_exists(connection)

    # Extract questions from the PDF
    # 1.Store a PDF file in a folder called “/content” 
    download_link_url = "https://drive.google.com/file/d/1OVFr62PNO5-PcnWwFO1Frp0K_0UX6ENc/view?usp=drive_link"
    google_download_file_id = download_link_url.split("/")[5]
    # Corrected URL format for downloading the file
    actual_download_link_url = f'https://drive.google.com/uc?id={google_download_file_id}'
    print("actual_download_link_url : ",actual_download_link_url)
    work_folder = "./content"
    
    # Uncomment to download the file
    download_file(work_folder, actual_download_link_url)

    pdf_path = os.path.join(work_folder, "Chemistry Questions.pdf")
    questions = extract_questions_from_pdf(pdf_path)

    if questions:
        # Insert extracted questions into the database
        insert_questions_into_db(connection, questions)

    # Close the database connection
    connection.close()

if __name__ == "__main__":
    main()


Connection to MySQL DB is successful!
actual_download_link_url :  https://drive.google.com/uc?id=1OVFr62PNO5-PcnWwFO1Frp0K_0UX6ENc


Downloading...
From: https://drive.google.com/uc?id=1OVFr62PNO5-PcnWwFO1Frp0K_0UX6ENc
To: /Users/priprabh/Downloads/Priyanka/Learnings & Training/Python/Udemy Python/Python-Coding-Exercises/Python-Coding-Exercises/Assignment/Project5/content/Chemistry Questions.pdf
100%|████████████████████████████████████████| 279k/279k [00:00<00:00, 2.42MB/s]


File downloaded successfully at path :./content/Chemistry Questions.pdf

Extracted text:
 Chemistry  
 
Chapter 1: Basic concepts of chemistry  
 
1. What is the SI unit of mass?  
A) Gram (g)  
B) Kilogram (kg)  
C) Pound (lB)  
D) Ounce (oz)  
Answer: B) Kilogram (kg)  
2. Which of the following is an example of a chemical change?  
A) Melting of ice  
B) Cutting of paper  
C) Rusting of iron  
D) Boiling of water  
Answer: C) Rusting of iron  
3. What is Avogadro's number?  
A) 6.022×10236.022 \times 10^{23}6.022×1023  
B) 3.14×1033.14 \times 10^33.14×103  
C) 9.81×1029.81 \times 

chapters [0]:
 ('1', "Basic concepts of chemistry  \n \n1. What is the SI unit of mass?  \nA) Gram (g)  \nB) Kilogram (kg)  \nC) Pound (lB)  \nD) Ounce (oz)  \nAnswer: B) Kilogram (kg)  \n2. Which of the following is an example of a chemical change?  \nA) Melting of ice  \nB) Cutting of paper  \nC) Rusting of iron  \nD) Boiling of water  \nAnswer: C) Rusting of iron  \n3. What is Avogadro's number?  \nA) 

In [None]:
import mysql.connector

In [40]:
# Connect to MySQL Database
connection = connect_to_mysql_database()

if connection:
    create_db_if_not_exists(connection)
    # Create table if it doesn't exist
    create_table_if_not_exists(connection)

mycursor = connection.cursor()
mycursor.execute("select * from QUESTIONS where chapter_name='Structure of Atom';")
myresult = mycursor.fetchall()

myresult

Connection to MySQL DB is successful!


[(11,
  'Chemistry',
  'Who proposed the plum pudding model of the atom?',
  'A),Niels Bohr,B),J.J. Thomson,C),Ernest Rutherford,D),John Dalton,B',
  'B',
  'Structure of Atom'),
 (12,
  'Chemistry',
  'What is the charge of a neutron?',
  'A),Positive,B),Negative,C),Neutral,D),Depends on the isotope,C',
  'C',
  'Structure of Atom'),
 (13,
  'Chemistry',
  'Which experiment led to the discovery of the nucleus?',
  'A),Cathode Ray Experime nt,B),Gold Foil Experiment,C),Oil Drop Experiment,D),Photoelectric Effect,B',
  'B',
  'Structure of Atom'),
 (14,
  'Chemistry',
  'What is the maximum number of electrons that can occupy a p -orbital?',
  'A),2,B),6,C),10,D),14,B',
  'B',
  'Structure of Atom'),
 (15,
  'Chemistry',
  'Who developed the quantum mechanical model of the atom?',
  'A),Werner Heisenberg,B),Niels Bohr,C),Erwin Schrödinger,D),J.J. Thomson,C',
  'C',
  'Structure of Atom'),
 (16,
  'Chemistry',
  'What is the principal quantum number primarily associated with?',
  'A),Sha

In [10]:
dict = {}
dict[1]=1
dict[2]=2
dict

{1: 1, 2: 2}

In [8]:
list=[1,1,2,3,4]
list

[1, 1, 2, 3, 4]

In [34]:
import os
import PyPDF2
import re

pdf_path = os.path.join("./content", "Chemistry Questions.pdf")

pdffile_handler = open(pdf_path, "rb")
#attach a reader object
pdf_reader_data = PyPDF2.PdfReader(pdffile_handler)

text = ""
for page_index in range(len(pdf_reader_data.pages)):
    text += pdf_reader_data.pages[page_index].extract_text()

# Regular Expression to capture Chapter Names
chapter_pattern = re.compile(r"Chapter (\d+):\s*(.*?)(?=\s*Chapter \d+:|$)", re.DOTALL)
chapters = chapter_pattern.findall(text)

for chapter in chapters:
    chapter_name = chapter[1]
    print(f"chapter_name:{chapter_name.split("\n")[0]}")

chapter_name:Basic concepts of chemistry  
chapter_name:Structure of Atom  
chapter_name:Classification of elements and periodicity  
chapter_name:States of Matter  


In [14]:
dict [1]=1
dict[1]=1
dict[2]=2
dict[2]=3
dict

{1: 1, 2: 3}