In [17]:
import re

def clean_text(text):
    """Replace known garbled punctuation sequences with ASCII and strip whitespace.

    The sequences handled here look like dashes and curly quotes that were
    decoded with the wrong character encoding (presumably UTF-8 read as a
    single-byte code page -- confirm against the input files). Each one is
    mapped to a plain ``-``, ``"`` or ``'``; a stray ``Â`` is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Order matters: the bare two-character prefix is also the start of the
    # three-character sequences, so it must be replaced only after them.
    # Otherwise it consumes the prefix and leaves the third character behind.
    replacements = (
        ('â€“', '-'),
        ('â€œ', '"'),
        ('â€™', "'"),
        ('â€˜', "'"),
        ('â€', '"'),
        ('Â', ''),
    )
    for garbled, fixed in replacements:
        text = text.replace(garbled, fixed)
    return text.strip()

def parse_questions(file_path):
    """Parse a plain-text multiple-choice question file into a list of records.

    The file is read as UTF-8, falling back to Latin-1 if it does not decode.
    Every line is stripped and passed through ``clean_text``; blank lines are
    skipped. The remaining lines are classified by their prefix:

    * ``<digits>.`` followed by whitespace starts a new question.
    * ``(a)`` .. ``(d)`` (any case) gives the text of that option.
    * ``Answer: X.`` (any case, trailing dot required) gives the answer.
    * ``Reference:`` (any case) starts the reference text. Unclassified
      lines that follow are appended to it until an answer line or the
      next question.
    * Any other line that comes after the question line, before the first
      option, answer or reference line, continues the question text.

    Lines before the first question and unclassified lines elsewhere are
    ignored.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts in file order, each with the keys ``"sr no"``
        (1-based running number), ``"question"``, ``"a"``, ``"b"``, ``"c"``,
        ``"d"``, ``"sol"`` (upper-case answer letter, or ``""`` if none was
        found) and ``"ref"``.

    Raises:
        OSError: If the file cannot be opened.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        print("UnicodeDecodeError: Trying with 'latin-1' encoding")
        with open(file_path, 'r', encoding='latin-1') as file:
            lines = file.readlines()

    # Regex patterns
    question_start = re.compile(r'^\d+\.\s')
    option_pattern = re.compile(r'^\(([a-d])\)\s*(.*)', re.IGNORECASE)
    answer_pattern = re.compile(r'^Answer:\s*([A-D])\.', re.IGNORECASE)
    reference_start_pattern = re.compile(r'^Reference:', re.IGNORECASE)

    data = []
    record = None                 # the question currently being filled in
    in_question_text = False      # True until an option/answer/reference line
    collecting_reference = False  # True while reference continuation lines are expected

    for raw_line in lines:
        line = clean_text(raw_line.strip())
        if not line:
            continue

        if question_start.match(line):
            # The record is appended right away and then filled in place, so
            # no separate "flush the last question" step is needed.
            record = {
                "sr no": len(data) + 1,
                "question": line,
                "a": "", "b": "", "c": "", "d": "",
                "sol": "", "ref": "",
            }
            data.append(record)
            in_question_text = True
            collecting_reference = False
            continue

        if record is None:
            # Nothing before the first question belongs to any record.
            continue

        option_match = option_pattern.match(line)
        if option_match:
            in_question_text = False
            record[option_match.group(1).lower()] = option_match.group(2)
            continue

        answer_match = answer_pattern.match(line)
        if answer_match:
            in_question_text = False
            collecting_reference = False
            record['sol'] = answer_match.group(1).upper()
            continue

        reference_match = reference_start_pattern.match(line)
        if reference_match:
            in_question_text = False
            collecting_reference = True
            # Slice off exactly the matched prefix: this honours the
            # case-insensitive match and leaves later text untouched.
            record['ref'] = line[reference_match.end():].strip()
            continue

        if collecting_reference:
            # Strip so an empty "Reference:" line does not leave a leading space.
            record['ref'] = (record['ref'] + ' ' + line).strip()
        elif in_question_text:
            record['question'] += ' ' + line

    return data
