In [None]:
import os

def get_all_xml_files_in_a_folder(folder_path):
    """Recursively collect the paths of every ``.xml`` file below a folder.

    Args:
        folder_path: Directory to walk with ``os.walk``.

    Returns:
        A list of paths, each built by joining the directory being visited
        with the file name. Only names ending in the lowercase suffix
        ``.xml`` are included; the list is empty when nothing matches.
    """
    matches = []

    for current_dir, _subdirs, names in os.walk(folder_path):
        # Keep only the XML files found in the directory being visited.
        matches.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.endswith('.xml')
        )

    return matches

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

def extract_data_from_xml(xml_file):
    """Parse one XML file into a DataFrame with one row per ``WRITING`` element.

    Each direct ``WRITING`` child of the document root contributes one row
    built from the text of its ``TITLE``, ``DATE``, ``TEXT`` and ``INFO``
    children. The text of the root's ``ID`` child is stored in every row
    under ``TrainSubjectId``.

    Args:
        xml_file: File name or file object accepted by ``ET.parse``.

    Returns:
        A DataFrame with columns ``Title``, ``Date``, ``Text``, ``Info`` and
        ``TrainSubjectId`` and a fresh 0..n-1 index (empty when the file has
        no ``WRITING`` elements), or ``None`` if the file cannot be read or
        parsed or one of the expected elements is missing. In the failure
        case an error message is printed.
    """
    columns = ['Title', 'Date', 'Text', 'Info']
    try:
        root = ET.parse(xml_file).getroot()

        # Collect plain dicts and build the DataFrame once afterwards;
        # concatenating a one-row frame per writing is quadratic in the
        # number of rows.
        rows = [
            {
                'Title': writing.find('TITLE').text,
                'Date': writing.find('DATE').text,
                'Text': writing.find('TEXT').text,
                'Info': writing.find('INFO').text,
            }
            for writing in root.findall('WRITING')
        ]
        train_subject_id = root.find('ID').text
    except Exception as e:
        # Deliberately broad: a single unreadable or malformed file (I/O
        # error, XML syntax error, missing element -> AttributeError on
        # ``None.text``) is reported and skipped rather than aborting a batch.
        print(f"Error parsing {xml_file}: {e}")
        return None

    # Passing ``columns`` keeps the expected schema even when there are no rows.
    individual_writings = pd.DataFrame(rows, columns=columns)
    individual_writings['TrainSubjectId'] = train_subject_id
    return individual_writings

In [None]:
def extract_data_from_folder_and_save_to_pickle(folder_path, save_to_path):
    """Parse every XML file under a folder, combine the rows and pickle them.

    Files are located with ``get_all_xml_files_in_a_folder`` and parsed with
    ``extract_data_from_xml``; files for which the parser returns ``None``
    are skipped.

    Args:
        folder_path: Directory searched for XML files.
        save_to_path: Destination passed to ``DataFrame.to_pickle``.

    Returns:
        The combined DataFrame that was written to ``save_to_path``. It is an
        empty DataFrame when no file could be parsed. Per-file index labels
        are kept as-is (no ``ignore_index``), so labels may repeat across
        files.
    """
    frames = []

    for xml_file in get_all_xml_files_in_a_folder(folder_path):
        data_of_xml_file = extract_data_from_xml(xml_file)
        # Identity check: ``== None`` on a DataFrame compares element-wise,
        # and using that result in ``if`` raises "truth value of a DataFrame
        # is ambiguous" for every successfully parsed file.
        if data_of_xml_file is None:
            continue
        frames.append(data_of_xml_file)

    # Concatenate once at the end; ``pd.concat`` rejects an empty list, so
    # fall back to an empty frame when nothing was parsed.
    df = pd.concat(frames) if frames else pd.DataFrame()

    df.to_pickle(save_to_path)

    return df