# CSV to XML

In [648]:
from datetime import datetime

def convert_date_format(date_str):
    return date_str
    try:
        # Parse the date from MM/DD/YYYY format
        date_obj = datetime.strptime(date_str, '%d/%m/%Y')
        
        # Format the date to YYYY-MM-DD format
        new_date_str = date_obj.strftime('%Y-%m-%d')
        return new_date_str
    except ValueError as e:
        return f"Error: {e}"

def format_with_leading_zero(value):
    # Check if the value is a single-digit integer
    if isinstance(value, int) and 0 <= value <= 9:
        return f"{value:02d}"  # Format with leading zero
    else:
        return str(value)  # Convert to string without leading zero

def remove_specific_chars(input_string):
    chars_to_remove = ' -;=?-~ -ɏḀ-ỿ–'
    replacements = {'ă': 'a'}  # Characters to be replaced
    # Create a translation table for replacements
    translation_table = str.maketrans(replacements)

    # Replace specified characters
    replaced_string = input_string.translate(translation_table)
    replaced_characters = ''.join(char for char in replaced_string if char not in chars_to_remove)

    #print(f'{replaced_string}, {replaced_characters}')

    #if replaced_characters != input_string:
        #print(f'{input_string} to {replaced_characters}')
    
    # Check if the input string is empty
    if not replaced_characters:
        replaced_characters = 'NoSurname'    
    return replaced_characters

In [693]:
import pandas as pd
import xml.etree.ElementTree as ET

# Path to your CSV file
csv_file_path = 'Student_Data_HESA_12_30.csv'
#csv_file_path = 'module_id.csv'

# Read the CSV file
csv_data = pd.read_csv(csv_file_path)
csv_data['SID'] = csv_data['SID'].apply(lambda x: '{:.0f}'.format(x) if pd.notna(x) else None)

# Create the root XML element
root = ET.Element("DataFutures")

# Load the CSV file
#module_df = pd.read_csv('module_id.csv')

# Number of 'M' columns per group
m_cols = ['MODINSTID', 'MODID', 'MODINSTENDDATE', 'MODINSTSTARTDATE']
#num_groups = int((len(module_df.columns) - 1) / len(m_cols))  # Subtract 1 for 'ID', divide by number of M columns
num_groups = 8
# Split and concatenate
all_dfs = []
for i in range(num_groups):
    pre_cols = [f'{m}{i+1}' for m in m_cols]
    cols = ['SID'] + pre_cols
    temp_df = csv_data[cols].copy()
    temp_df.columns = ['SID'] + m_cols  # Rename columns to standard names
    all_dfs.append(temp_df)

# Concatenate all DataFrames
converted_module_df = pd.concat(all_dfs, ignore_index=True)

# Saving the transformed data to a new CSV file
converted_module_df.to_csv('transformed_file.csv', index=False)

# Applying the detailed association mapping for all students
for _, row in csv_data.iterrows():
    if row['SID'] in ['19100320360960048', '20100320360960168', '20100320360960240', '23100320360260128', '23100320360260368', '23100320360270140', '23100320360270208']:
        continue
        
    student = ET.SubElement(root, "Student")

    # Direct mappings
    ET.SubElement(student, "SID").text = str(row['SID']) if pd.notna(row['SID']) else None
    ET.SubElement(student, "BIRTHDTE").text = convert_date_format(str(row['BIRTHDTE'])) if pd.notna(row['BIRTHDTE']) else None
    ET.SubElement(student, "FNAMES").text = (str(row['FNAMES'])) if pd.notna(row['FNAMES']) else None
    ET.SubElement(student, "GENDERID").text = format_with_leading_zero((row['GENDERID'])) if pd.notna(row['GENDERID']) else None
    ET.SubElement(student, "NATION").text = (str(row['NATION'])) if pd.notna(row['NATION']) else None
    ET.SubElement(student, "RELIGION").text = (str(row['RELIGION'])) if pd.notna(row['RELIGION']) else None    
    ET.SubElement(student, "SEXID").text = str(row['SEXID']) if pd.notna(row['SEXID']) else None
    ET.SubElement(student, "SURNAME").text = remove_specific_chars(str(row['SURNAME'])) if pd.notna(row['SURNAME']) else 'NoSurname'
    ET.SubElement(student, "TTACCOM").text = format_with_leading_zero((row['TTACCOM'])) if pd.notna(row['TTACCOM']) else None            
    ET.SubElement(student, "TTPCODE").text = (str(row['TTPCODE'])) if pd.notna(row['TTPCODE']) else None        
    
    # Nested elements
    disability = ET.SubElement(student, "Disability")
    ET.SubElement(disability, "DISABILITY").text = str(row['DISABILITY']) if pd.notna(row['DISABILITY']) else None

    engagement = ET.SubElement(student, "Engagement")
    ET.SubElement(engagement, "NUMHUS").text = str(row['NUMHUS']) if pd.notna(row['NUMHUS']) else None
    if pd.notna(row['ENGEXPECTEDENDDATE']):
        ET.SubElement(engagement, "ENGEXPECTEDENDDATE").text = convert_date_format(str(row['ENGEXPECTEDENDDATE']))
    engStartDate = convert_date_format(str(row['ENGSTARTDATE'])) if pd.notna(row['ENGSTARTDATE']) else None
    ET.SubElement(engagement, "ENGSTARTDATE").text = engStartDate
    ET.SubElement(engagement, "FEEELIG").text = format_with_leading_zero((row['FEEELIG'])) if pd.notna(row['FEEELIG']) else None

    #entry_profile = ET.SubElement(engagement, "EntryProfile")
    #entryQualificationAward = ET.SubElement(entry_profile, "EntryQualificationAward")
    #ET.SubElement(entryQualificationAward, "ENTRYQUALAWARDID").text = str(row['QUALAWARDID']) if pd.notna(row['QUALAWARDID']) else None
    #ET.SubElement(entryQualificationAward, "ENTRYQUALAWARDRESULT").text = str(row['QUALAWARDRESULT']) if pd.notna(row['QUALAWARDRESULT']) else None
    #ET.SubElement(entryQualificationAward, "QUALTYPEID").text = str(row['QUALID']) if pd.notna(row['QUALID']) else None

    #entryQualificationSubject = ET.SubElement(entryQualificationAward, "EntryQualificationSubject")    
    #ET.SubElement(entryQualificationSubject, "SUBJECTID").text = str(row['SUBJECTID']) if pd.notna(row['SUBJECTID']) else None

    if pd.notna(row['ENGENDDATE']):
        leaver = ET.SubElement(engagement, "Leaver")
        ET.SubElement(leaver, "ENGENDDATE").text = convert_date_format(str(row['ENGENDDATE'])) if pd.notna(row['ENGENDDATE']) else None
        if pd.notna(row['RSNENGEND']):
            ET.SubElement(leaver, "RSNENGEND").text = format_with_leading_zero(int(row['RSNENGEND']))

    #studentAccreditationAim = ET.SubElement(engagement, "StudentAccreditationAim")
    #ET.SubElement(studentAccreditationAim, "STUACCID").text = str(row['STUACCID']) if pd.notna(row['STUACCID']) else None

    if pd.notna(row['SCSSTARTDATE']):
        studentCourseSession = ET.SubElement(engagement, "StudentCourseSession")
        ET.SubElement(studentCourseSession, "SCSESSIONID").text = str(row['SCSESSIONID']) if pd.notna(row['SCSESSIONID']) else None
        ET.SubElement(studentCourseSession, "COURSEID").text = str(row['COURSEID']) if pd.notna(row['COURSEID']) else None
        ET.SubElement(studentCourseSession, "INVOICEFEEAMOUNT").text = str(row['INVOICEFEEAMOUNT']) if pd.notna(row['INVOICEFEEAMOUNT']) else None
        if pd.notna(row['SCSENDDATE']):
            ET.SubElement(studentCourseSession, "SCSENDDATE").text = convert_date_format(str(row['SCSENDDATE']))
        if pd.notna(row['SCSEXPECTEDENDDATE']):
            ET.SubElement(studentCourseSession, "SCSEXPECTEDENDDATE").text = convert_date_format(str(row['SCSEXPECTEDENDDATE']))
        ET.SubElement(studentCourseSession, "SCSFEEAMOUNT").text = str(row['SCSFEEAMOUNT']) if pd.notna(row['SCSFEEAMOUNT']) else None
        ET.SubElement(studentCourseSession, "SCSMODE").text = format_with_leading_zero((row['SCSMODE'])) if pd.notna(row['SCSMODE']) else None
        if pd.notna(row['SCSSTARTDATE']):
            ET.SubElement(studentCourseSession, "SCSSTARTDATE").text = convert_date_format(str(row['SCSSTARTDATE'])) if pd.notna(row['SCSSTARTDATE']) else ''
            if pd.notna(row['SESSIONYEARID']):
                ET.SubElement(studentCourseSession, "SESSIONYEARID").text = str(row['SESSIONYEARID'])
        #if pd.notna(row['STULOAD']):
        #    ET.SubElement(studentCourseSession, "STULOAD").text = str(row['STULOAD'])
        if pd.notna(row['YEARPRG']) and pd.notna(row['SCSSTARTDATE']):
            yearprg = row['YEARPRG']
            ET.SubElement(studentCourseSession, "YEARPRG").text = str(int(yearprg))

    sid = str(row['SID'])
    student_specific_module_df = converted_module_df[converted_module_df['SID'] == sid]
    if len(student_specific_module_df) > 0:
        count = 1
        for __, module_row in student_specific_module_df.iterrows():
            if pd.notna(module_row['MODINSTID']):
                moduleInstance = ET.SubElement(studentCourseSession, "ModuleInstance")
                ET.SubElement(moduleInstance, "MODINSTID").text = str(row['SID'].strip()+module_row['MODINSTID'].strip()+str(count).strip())
                ET.SubElement(moduleInstance, "MODID").text = str(module_row['MODID']).strip() if pd.notna(module_row['MODID']) else None
                ET.SubElement(moduleInstance, "MODINSTENDDATE").text = convert_date_format(str(module_row['MODINSTENDDATE'])) if pd.notna(module_row['MODINSTENDDATE']) else None
                ET.SubElement(moduleInstance, "MODINSTSTARTDATE").text = convert_date_format(str(module_row['MODINSTSTARTDATE'])) if pd.notna(module_row['MODINSTSTARTDATE']) else None
                count = count + 1

    if pd.notna(row['SCSSTARTDATE']):
        studyLocation = ET.SubElement(studentCourseSession, "StudyLocation")
        ET.SubElement(studyLocation, "STUDYLOCID").text = str(row['STUDYLOCID']) if pd.notna(row['STUDYLOCID']) else None
        ET.SubElement(studyLocation, "STUDYPROPORTION").text = str(row['STUDYPROPORTION']) if pd.notna(row['STUDYPROPORTION']) else None

        if row['COURSEID']=='8N101':                
            supervisorAllocation = ET.SubElement(studentCourseSession, "SupervisorAllocation")
            ET.SubElement(supervisorAllocation, "SUPALLID").text = str(row['SUPALLID']) if pd.notna(row['SUPALLID']) else None        
            ET.SubElement(supervisorAllocation, "SUPALLPROP").text = str(row['SUPALLPROP']) if pd.notna(row['SUPALLPROP']) else None                
            
            fundingAndMonitoring = ET.SubElement(studentCourseSession, "FundingAndMonitoring")
            ET.SubElement(fundingAndMonitoring, "FUNDLENGTH").text = format_with_leading_zero(int(row['FUNDLENGTH'])) if pd.notna(row['FUNDLENGTH']) else '02'
            ET.SubElement(fundingAndMonitoring, "NONREGFEE").text = '01'

# Path to your CSV file
csv_file_path = 'Student Data HESA 30 Nov 1330.csv'

# Read the CSV file
csv_data = pd.read_csv(csv_file_path)

venue = ET.SubElement(root, "Venue")
ET.SubElement(venue, "VENUEID").text    = "A"
ET.SubElement(venue, "POSTCODE").text   = "WC1E 7HX"
ET.SubElement(venue, "VENUENAME").text  = "Birkbeck College, University of London"
ET.SubElement(venue, "VENUEUKPRN").text = "10032036"

# Applying the detailed association mapping for all students
for _, row in csv_data.iterrows():
    break
    module = ET.SubElement(root, "Module")

    # Direct mappings
    ET.SubElement(module, "MODID").text = str(row['MODID']) if pd.notna(row['MODID']) else None
    ET.SubElement(module, "FTE").text = (str(row['FTE'])) if pd.notna(row['FTE']) else None
    ET.SubElement(module, "MTITLE").text = (str(row['MTITLE'])) if pd.notna(row['MTITLE']) else None
    
    engagement = ET.SubElement(module, "ModuleCostCentre")
    ET.SubElement(engagement, "COSTCN").text = str(row['COSTCN']) if pd.notna(row['COSTCN']) else None
    ET.SubElement(engagement, "COSTCNPROPORTION").text = (str(row['COSTCNPROPORTION'])) if pd.notna(row['COSTCNPROPORTION']) else None
    #break

# Path to save the output XML file
output_xml_file_path = 'Student_Data_HESA_all_V2.xml'

# Save the XML to a file
tree = ET.ElementTree(root)
tree.write(output_xml_file_path)
print('All generation completed.')

All generation completed.


In [694]:
from lxml import etree

def validate_xml(xsd_path, xml_path):
    try:
        # Load and parse the XSD file
        with open(xsd_path, 'r') as schema_file:
            schema_doc = etree.parse(schema_file)
        schema = etree.XMLSchema(schema_doc)

        # Load and parse the XML file
        with open(xml_path, 'r') as xml_file:
            xml_doc = etree.parse(xml_file)

        # Validate the XML against the schema
        if schema.validate(xml_doc):
            print("XML is valid against the XSD.")
        else:
            print("XML is invalid against the XSD. Errors:")
            for error in schema.error_log:
                print(error.message)

    except etree.XMLSchemaParseError as e:
        print("Error in the XSD file:", e)
    except Exception as e:
        print("An error occurred:", e)

#xsd_file = 'your_xsd_file.xsd'  # Replace with your XSD file path
#xml_file = 'your_xml_file.xml'  # Replace with your XML file path

#validate_xml(xsd_file, xml_file)

In [695]:
xml_file = 'Student_Data_HESA_all_V2.xml'  # Replace with your XSD file path
xsd_file = '22056_1_10_0.xsd'  # Replace with your XML file path

validate_xml(xsd_file, xml_file)

XML is invalid against the XSD. Errors:
Element 'FundingAndMonitoring': This element is not expected. Expected is ( SupervisorAllocation ).
Element 'FundingAndMonitoring': This element is not expected. Expected is ( SupervisorAllocation ).
Element 'TTPCODE': [facet 'pattern'] The value 'UBA4 0NB' is not accepted by the pattern '[A-Za-z]{1,2}[0-9Rr][0-9A-Za-z]? ?([0-9][ABD-HJLNP-UW-Zabd-hjlnp-uw-z]{2})?'.


In [406]:
import pandas as pd
import xml.etree.ElementTree as ET

# Path to your CSV file
csv_file_path = 'Student Data HESA 30 Nov 1330.csv'

# Read the CSV file
csv_data = pd.read_csv(csv_file_path)

# Create the root XML element
root = ET.Element("DataFutures")

# Applying the detailed association mapping for all students
for _, row in csv_data.iterrows():
    module = ET.SubElement(root, "Module")

    # Direct mappings
    ET.SubElement(module, "MODID").text = str(row['MODID']) if pd.notna(row['MODID']) else None
    ET.SubElement(module, "FTE").text = (str(row['FTE'])) if pd.notna(row['FTE']) else None
    ET.SubElement(module, "MTITLE").text = (str(row['MTITLE'])) if pd.notna(row['MTITLE']) else None
    
    engagement = ET.SubElement(module, "ModuleCostCentre")
    ET.SubElement(engagement, "COSTCN").text = str(row['COSTCN']) if pd.notna(row['COSTCN']) else None
    ET.SubElement(engagement, "COSTCNPROPORTION").text = (str(row['COSTCNPROPORTION'])) if pd.notna(row['COSTCNPROPORTION']) else None
    #break
 
# Path to save the output XML file
output_xml_file_path = 'Student_Data_HESA_30_Nov_1330.xml'

# Save the XML to a file
tree = ET.ElementTree(root)
tree.write(output_xml_file_path)
print('All generation completed.')

All generation completed.
