In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Notebook-wide Spark session, built with Hive support enabled; getOrCreate()
# hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
import re
import html

# Matches one XML attribute written as ` name="value"`: a leading space, a name
# made of ASCII letters, then a double-quoted value with no double quote inside.
pattern = re.compile(' ([A-Za-z]+)="([^"]*)"')


def parse_line(line):
    """Extract the XML attributes found in one line of text.

    Args:
        line: A single line of the XML dump (e.g. a ``<row ... />`` element),
            or ``None``.

    Returns:
        A dict mapping each attribute name to its raw string value. Values are
        returned exactly as they appear between the quotes (entities are not
        unescaped). If an attribute name occurs more than once, the last
        occurrence wins. ``None`` or an empty line yields an empty dict, in
        line with the other helpers here that tolerate missing input.
    """
    if not line:
        # Null/empty rows would otherwise raise TypeError inside the UDF.
        return {}
    # findall() yields (name, value) tuples, which dict() consumes directly.
    return dict(pattern.findall(line))
# Spark UDF that decodes HTML entities in a column value; a falsy input
# (None or empty string) is passed through as None.
unescape = udf(lambda text: None if not text else html.unescape(text))

def read_tags_raw(tags_string):
    """Split a tag string such as ``<tag1><tag2>`` into ``['tag1', 'tag2']``.

    The input is HTML-unescaped first, so the entity-encoded form
    (``&lt;tag1&gt;&lt;tag2&gt;``) is accepted as well. A falsy input
    (``None`` or an empty string) produces an empty list.
    """
    if not tags_string:
        return []
    decoded = html.unescape(tags_string)
    # Peel off the outer angle brackets, then cut on the '><' between tags.
    inner = decoded.strip('>').strip('<')
    return inner.split('><')
    
# Spark UDF wrapper around read_tags_raw, declared to return an array of strings.
read_tags = udf(read_tags_raw, returnType=ArrayType(StringType()))

In [None]:
# Convert the Badges XML dump to Parquet:
#   1. read the file line by line and keep only lines containing '<row Id',
#   2. parse each line's attributes into a string -> string map,
#   3. project the badge fields with explicit casts and write the result.
# NOTE(review): Name is taken straight from the attribute map without going
# through `unescape` — confirm badge names never contain escaped entities.
(
    spark.read.text('file:///c/Users/Mariusz/Desktop/Badges.xml')
    .filter(col('value').like('%<row Id%'))
    .select(udf(parse_line, MapType(StringType(), StringType()))('value').alias('value'))
    .select(
        col('value.Id').cast('integer'),
        col('value.UserId').cast('integer'),
        col('value.Name'),
        col('value.Date').cast('timestamp'),
        col('value.Class').cast('integer'),
        col('value.TagBased').cast('boolean'),
    )
    .write.parquet('/data/stackoverflow/parquet/Badges')
)