In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import re
import numpy as np
from datetime import datetime
from pathlib import Path

In [2]:
def getme(base, subfield):

    '''
    Return `.value.text` of the element that
    `base.find("field", {"name": subfield})` yields.

    Falls back to np.nan whenever any step of that lookup raises
    AttributeError (for instance when `find` returns None).
    '''

    try:
        field = base.find("field", {"name": subfield})
        text = field.value.text
    except AttributeError:
        return np.nan
    return text
    
def getme_date(base, subfield, t):

    '''
    Return the date or the time part of an ISO-8601 timestamp field.

    The timestamp is read from
    `base.find("field", {"name": subfield}).value.text` and parsed with
    `datetime.fromisoformat`.

    Parameters:
        base     -- element to search in
        subfield -- value of the field's "name" attribute
        t        -- "date" selects the date part; any other value selects
                    the time part

    Returns the selected part as an ISO-formatted string, or np.nan when
    the field/value is missing or its text is not a valid ISO timestamp.
    '''

    try:
        stamp = datetime.fromisoformat(base.find("field", {"name": subfield}).value.text)
    except (AttributeError, ValueError, TypeError):
        # AttributeError: field or value missing (find returned None).
        # ValueError / TypeError: text is empty, malformed or not a string;
        # one bad timestamp should not abort the whole export.
        return np.nan

    part = stamp.date() if t == "date" else stamp.time()
    return part.isoformat()
    
def getme_attribute(base, attr):

    '''
    Return `base.get(attr)`.

    If calling `.get` raises AttributeError (e.g. `base` is None),
    np.nan is returned instead.
    '''

    try:
        value = base.get(attr)
    except AttributeError:
        value = np.nan
    return value
    
def getme_attachment(base, subfield):

    '''
    Return a (filename, content type) pair for an attachment.

    The attachment is the result of `base.find("model", {"type": subfield})`;
    the pair is read from its "Filename" and "ContentType" fields.
    If any lookup step raises AttributeError, (np.nan, np.nan) is returned.
    '''

    try:
        attachment = base.find("model", {"type" : subfield})
        filename = attachment.find("field", {"name" : "Filename"}).value.text
        content_type = attachment.find("field", {"name" : "ContentType"}).value.text
    except AttributeError:
        return np.nan, np.nan
    return filename, content_type

def getme_from(base):

    '''
    Return the sender identifier of a message.

    Looks up the "From" modelField under `base`, then the "Identifier"
    field inside it, and returns that field's `.value.text`.
    np.nan is returned if any step raises AttributeError.
    '''

    try:
        sender = base.find("modelField", {"name": "From"})
        identifier = sender.find("field", {"name" : "Identifier"})
        return identifier.value.text
    except AttributeError:
        return np.nan
    
def getme_id(base):

    '''
    Return the "id" attribute of the element two levels above `base`
    (`base.parent.parent`), which the caller uses as the chat id.

    Returns np.nan when that ancestor does not exist (AttributeError),
    matching the fallback of the other getme_* helpers instead of
    aborting the export.
    '''

    try:
        return base.parent.parent.get("id")
    except AttributeError:
        return np.nan

In [3]:
def make_export(soup,TYPE):

    '''
    Build one row (dict) per child model found in the requested section.

    Parameters:
        soup -- parsed tree that is searched for
                <modelType type=parent_model>
        TYPE -- pair ((parent_model, child_model), columns) where `columns`
                maps an output column name to the source field/attribute
                name used to fill it

    Column handling:
        source == "TimeStamp"   -> getme_date (column name "date" selects
                                   the date part, anything else the time)
        column "erased"         -> attribute of the model element
        column "attachment"     -> attachment file name, plus an extra
                                   "type" column with its content type
        column "from" (or "od") -> sender identifier via getme_from
        column "chat_id"        -> id of the enclosing element (getme_id)
        anything else           -> text value of the named field (getme)

    Returns a list of dicts, one per child model.
    Raises IndexError if no matching modelType section exists.
    '''

    parent_model =  TYPE[0][0]
    child_model = TYPE[0][1]
    values = TYPE[1].items()

    print(parent_model, child_model, values)

    reader = soup.find("modelType", {"type" : parent_model})
    if reader is None:
        # Same exception type the old `find_all(...)[0]` raised, but with
        # a message that says what was missing.
        raise IndexError(f"no modelType section with type={parent_model!r} found")

    data = []

    for el in reader.find_all("model", {"type": child_model}):
        row = {}
        for column, source in values:

            if source == "TimeStamp":
                row[column] = getme_date(el, source, column)
            elif column == "erased":
                row[column] = getme_attribute(el, source)
            elif column == "attachment":
                row[column], row["type"] = getme_attachment(el, source)
            elif column in ("from", "od"):
                # The column is called "from" in data_types; the old check
                # only matched "od", so getme_from was never reached and the
                # tuple source was handed to the generic field lookup.
                row[column] = getme_from(el)
            elif column == "chat_id":
                row[column] = getme_id(el)
            else:
                row[column] = getme(el, source)

        data.append(row)

    return data

In [4]:
# Export configuration consumed by make_export:
#   key   -> (type of the parent modelType section, type of the child model)
#   value -> {output column name: source field / attribute name}
data_types = {
    ("Chat", "InstantMessage"): {
        "content": "Body",
        "name": "Name",
        "app": "SourceApplication",
        # "date" and "time" are both derived from the same TimeStamp field
        "date": "TimeStamp",
        "time": "TimeStamp",
        "erased": "deleted_state",
        "from": ("From", "Identifier"),
        "attachment": "Attachment",
        # no source name: make_export fills this column via getme_id
        "chat_id": "",
    },
}

**Note:** parsing very large XML files (about 1 GB) this way requires at least 32 GB of RAM, because the whole file is read and parsed into memory at once. Using lxml directly might be a better way to go for such files.

In [5]:
# Path of the XML report to convert.
xml = Path("path/to/report.xml")
target_dir = xml.parent # the csv will be saved to the same folder as the xml

# Hand the parser raw bytes so the encoding is taken from the XML itself
# rather than from the platform's default text encoding.
soup = bs(xml.read_bytes(), "xml").find("decodedData")

# Only the first (currently the only) entry of data_types is exported.
DT = next(iter(data_types.items()))
data = make_export(soup,DT)

df = pd.DataFrame(data)

# Strip phone-number-like digit runs and colons from the application name.
# Raw string: "\+" is not a valid escape in a normal string literal.
df["app"] = (
    df["app"]
    .str.replace(r"\+?[0-9]+", "", regex=True)
    .str.strip()
    .str.replace(":", "", regex=False)
)

# These are literal substrings, not patterns; regex=False makes that explicit
# and independent of the pandas version's default.
df["from"] = (
    df["from"]
    .str.replace("@s.whatsapp.net", "", regex=False)
    .str.replace("+", "", regex=False)
)
df["erased"] = (
    df["erased"]
    .str.replace("Intact", "0", regex=False)
    .str.replace("Deleted", "1", regex=False)
)
df["from"] = df["from"].fillna("owner") # if message was sent through viber, the 'from' part is empty. This is a hack and might backfire in unforeseen ways.

df.to_csv(target_dir / "all_chats.csv", index=False)

Chat InstantMessage dict_items([('content', 'Body'), ('name', 'Name'), ('app', 'SourceApplication'), ('date', 'TimeStamp'), ('time', 'TimeStamp'), ('erased', 'deleted_state'), ('from', ('From', 'Identifier')), ('attachment', 'Attachment'), ('chat_id', '')])
