In [None]:
! python3 --version

In [None]:
import pandas as pd
import re
import xml.dom.minidom
import warnings
warnings.filterwarnings("ignore")

In [None]:
# for file unpacking, if needed
#! xzcat data.tsv.xz > data.tsv

In [None]:
infile = 'data.tsv'

# Hand the path straight to pandas, which opens, decodes (UTF-8) and closes
# the tab-separated file itself.
df = pd.read_table(infile, encoding='utf-8')
df.tail()

# Statistics

In [None]:
df.info()

In [None]:
df.char_count.sum()

In [None]:
df.nlargest(3, 'char_count')

In [None]:
df.describe(include='object')

In [None]:
df.dateline.value_counts()    # medium_name

In [None]:
df[df.rubric=='Briefe an die NZZ']

In [None]:
df.groupby('rubric').char_count.mean().round(0)

In [None]:
df.sort_values(by='rubric', key=lambda col: col.str.lower())

# Search in various fields

In [None]:
# Search a text column with a regular expression.
# Example: select all items with the strings *kauf* OR *haus* in their titles.
# na=False makes rows with a missing title count as "no match"; without it the
# mask would contain NaN for those rows and could not be used for selection.
df[df['head'].str.contains('kauf|haus', regex=True, na=False)]

In [None]:
# Select all items from rubric "Meinung und Debatte" containing "schlecht" in the fulltext.
# na=False on both searches excludes rows with a missing rubric or text
# explicitly, instead of relying on how NaN propagates through the combined mask.
in_rubric = df['rubric'].str.contains('Meinung und Debatte', regex=True, na=False)
mentions_word = df['content'].str.contains('schlecht', regex=True, na=False)
df[in_rubric & mentions_word]

In [None]:
# Select all items whose rubric starts with "NZZ" (rows with a missing rubric are ignored)
df[df['rubric'].str.contains(r'^NZZ', na=False)]

# Handle XML tags

## Delete all XML tags - quick & plain

In [None]:
# View sample XML content
df.content[87]

In [None]:
# Function to remove all mark-up tags
def remove_tags(text_with_markup):
    """Strip mark-up tags from a string and tidy up the resulting spacing.

    Every tag is replaced by a single space (so words separated only by a tag
    do not run together), runs of spaces are then collapsed to one space, and
    leading/trailing whitespace is removed.

    Parameters
    ----------
    text_with_markup : str
        Text that may contain tags such as ``<p>`` or ``</au>``.

    Returns
    -------
    str
        The text with all tags removed.
    """
    # A tag is '<', then one or more characters that are neither '<' nor '>',
    # then '>'. Excluding '>' inside the tag matters: otherwise a literal '>'
    # in the running text extends the match and real content gets deleted
    # (e.g. '<p>a > b</p>' would lose the 'a').
    tag_pattern = re.compile('<[^<>]+>')
    cleantext = tag_pattern.sub(' ', text_with_markup)
    # Replacing adjacent tags leaves several spaces in a row; squeeze them.
    cleantext = re.sub(' +', ' ', cleantext)
    return cleantext.strip()

In [None]:
remove_tags(df.content[87])

In [None]:
# Count characters
len(remove_tags(df.content[87]))

In [None]:
# Count words
len(remove_tags(df.content[87]).split())

In [None]:
# Count string in text
remove_tags(df.content[87]).count("Bühne")

## Write XML content to file

In [None]:
# Save the XML content of one item to a file.
# The encoding is fixed to UTF-8 instead of the platform default, so that
# non-ASCII characters (e.g. umlauts) are written the same way on every system.
# NOTE(review): assumes the content carries no XML declaration naming a
# different encoding — confirm against the data.
filename = 'winti_culture.xml'
with open(filename, 'w', encoding='utf-8') as f:
    f.write(df.content[94])

## Parse XML

In [None]:
# Parse the XML file 'winti_culture.xml' into a DOM document
doc = xml.dom.minidom.parse('winti_culture.xml')

# Alternative: parse XML directly from a data field instead of a file
# doc = xml.dom.minidom.parseString(df.content[1])

## Check all XML tags in a text

In [None]:
# Collect the distinct tag names of all elements in the parsed document
all_elements = doc.getElementsByTagName("*")
tags = {element.tagName for element in all_elements}
print(tags)

In [None]:
# For definitions of the tags see the Swissdox@LiRI Wiki, embedded here as an inline frame
from IPython.display import IFrame
website_url = "https://liri.linguistik.uzh.ch/wiki/langtech/swissdox/core/content"
# Size of the embedded frame, passed to IFrame as width/height
width = 800
height = 400
iframe = IFrame(website_url, width=width, height=height)
# NOTE(review): `display` is not imported in this cell; presumably it is the
# function IPython makes available in notebook sessions — confirm
display(iframe)

## Read the whole text as pretty-printed XML

In [None]:
# Pretty-print every <tx> element (the whole text) of the parsed document as indented XML
tags = doc.getElementsByTagName('tx') 
for tag in tags:
    print(tag.toprettyxml())

## Read certain XML tags and their content

In [None]:
# Pretty-print every <au> element of the parsed document, including its content
tags = doc.getElementsByTagName('au')     
for tag in tags:
    print(tag.toprettyxml())

In [None]:
# Pretty-print every <lg> element of the parsed document, including its content
tags = doc.getElementsByTagName('lg')     # p
for tag in tags:
    print(tag.toprettyxml())

In [None]:
# Print the text of every <lg> element with all mark-up stripped;
# the text of nested sub-elements is kept, only the tags themselves are removed
tags = doc.getElementsByTagName('lg')     # ka
for tag in tags:
    print(remove_tags(tag.toxml()))

In [None]:
# Print the text of every <p> element with all mark-up stripped;
# the text of nested sub-elements is kept, only the tags themselves are removed
tags = doc.getElementsByTagName('p')     
for tag in tags:
    print(remove_tags(tag.toxml()))

In [None]:
df.content[1][:5000]

In [None]:
# Parse XML from datafield
doc = xml.dom.minidom.parseString(df.content[1])

In [None]:
# Collect the distinct tag names of all elements in the parsed document
all_elements = doc.getElementsByTagName("*")
tags = {element.tagName for element in all_elements}
print(tags)

In [None]:
# Get contents of a certain XML tag of a certain row
def get_tags(tag, i):
    """Return the plain-text content of every ``<tag>`` element in one row.

    Reads the notebook-level DataFrame ``df``: the ``content`` value stored
    under index label ``i`` is parsed as XML, and each element named ``tag``
    is serialised and stripped of mark-up with ``remove_tags``.

    Parameters
    ----------
    tag : str
        Element name to look for, e.g. ``'au'``.
    i : index label
        Label of the row in ``df`` whose ``content`` is parsed.

    Returns
    -------
    list of str
        One cleaned string per matching element; an empty list if there is
        no matching element or the row has no content.

    Raises
    ------
    xml.parsers.expat.ExpatError
        If the content is present but not well-formed XML.
    """
    text = df.content[i]
    # A missing value (NaN/None) or an empty string cannot be parsed as XML.
    # Treat it as "no elements" so that a loop over all rows is not aborted
    # by a single row without content.
    if not isinstance(text, (str, bytes)) or not text.strip():
        return []
    doc = xml.dom.minidom.parseString(text)
    return [remove_tags(t.toxml()) for t in doc.getElementsByTagName(tag)]
    
get_tags('au', 103)

## Select authors

In [None]:
# Get the authors (<au> elements) of each row's "content" into a new column.
# The column is assigned in one step. Item-wise chained assignment
# (df['authors'][i] = ...) raises a SettingWithCopyWarning, which this notebook
# silences, and has no effect at all when pandas copy-on-write is active.
df['authors'] = [get_tags('au', i) for i in df.index]

In [None]:
# Get the lead texts (<ld> elements) of each row's "content" into a new column.
# The column is assigned in one step. Item-wise chained assignment
# (df['opener'][i] = ...) raises a SettingWithCopyWarning, which this notebook
# silences, and has no effect at all when pandas copy-on-write is active.
df['opener'] = [get_tags('ld', i) for i in df.index]

In [None]:
df.head()

In [None]:
# Caveat: Unfortunately, not all authors are tagged with <au>
df.content[3]

In [None]:
# Example: Author information in the last <p> element
df.content[201][6000:]

In [None]:
# Author information appears in various forms: show the entries at
# positions 20-39 of the authors column as a sample
df['authors'].iloc[20:40].tolist()