# about

Dieses Notebook dient der Übung des bisher Gelernten. Es sollen Daten mit spaCy exploriert werden.

## load libraries / packages and models

In [4]:
# load packages
import spacy
# Load the spaCy pipeline 'de_core_news_md' once and bind it to nlp_de;
# the tagging cells below call nlp_de(...) on the text to analyse.
nlp_de = spacy.load('de_core_news_md')

## load data

- as basic text string

In [5]:
# Short in-memory example sentence in which "Paris" and "Hilton" each
# occur twice (as parts of a person name and as place / hotel names).
text_sample = (
    "Paris Hilton reiste nach Paris "
    "um im dortigen Hilton zu übernachten"
)

- from file: note that the input data type for the NLP tagger is a string, which means that the file contents must first be read into a string (e.g. with `file.read()`, which already returns a `str`)
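    - note that spaCy limits the length of a single input text (`nlp.max_length`, 1,000,000 characters by default), so very long files may have to be split or the limit raised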
    - note that the encoding should be specified as best practice

In [53]:
# Read the complete sample file into a single string. The encoding is
# passed explicitly instead of relying on the platform default, and the
# context manager closes the handle as soon as the read is done.
with open("../data/sample.txt", mode="r", encoding="utf8") as sample_handle:
    file_sample = sample_handle.read()

# tag data

In [54]:
# Run the German pipeline on the short in-memory example sentence.
doc_sample_text = nlp_de(text_sample)

In [55]:
# Run the German pipeline on the text that was read from ../data/sample.txt.
doc_sample_file = nlp_de(file_sample)

In [59]:
# `doc` is the working alias used by the following cells; here it points
# at the document built from the sample file.
doc = doc_sample_file

# Continuation lines of each report start on a new line, indented by
# 14 spaces (this reproduces the layout of the printed block exactly).
field_separator = "\n" + " " * 14

# Print the linguistic annotations for tokens 10..19 of the document.
for token in doc[10:20]:
    report_fields = [
        f"Token: {token}  ",
        f"Lemmata: {token.lemma_} ",
        f"POS: {token.pos_}",
        f"explain POS: {spacy.explain(token.pos_)}",
        f"TAG: {token.tag_}",
        f"explain TAG: {spacy.explain(token.tag_)}",
        f"Dependency: {token.dep_}",
        f"explain Dependency: {spacy.explain(token.dep_)}",
        f"Entity: {token.ent_type_}",
    ]
    print(field_separator.join(report_fields))

Token: 


  
              Lemmata: 


 
              POS: SPACE
              explain POS: space
              TAG: _SP
              explain TAG: whitespace
              Dependency: dep
              explain Dependency: unclassified dependent
              Entity: 
Token: “  
              Lemmata: “ 
              POS: PUNCT
              explain POS: punctuation
              TAG: $(
              explain TAG: other sentence-internal punctuation mark
              Dependency: punct
              explain Dependency: punctuation
              Entity: 
Token: Oberleutnant  
              Lemmata: Oberleutnant 
              POS: NOUN
              explain POS: noun
              TAG: NN
              explain TAG: noun, singular or mass
              Dependency: sb
              explain Dependency: subject
              Entity: 
Token: möchten  
              Lemmata: mögen 
              POS: AUX
              explain POS: auxiliary
              TAG: VMFIN
              explain TAG

# check for named entities
## Personen

In [61]:
# Walk over every token of the document and print those whose entity
# type is 'PER', together with that label. This works per token, so an
# entity spanning several tokens is printed one token per line.
for token in doc:
    if token.ent_type_ != 'PER':
        continue
    print(token, token.ent_type_)


Herrn PER
Hauptmann PER
Sch PER
. PER
Szegedin PER
Szegedin PER
Szegedin PER
Szegedin PER
v. PER
R. PER
Zögernd PER
v. PER
W. PER
Pußta PER
“ PER
. PER
Nisch PER
Landfuhrwerke PER
Buuardschik PER
Tepe PER
Bulgarisch PER
. PER
Nie PER
Wau PER
, PER
wau PER
” PER
Konstantinopel PER
K. PER
ablaufen PER
: PER
Scheich PER
’s PER
Tunnel PER
Eskischekir PER
Motorpflug PER
Xenophon PER
Friedrich PER
Barbarossa PER
Kamelkarawanen PER
Tscham PER
Alan PER
Han PER
Tscham PER
Alan PER
ver PER
zeichnen PER
. PER
Alexander PER
der PER
Große PER
Gottfried PER
von PER
Bouillon PER
Ibrahim PER
Paschas PER
Befesti PER
Hans PER
Hans PER
Ma- PER
mure PER
Jslahie PER
Gülek PER
Gülek PER
Paulus PER
Alexander PER
des PER
Großen PER
gefährlichem PER
Bad PER
Gülek PER
H. PER
zusammentraf PER
. PER
Mamure PER
Amanus PER
Goltz PER
Tschan PER
Alan PER

  PER
in PER
v. PER
Mücke PER
Sven PER
Hedin PER
Bienenkörbe PER
ben PER
will PER
Hermon PER
“ PER
. PER
Hermon PER
Dera PER
’ PER
at PER
Schroff PER
Hell PER
Weibe

In [78]:
# Collect the distinct surface forms of all tokens tagged 'PER'.
# Using a set removes duplicates; sorting is applied only for display.
person_tokens = (token for token in doc if token.ent_type_ == 'PER')
pers_set = set(map(str, person_tokens))
print(sorted(pers_set))


['\n ', ' ', ',', '.', ':', '>', 'Abendland', 'Abraham', 'Alan', 'Alexander', 'Allah', 'Also', 'Amanus', 'Augenklappern', 'Baalbek', 'Bad', 'Barbarossa', 'Basar', 'Befesti', 'Begleiter', 'Bienenkörbe', 'Birseba', 'Blendender', 'Bouillon', 'Browning', 'Bulgarisch', 'Buuardschik', 'Christus', 'Damen', 'Decka', 'Dera', 'Durchgang', 'Einzeln', 'Eskischekir', 'Festessen', 'Friedrich', 'Geburtskirche', 'Goltz', 'Gottfried', 'Grabe', 'Große', 'Großen', 'Gülek', 'H.', 'Han', 'Hans', 'Hauptmann', 'Hedin', 'Hell', 'Hermon', 'Herrn', 'Himmels', 'Hnssin', 'Hussin', 'Ibrahim', 'Jahren', 'Jahrtausendelang', 'Jakobs', 'Jesus', 'Jslahie', 'Jungfrau', 'K', 'K.', 'Kaimakam', 'Kamelkarawanen', 'Kana', 'Kanonenplatz', 'Karl', 'Kino', 'Konstantinopel', 'Kurs', 'Landfuhrwerke', 'Ma-', 'Mamure', 'Mandoline', 'Maria', 'Marschsicherungen', 'Mastix', 'May', 'Mondaufgang', 'Motorpflug', 'Mücke', 'Nie', 'Nisch', 'Ns-', 'Paschas', 'Paulus', 'Pußta', 'R.', 'Rebekkas', 'Rückreise', 'Sauls', 'Sch', 'Scheich', 'Schmuc

In [114]:
# Collect the distinct lemmas of all tokens tagged 'LOC'.
# NOTE: despite its name, place_list is a set (duplicates are dropped);
# the sorted view below is only used for printing.
location_lemmas = (str(token.lemma_) for token in doc if token.ent_type_ == 'LOC')
place_list = set(location_lemmas)
print(sorted(place_list))

[' ', ' \n', ')', ',', '.', '15', '3800', ':', 'Abraham', 'Adana', 'Adrianopel', 'Ain', 'Alan', 'Aleppo', 'AlexandreM', 'Alexandrien', 'Alexinac', 'Aley', 'Altstadt', 'Amanns', 'Amanus', 'Amara', 'Antilibanon', 'Anziehendste', 'Araber', 'Arbata', 'Arisch', 'Arus', 'Asien', 'Asten', 'Au', 'Audja', 'Ausficht', 'Ausläufer', 'Baalbek', 'Bacchustempel', 'Bagdad', 'Bagdad-Bahn', 'Bagdadbahn', 'Bagrdam', 'Bahn', 'Bahnhof', 'Bahu', 'Balkan', 'Balkanzug', 'Balkanzuges', 'Barada', 'Basalt-', 'Baul', 'Beete', 'Beirut', 'BekÄa', 'Belgrad', 'Berg', 'Berlin', 'Berlin-Schöneberg', 'Besichtigun', 'Bethanien', 'Bethlehem', 'Bezirk', 'Birseba', 'Birseba—', 'Bithynien', 'Bogas', 'Bosporus', 'Bourgas', 'Bozanti', 'Brand', 'Brandstelle', 'Brunnenanlage', 'Brusthöhe', 'Brücke', 'Budapest', 'Bulgare', 'Bulgarien', 'Bund', 'Bundeslade', 'Burghof', 'Chabur', 'Cölesyrien', 'Dagh', 'Damaskus', 'Damaskus-Alep', 'Damaszener', 'Dardanellengefechten', 'Deika', 'Deutschland', 'Diar-bekir', 'Dicke', 'Donau', 'Dorfausg

# Visualisierung

In [80]:
from spacy import displacy

In [None]:
# Render the file-based document with displaCy's entity style ('ent');
# jupyter=True requests rendering inside the notebook.
displacy.render(doc_sample_file, style='ent', jupyter=True)

# Count entities

- return everything as list
- count list with `list.count()`

In [112]:
# For every token in the document, materialise its `children` as a list,
# giving one inner list per token in document order.
child_list = [[*token.children] for token in doc]

SyntaxError: invalid syntax (3306520044.py, line 2)

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'doc_sample_text'