# Python für NLP

Angelehnt an D. Sarkar: Text Analytics with Python (2nd Edition)

Ergänzt und aktualisiert von Heiko Rölke

## Arbeiten mit Zeichenketten (Strings)

In [1]:
# bind a string object to a name
new_string = "This is a String"

# every Python object has an identity, a type and a value
object_facts = (
    ('ID:', id(new_string)),      # the object identifier (address)
    ('Type:', type(new_string)),  # the object type
    ('Value:', new_string),       # the object value
)
for label, detail in object_facts:
    print(label, detail)

ID: 1334184661744
Type: <class 'str'>
Value: This is a String


In [None]:
# simple string
simple_string = 'Hello!' + " I'm a simple string"
print(simple_string)

In [4]:
# multi-line string: a triple-quoted literal may span several lines; every line
# break becomes a newline character, which the repr shown below displays as \n
multi_line_string = """Hello I'm
a multi-line
string!"""

multi_line_string

"Hello I'm\na multi-line\nstring!"

In [7]:
print(multi_line_string)

Hello I'm
a multi-line
string!


In [8]:
# Normal string whose backslashes start escape sequences, leading to a wrong file path!
# Here \t is read as a tab, \n as a newline and \f as a form feed.
escaped_string = "C:\the_folder\new_dir\file.txt"
print(escaped_string)  # will cause errors if we try to open a file here

C:	he_folder
ew_dirile.txt


In [9]:
# raw string keeping the backslashes in its normal form
raw_string = r'C:\the_folder\new_dir\file.txt'
print(raw_string)

C:\the_folder\new_dir\file.txt


In [14]:
# unicode string literals
string_with_unicode = 'H\u00e8llo!'
print(string_with_unicode)

Hèllo!


In [15]:
more_unicode = 'I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'
print(more_unicode)

I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?


In [16]:
print(string_with_unicode + '\n' + more_unicode)

Hèllo!
I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?


In [22]:
' '.join([string_with_unicode, more_unicode])

'Hèllo! I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'

## Operationen mit Zeichenketten

### Verkettung

In [None]:
'Hello 😊' + ' and welcome ' + 'to Python 🐍!'

In [None]:
# adjacent string literals are concatenated implicitly, no + operator is needed
'Hello 😊' ' and welcome ' 'to Python 🐍!'

In [10]:
# concatenation of variables and literals
s1 = 'Python 💻!'
'Hello 😊 ' + s1

'Hello 😊 Python 💻!'

In [None]:
# Implicit concatenation only works between string literals, never between a
# literal and a variable: the expression below is rejected by the parser.
# Compiling it from a string lets the cell display the SyntaxError without
# aborting a "Restart Kernel -> Run All".
invalid_expression = "'Hello 😊 ' s1"
try:
    compile(invalid_expression, '<cell>', 'eval')
except SyntaxError as err:
    print('SyntaxError:', err.msg)

In [11]:
# some more ways of concatenating strings
s2 = '--🐍Python🐍--'
s2 * 5

'--🐍Python🐍----🐍Python🐍----🐍Python🐍----🐍Python🐍----🐍Python🐍--'

In [12]:
# only the value of the last expression in a cell is displayed, so the result
# of this first concatenation is computed but not shown
s1 + s2
(s1 + s2) * 3

'Python 💻!--🐍Python🐍--Python 💻!--🐍Python🐍--Python 💻!--🐍Python🐍--'

In [None]:
# concatenating several strings together in parentheses
s3 = ('This '
      'is another way '
      'to concatenate '
      'several strings!')
s3

In [None]:
# checking for substrings in a string
'way' in s3

In [None]:
'python' in s3

In [None]:
# computing total length of the string
len(s3)

## Indexing and Slicing

In [23]:
s = 'PYTHON'
# list each character together with its zero-based position in the string
for position, letter in enumerate(s):
    print('Character ->', letter, 'has index->', position)

Character -> P has index-> 0
Character -> Y has index-> 1
Character -> T has index-> 2
Character -> H has index-> 3
Character -> O has index-> 4
Character -> N has index-> 5


In [24]:
s[0], s[1], s[2], s[3], s[4], s[5]

('P', 'Y', 'T', 'H', 'O', 'N')

In [None]:
# negative indexes count from the end of the string: -1 is the last character
s[-1], s[-2], s[-3], s[-4], s[-5], s[-6]

In [25]:
## String slicing
s[:]

'PYTHON'

In [26]:
s[1:4]

'YTH'

In [27]:
s[:3], s[3:]

('PYT', 'HON')

In [None]:
s[-3:]

In [None]:
s[:3] + s[3:]

In [28]:
# the first three plus the last three characters rebuild the 6-character string
s[:3] + s[-3:]

'PYTHON'

### String slicing with offsets (step size)

In [29]:
s[::1]  # no offset

'PYTHON'

In [30]:
s[::2]  # take every 2nd character, starting at index 0

'PTO'

In [31]:
s[::-1]  # reverses the string

'NOHTYP'

### Zeichenketten sind unveränderlich

(anders als andere Iterables wie Listen)

In [None]:
# strings are immutable, hence item assignment raises a TypeError;
# the error is caught and displayed so that "Run All" can continue past this cell
try:
    s[0] = 'X'
except TypeError as err:
    print('TypeError:', err)

In [None]:
# show the identity of the string object currently bound to s
print('Original String id:', id(s))
# creates a new string from 'X' and a slice of the old one, then rebinds s to it
s = 'X' + s[1:]
print(s)
# the id printed here belongs to the newly created object
print('New String id:', id(s))

## Useful String methods

### Case conversions

In [32]:
s = 'python is great'
s.capitalize()

'Python is great'

In [33]:
s.upper()

'PYTHON IS GREAT'

In [34]:
s.title()

'Python Is Great'

### String replace

In [35]:
s.replace('python', 'NLP')

'NLP is great'

### Numeric checks

In [36]:
'12345'.isdecimal()

True

In [37]:
'apollo11'.isdecimal()

False

### Alphabet checks

In [38]:
'python'.isalpha()

True

In [39]:
'number1'.isalpha()

False

### Alphanumeric checks

In [40]:
'total'.isalnum()

True

In [41]:
'abc123'.isalnum()

True

In [42]:
'1+1'.isalnum()

False

### String splitting and joining

In [43]:
s = 'I,am,a,comma,separated,string'
s.split(',')

['I', 'am', 'a', 'comma', 'separated', 'string']

In [44]:
' '.join(s.split(','))

'I am a comma separated string'

## Stripping whitespace characters

In [45]:
s = '   I am surrounded by spaces    '
s

'   I am surrounded by spaces    '

In [46]:
s.strip()

'I am surrounded by spaces'

In [47]:
sentences = 'Python is great. NLP is also good.'
# splitting on '.' keeps the leading space of the second part and yields an
# empty string after the final period
sentences.split('.')

['Python is great', ' NLP is also good', '']

In [48]:
print('\n'.join(sentences.split('.')))

Python is great
 NLP is also good



In [49]:
# strip every fragment and keep it only if something is left after stripping,
# so that both the empty fragment after the final period and whitespace-only
# fragments (e.g. from a trailing space) do not produce blank lines
print('\n'.join([sentence.strip()
                 for sentence in sentences.split('.')
                 if sentence.strip()]))

Python is great
NLP is also good


## String formatting

### Simple string formatting expressions - very old style

In [None]:
# note: ('Python!') is just a parenthesised string, not a tuple; % accepts a
# single non-tuple value, a one-element tuple would be written ('Python!',)
'Hello %s' % ('Python!')

In [None]:
'Hello %s %s' % ('World!', 'How are you?')

### Formatting expressions with different data types - very old style 

(C-like)

In [None]:
'We have %d %s containing %.2f gallons of %s' % (2, 'bottles', 2.5, 'milk')

In [None]:
# %d truncates the float 5.21 to the integer 5, %.2f rounds 10.86763 to two decimals
'We have %d %s containing %.2f gallons of %s' % (5.21, 'jugs', 10.86763, 'juice')

### Formatting strings using the format method - old style

In [None]:
'Hello {} {}, it is a great {} to meet you at {}'.format('Mr.', 'Jones', 'pleasure', 5)

In [None]:
'Hello {} {}, it is a great {} to meet you at {} o\' clock'.format('Sir', 'Arthur', 'honor', 9)

### Alternative ways of using string format

In [None]:
'I have a {food_item} and a {drink_item} with me'.format(drink_item='soda', food_item='sandwich')

In [None]:
'The {animal} has the following attributes: {attributes}'.format(animal='dog', attributes=['lazy', 'loyal'])

### Neu: f-Strings

In [None]:
s_neu = f"2 + 2 ergibt {2+2}"
s_neu

In [None]:
f"Der String von eben war: {s_neu}"

## Regular Expressions

In [50]:
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work!'

In [51]:
import re

pattern = 'python'
# re.match only succeeds if the pattern matches at the very beginning of the string;
# matching is case-sensitive by default, so the lower-case pattern does not match
# the capitalised 'Python' in s1: the call returns None and the cell shows no output
re.match(pattern, s1)

In [52]:
# pattern is in lower case hence ignore case flag helps
# in matching same pattern with different cases
re.match(pattern, s1, flags=re.IGNORECASE)

<re.Match object; span=(0, 6), match='Python'>

In [53]:
# printing matched string and its indices in the original string
m = re.match(pattern, s1, flags=re.IGNORECASE)
print(f'Found match {m.group(0)} ranging from index {m.start()} - {m.end()} in the string "{s1}"')

Found match Python ranging from index 0 - 6 in the string "Python is an excellent language"


In [56]:
# match does not work when pattern is not there in the beginning of string s2
re.match(pattern, s2, re.IGNORECASE)

In [57]:
# illustrating find and search methods using the re module
re.search(pattern, s2, re.IGNORECASE)

<re.Match object; span=(11, 17), match='Python'>

In [58]:
re.findall(pattern, s2, re.IGNORECASE)

['Python', 'Python']

In [59]:
# finditer returns a lazy iterator over match objects instead of a list
match_objs = re.finditer(pattern, s2, re.IGNORECASE)
match_objs

<callable_iterator at 0x136a3a00f10>

In [None]:
# uncomment to fetch a single match object by hand; note that this advances the
# iterator, i.e. the fetched match is afterwards no longer available from match_objs
# next(match_objs)

In [60]:
print("String:", s2)
# this loop consumes the iterator: afterwards match_objs is exhausted, so re-run
# the re.finditer cell before executing this cell a second time
for m in match_objs:
    print(f'Found match "{m.group(0)}" ranging from index {m.start()} - {m.end()}')

String: I love the Python language. I also use Python to build applications at work!
Found match "Python" ranging from index 11 - 17
Found match "Python" ranging from index 39 - 45


In [61]:
# illustrating pattern substitution using sub and subn methods
re.sub(pattern, 'Java', s2, flags=re.IGNORECASE)

'I love the Java language. I also use Java to build applications at work!'

In [62]:
re.subn(pattern, 'Java', s2, flags=re.IGNORECASE)

('I love the Java language. I also use Java to build applications at work!', 2)

In [73]:
# dealing with unicode matching using regexes
# (str is always Unicode in Python 3, so the legacy u'' prefix is not needed)
s = 'H\u00e8llo! this is Python 🐍'
s

'Hèllo! this is Python 🐍'

In [74]:
re.findall(r'\w+', s)

['Hèllo', 'this', 'is', 'Python']

In [65]:
# words starting with an ASCII capital letter; re.UNICODE is already the default
# for str patterns in Python 3, so \w matches 'è' with or without the flag
re.findall(r"[A-Z]\w+", s, re.UNICODE)

['Hèllo', 'Python']

In [66]:
# One character class covering several emoji / symbol code point ranges.
# Inside [...] quotes and | are ordinary characters (not delimiters or
# alternation), so they are left out; otherwise apostrophes and pipes in the
# text would be reported as "emoji".
emoji_pattern = (r"["
                 r"\U0001F300-\U0001F5FF"       # miscellaneous symbols and pictographs
                 r"\U0001F600-\U0001F64F"       # emoticons
                 r"\U0001F680-\U0001F6FF"       # transport and map symbols
                 r"\u2600-\u26FF\u2700-\u27BF"  # miscellaneous symbols, dingbats
                 r"]")
re.findall(emoji_pattern, s, re.UNICODE)

['🐍']