# Python für NLP

Angelehnt an D. Sarkar: Text Analytics with Python (2nd Edition)

Ergänzt und aktualisiert von Heiko Rölke

## Arbeiten mit Zeichenketten (Strings)

In [None]:
new_string = "This is a String"  # bind the name to a str object

# Every object has an identity, a type and a value; show all three in turn.
for label, detail in (
    ('ID:', id(new_string)),      # object identifier (the address in CPython)
    ('Type:', type(new_string)),  # object type
    ('Value:', new_string),       # object value
):
    print(label, detail)

In [None]:
# simple string: a single-quoted and a double-quoted literal joined with +
# (the double quotes let the apostrophe in "I'm" appear unescaped)
simple_string = 'Hello!' + " I'm a simple string"
print(simple_string)

In [None]:
# multi-line string: a triple-quoted literal may span several lines, and every
# line break inside it is stored as a \n (newline) character
multi_line_string = """Hello I'm
a multi-line
string!"""

# evaluating the bare name echoes the repr, so the \n characters are visible
multi_line_string

In [None]:
# print() renders the stored \n characters as real line breaks
print(multi_line_string)

In [None]:
# Normal string with escape sequences leading to a wrong file path:
# \t, \n and \f are read as tab, newline and form feed, not as path separators
escaped_string = "C:\the_folder\new_dir\file.txt"
print(escaped_string)  # will cause errors if we try to open a file here

In [None]:
# raw string: the r prefix switches off escape processing, so every backslash
# stays a literal backslash
raw_string = r'C:\the_folder\new_dir\file.txt'
print(raw_string)

In [None]:
# unicode string literals: \u00e8 is the escape for code point U+00E8
# (Latin small letter e with grave)
string_with_unicode = 'H\u00e8llo!'
print(string_with_unicode)

In [None]:
# Emoji are ordinary Unicode characters and can be typed directly into a
# literal (the source file must be saved and read as UTF-8).
more_unicode = 'I love Pizza 🍕!  Shall we book a cab 🚕 to get pizza?'
print(more_unicode)

In [None]:
# join both unicode strings with an explicit newline between them
print(string_with_unicode + '\n' + more_unicode)

In [None]:
# str.join places the separator (a single space) between the list items
' '.join([string_with_unicode, more_unicode])

## Operationen mit Zeichenketten

### Verkettung

In [None]:
# explicit concatenation of three literals with the + operator
'Hello 😊' + ' and welcome ' + 'to Python 🐍!'

In [None]:
# implicit concatenation: adjacent string literals are merged into one string
'Hello 😊' ' and welcome ' 'to Python 🐍!'

In [None]:
# concatenation of variables and literals: + works for any mix of the two
s1 = 'Python 💻!'
'Hello 😊 ' + s1

In [None]:
'Hello üòä ' s1

In [None]:
# some more ways of concatenating strings: * repeats a string n times
s2 = '--🐍Python🐍--'
s2 * 5

In [None]:
# A cell only echoes its last expression, so both results are shown together
# as one tuple: the plain concatenation and the concatenation repeated 3 times.
s1 + s2, (s1 + s2) * 3

In [None]:
# concatenating several strings together in parentheses: the parentheses let
# the adjacent literals continue over several lines; they are merged into one
# string (note the trailing space inside each part)
s3 = ('This '
      'is another way '
      'to concatenate '
      'several strings!')
s3

In [None]:
# checking for substrings in a string: the in operator returns True or False
'way' in s3

In [None]:
# the same test with a substring that does not occur in s3
'python' in s3

In [None]:
# computing total length of the string (number of characters, spaces included)
len(s3)

## Indexing and Slicing

In [None]:
s = 'PYTHON'
# depicting string indexes: enumerate yields (index, character) pairs,
# counting from 0
for index, character in enumerate(s):
    print('Character ->', character, 'has index->', index)

In [None]:
# positive indexes count from 0 at the left end of the string
s[0], s[1], s[2], s[3], s[4], s[5]

In [None]:
# negative indexes count from -1 at the right end of the string
s[-1], s[-2], s[-3], s[-4], s[-5], s[-6]

In [None]:
# String slicing: with both bounds omitted the slice covers the whole string
s[:]

In [None]:
# characters from index 1 up to, but not including, index 4
s[1:4]

In [None]:
# an omitted start means "from the beginning", an omitted end "to the end"
s[:3], s[3:]

In [None]:
# a negative start counts from the right: the last three characters
s[-3:]

In [None]:
# the two halves split at the same index join back to the original string
s[:3] + s[3:]

In [None]:
# first three characters followed by the last three characters
s[:3] + s[-3:]

### String slicing with offsets (step size)

In [None]:
s[::1]  # step 1 (the default): every character, i.e. no offset

In [None]:
s[::2]  # step 2: every 2nd character in the string, starting at index 0

In [None]:
s[::-1]  # step -1 walks from the end to the start: reverses the string

### Zeichenketten sind unveränderlich

(anders als andere Iterables wie Listen)

In [None]:
# strings are immutable hence assignment throws error
# (this cell raises a TypeError on purpose)
s[0] = 'X'

In [None]:
# identity of the string object before the change
print('Original String id:', id(s))
# creates a new string: 'X' plus everything after the first character;
# the name s is rebound to it, the old object itself is not modified
s = 'X' + s[1:]
print(s)
# identity of the object that s refers to now
print('New String id:', id(s))

## Useful String methods

### Case conversions

In [None]:
s = 'python is great'
# capitalize(): first character upper-cased, the rest lower-cased
s.capitalize()

In [None]:
# upper(): every cased character converted to upper case
s.upper()

In [None]:
# title(): first letter of every word upper-cased
s.title()

### String replace

In [None]:
# replace(): returns a new string with every 'python' replaced by 'NLP';
# s itself stays unchanged
s.replace('python', 'NLP')

### Numeric checks

In [None]:
# isdecimal(): True when every character is a decimal digit
'12345'.isdecimal()

In [None]:
# letters in the string make the check fail
'apollo11'.isdecimal()

### Alphabet checks

In [None]:
# isalpha(): True when every character is a letter
'python'.isalpha()

In [None]:
# the trailing digit makes the check fail
'number1'.isalpha()

### Alphanumeric checks

In [None]:
# isalnum(): True when every character is a letter or a digit
'total'.isalnum()

In [None]:
# a mix of letters and digits still passes
'abc123'.isalnum()

In [None]:
# '+' is neither a letter nor a digit, so the check fails
'1+1'.isalnum()

### String splitting and joining

In [None]:
s = 'I,am,a,comma,separated,string'
# split(','): cut the string at every comma and return the parts as a list
s.split(',')

In [None]:
# split at the commas, then glue the parts back together with single spaces
' '.join(s.split(','))

### Stripping whitespace characters

In [None]:
# sample string with leading and trailing spaces; echoing it shows the padding
s = '   I am surrounded by spaces    '
s

In [None]:
# strip() without arguments removes leading and trailing whitespace
s.strip()

In [None]:
sentences = 'Python is great. NLP is also good.'
# naive sentence splitting at '.': the second part keeps its leading space and
# the final '.' produces an empty string as last list element
sentences.split('.')

In [None]:
# one part per line; the leading space and the empty last part are still there
print('\n'.join(sentences.split('.')))

In [None]:
# Drop the empty parts first (filter(None, ...) keeps only non-empty strings),
# then strip the surrounding whitespace from each part, one sentence per line.
print('\n'.join(map(str.strip, filter(None, sentences.split('.')))))

## String formatting

### Simple string formatting expressions - very old style

In [None]:
# %s is a placeholder that is filled with the string form of the value
# (the parentheses here are plain grouping, not a tuple)
'Hello %s' % ('Python!')

In [None]:
# several placeholders are filled in order from a tuple of values
'Hello %s %s' % ('World!', 'How are you?')

### Formatting expressions with different data types - very old style 

(C-like)

In [None]:
# %d formats an integer, %.2f a float with two decimal places, %s a string
'We have %d %s containing %.2f gallons of %s' % (2, 'bottles', 2.5, 'milk')

In [None]:
# %d truncates the float 5.21 to its integer part; %.2f rounds 10.86763 to
# two decimal places
'We have %d %s containing %.2f gallons of %s' % (5.21, 'jugs', 10.86763, 'juice')

### Formatting strings using the format method - old style

In [None]:
# str.format: each {} is replaced by the next positional argument
'Hello {} {}, it is a great {} to meet you at {}'.format('Mr.', 'Jones', 'pleasure', 5)

In [None]:
# same idea; \' puts a literal apostrophe inside the single-quoted string
'Hello {} {}, it is a great {} to meet you at {} o\' clock'.format('Sir', 'Arthur', 'honor', 9)

### Alternative ways of using string format

In [None]:
# named placeholders are filled from keyword arguments, in any order
'I have a {food_item} and a {drink_item} with me'.format(drink_item='soda', food_item='sandwich')

In [None]:
# any object can be passed; a list is inserted in its string form
'The {animal} has the following attributes: {attributes}'.format(animal='dog', attributes=['lazy', 'loyal'])

### Neu: f-Strings

In [None]:
# f-string: the expression inside {} is evaluated and its result is inserted
s_neu = f"2 + 2 ergibt {2+2}"
s_neu

In [None]:
# variables can be referenced inside {} just like any other expression
f"Der String von eben war: {s_neu}"

## Regular Expressions

In [None]:
# sample texts for the regex examples: s1 starts with 'Python',
# s2 contains 'Python' twice but not at the beginning
s1 = 'Python is an excellent language'
s2 = 'I love the Python language. I also use Python to build applications at work!'

In [None]:
import re

pattern = 'python'
# match only returns a match if regex match is found at the beginning of the string
# (matching is case-sensitive by default, so lower-case 'python' does not
# match the capitalised 'Python' in s1 and the result is None)
re.match(pattern, s1)

In [None]:
# pattern is in lower case hence ignore case flag helps
# in matching same pattern with different cases
# (a successful match returns a match object instead of None)
re.match(pattern, s1, flags=re.IGNORECASE)

In [None]:
# printing matched string and its indices in the original string:
# group(0) is the matched text, start()/end() are its slice bounds in s1
m = re.match(pattern, s1, flags=re.IGNORECASE)
print(f'Found match {m.group(0)} ranging from index {m.start()} - {m.end()} in the string "{s1}"')

In [None]:
# match does not work when pattern is not there in the beginning of string s2
# (the result is None even with the ignore-case flag)
re.match(pattern, s2, re.IGNORECASE)

In [None]:
# illustrating find and search methods using the re module:
# search scans the whole string and returns the first match found anywhere
re.search(pattern, s2, re.IGNORECASE)

In [None]:
# findall returns all non-overlapping matches as a list of strings
re.findall(pattern, s2, re.IGNORECASE)

In [None]:
# finditer returns an iterator that yields one match object per match;
# echoing it only shows the iterator object, not the matches themselves
match_objs = re.finditer(pattern, s2, re.IGNORECASE)
match_objs

In [None]:
# Optional: uncomment to pull the first match by hand. Each next() call consumes
# one match from the iterator, so a later loop over match_objs will not see it.
# next(match_objs)

In [None]:
# Walk over all matches produced by the finditer iterator. The iterator can
# only be consumed once: re-run the finditer cell before re-running this one.
print("String:", s2)
for m in match_objs:
    print(f'Found match "{m.group(0)}" ranging from index {m.start()} - {m.end()}')

In [None]:
# illustrating pattern substitution using sub and subn methods:
# sub returns a new string with every match replaced by 'Java'
# (flags must be passed by keyword here; the 4th positional argument is count)
re.sub(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
# subn does the same but returns a tuple (new_string, number_of_replacements)
re.subn(pattern, 'Java', s2, flags=re.IGNORECASE)

In [None]:
# dealing with unicode matching using regexes
# Sample text with an accented letter (\u00e8) and an emoji. Every str is
# Unicode in Python 3, so the Python 2 style u'' prefix is not needed.
s = 'H\u00e8llo! this is Python 🐍'
s

In [None]:
# \w+ matches runs of word characters; for str patterns this includes
# non-ASCII letters, while punctuation and emoji are not word characters
re.findall(r'\w+', s)

In [None]:
# words that start with an ASCII capital letter followed by word characters
# (re.UNICODE is already the default for str patterns in Python 3)
re.findall(r"[A-Z]\w+", s, re.UNICODE)

In [None]:
# One character class covering common emoji/symbol blocks:
#   U+1F300-U+1F5FF  Miscellaneous Symbols and Pictographs
#   U+1F600-U+1F64F  Emoticons
#   U+1F680-U+1F6FF  Transport and Map Symbols
#   U+2600-U+26FF    Miscellaneous Symbols
#   U+2700-U+27BF    Dingbats
# Inside [...] quotes and | are literal characters, so they must not be used
# as separators - otherwise ' and | themselves would be reported as emoji.
emoji_pattern = r"[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"
# list every character of s that is matched by emoji_pattern
# (re.UNICODE is already the default for str patterns in Python 3)
re.findall(emoji_pattern, s, re.UNICODE)