# Ch 6. Strings

* Understanding strings as sequences of characters
* Using basic string operations
* Inserting special characters and escape sequences
* Converting from objects to strings
* Formatting strings
* Using the bytes type

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one;
# the multi-output cells in this notebook rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

## 6.1 Strings as sequences of characters

In [2]:
# Strings can be indexed and sliced by position.
x = "Hello"
x[0]    # first character
x[-1]   # last character
x[1:]   # everything after the first character

'H'

'o'

'ello'

In [3]:
# Slicing off the last character is a simple way to drop a trailing newline.
x = "Goodbye\n"
x = x[:-1]
x

'Goodbye'

In [4]:
# len() returns the number of characters in the string.
len("Goodbye")

7

In [5]:
# Expected failure: strings are immutable, so item assignment raises TypeError.
x = "Strings are immutable."
x[-1] = "!"

TypeError: 'str' object does not support item assignment

## 6.2 Basic string operations

In [6]:
# The + operator concatenates two strings into a new string.
x = "Hello " + "World"
x

'Hello World'

In [7]:
# Multiplying a string by an int repeats it; the operand order does not matter.
8*"S"
"S"*8

'SSSSSSSS'

'SSSSSSSS'

## 6.3 Special characters and escape sequences

In [8]:
# Each print() exercises one backslash escape sequence. How the control
# characters (bell, backspace, formfeed, carriage return, vertical tab) are
# rendered depends on the front end that displays the output.
print("Single quote \'")
print("Double quote \"")
print("Backslash \\")
print("Bell \a bel?")
print("Backspace\b character")
print("Formfeed \f What's this?")
print("Newline \n I know this!")
print("Carriage-return \r test!")
print("Tab\tthis is a tab\tsee?")
print("Vertical tab? \vIs this spaced?")

Single quote '
Double quote "
Backslash \
Bell  bel?
Backspace character
Formfeed  What's this?
Newline 
 I know this!
Carriage-return  test!
Tab	this is a tab	see?
Vertical tab? Is this spaced?


In [9]:
# Three spellings of the same character: literal, octal escape, hex escape.
'm'
'\155'
'\x6D'

'm'

'm'

'm'

In [10]:
# The newline character as a named escape, an octal escape, and a hex escape.
'\n'
'\012'
'\x0A'

'\n'

'\n'

'\n'

In [11]:
# A character can be selected by its Unicode name (the N escape with the name
# in braces) or by its code point (the u escape with four hex digits).
unicode_a = '\N{LATIN SMALL LETTER A}'
unicode_a
unicode_a_with_acute = '\N{LATIN SMALL LETTER A WITH ACUTE}'
unicode_a_with_acute
"\u00E1"

'a'

'á'

'á'

### 6.3.3 Printing vs. evaluating strings with special characters

In [12]:
# Evaluating a string shows its repr (escapes stay visible);
# print() emits the actual newline and tab characters.
'a\n\tb'
print('a\n\tb')

'a\n\tb'

a
	b


In [13]:
# print() appends its own newline, so the first call leaves a blank line;
# passing end="" suppresses the extra newline.
print("abc\n")
print("abc\n", end="")
print("test")

abc

abc
test


## 6.4 String methods

### 6.4.1 The split and join string methods

In [14]:
# join() is called on the separator and takes the list of strings to combine.
" ".join(["join", "puts", "spaces", "between", "elements"])

'join puts spaces between elements'

In [15]:
# The separator may be more than one character long.
"::".join(["Separated", "with", "colons"])

'Separated::with::colons'

In [16]:
# An empty separator concatenates the elements directly.
"".join(["Separated", "by", "nothing"])

'Separatedbynothing'

In [17]:
# With no arguments, split() breaks on any run of whitespace
# (spaces, tabs, newlines) and returns no empty strings.
x = "You\t\t can have tabs\t\n \t and newlines \n\n" \
    "mixed in"
print(x)
x.split()

You		 can have tabs	
 	 and newlines 

mixed in


['You', 'can', 'have', 'tabs', 'and', 'newlines', 'mixed', 'in']

In [18]:
# An explicit separator may be a multi-character string.
x  = "Mississippi"
x.split("ss")

['Mi', 'i', 'ippi']

In [19]:
# The optional second argument caps the number of splits performed;
# a cap larger than the number of separators is harmless.
x = 'a b c d'
x.split(' ', 1)
x.split(' ', 2)
x.split(' ', 9)

['a', 'b c d']

['a', 'b', 'c d']

['a', 'b', 'c', 'd']

In [20]:
# Passing None as the separator keeps whitespace splitting while still
# allowing a split limit as the second argument.
x = "You\t\t can have tabs\t\n \t and newlines \n\n" \
    "mixed in"
print(x)
x.split()
x.split(None, 1)
x.split(None, 2)
# Splitting on a literal tab yields an empty string between adjacent tabs.
x.split('\t')

You		 can have tabs	
 	 and newlines 

mixed in


['You', 'can', 'have', 'tabs', 'and', 'newlines', 'mixed', 'in']

['You', 'can have tabs\t\n \t and newlines \n\nmixed in']

['You', 'can', 'have tabs\t\n \t and newlines \n\nmixed in']

['You', '', ' can have tabs', '\n ', ' and newlines \n\nmixed in']

In [21]:
x = 'this is a test'
'-'.join(x.split())
# Read it inside-out: x.split() breaks x into words on whitespace,
# then '-'.join(...) glues those words back together with dashes.

'this-is-a-test'

### 6.4.2 Converting strings to numbers

In [22]:
# float() parses a numeric string; a non-numeric string raises ValueError
# (the second call is an expected failure).
float('123.456')
float('xxyy')

123.456

ValueError: could not convert string to float: 'xxyy'

In [23]:
# int() accepts only integer literals; a string containing a decimal point
# raises ValueError (the second call is an expected failure).
int('3333')
int('123.456')

3333

ValueError: invalid literal for int() with base 10: '123.456'

In [24]:
# The optional second argument to int() is the base the string is written in.
int('10000', 8) # octal
int('101', 2) # binary
int('ff', 16) # hexadecimal
int('123456', 6) # senary (base 6): fails because the digit 6 is not valid in base 6

4096

5

255

ValueError: invalid literal for int() with base 6: '123456'

In [25]:
int('a1') # fails: 'a' is not a valid digit in the default base 10

ValueError: invalid literal for int() with base 10: 'a1'

In [26]:
int('12G', 16) # fails: 'G' is not a hexadecimal digit

ValueError: invalid literal for int() with base 16: '12G'

In [27]:
float("12345678901234567890") # works, but the result is rounded to float precision

1.2345678901234567e+19

In [28]:
int("12*2") # fails: int() parses a literal, it does not evaluate expressions

ValueError: invalid literal for int() with base 10: '12*2'

In [29]:
int("12") * int("2") # convert each operand first, then multiply: gives 24

24

### 6.4.3 Getting rid of extra whitespace

In [30]:
# strip() trims whitespace from both ends, lstrip() from the left only,
# rstrip() from the right only; interior whitespace is left alone.
x = "  Hello,     World\t\t"
x
x.strip()
x.lstrip()
x.rstrip()

'  Hello,     World\t\t'

'Hello,     World'

'Hello,     World\t\t'

'  Hello,     World'

In [31]:
import string
# The ASCII whitespace characters as defined by the string module.
string.whitespace

' \t\n\r\x0b\x0c'

In [32]:
# The same characters written with named escapes; the repr shows the
# vertical tab and formfeed as \x0b and \x0c.
' \t\n\r\v\f'

' \t\n\r\x0b\x0c'

In [33]:
# The argument to strip() is a set of characters to remove from both ends,
# not a prefix or suffix to match.
x = 'www.python.org'
x.strip('w')
x.strip("gor")
x.strip(".gorw")

'.python.org'

'www.python.'

'python'

In [34]:
# Stripping stops at the first character that is not in the given set.
x = "(name, date), \n"
x.lstrip("(").rstrip("), \n")
# Removes nothing: the string ends with a newline, which is not in the set.
x.rstrip("),")
# The trailing space is not in either set below, so stripping stops there.
x.strip("),\n")
x.strip("\n)(,")

'name, date'

'(name, date), \n'

'(name, date), '

'name, date), '

### 6.4.4 String searching

In [35]:
x = "Mississippi"
x.find("ss") # index at which the first match starts
x.find("zz") # -1 means the substring was not found

2

-1

In [36]:
# Optional start and end arguments restrict the search to x[start:end];
# the whole match must fit inside that range.
x.find("ss", 3)
x.find("ss", 0, 3)
x.find("ss", 0, 4)

5

-1

2

In [37]:
# rfind() returns the start index of the last match, optionally limited
# to x[start:end].
x.rfind("ss")
x.rfind("i", 3)
x.rfind("i", 0, 3)
x.rfind("i", 0, 4)
x.rfind("i", 0, 5)

5

10

1

1

4

In [38]:
# index() works like find() but raises ValueError instead of returning -1
# (the second call is an expected failure: x[0:1] contains no "i").
x.index("i")
x.index("i", 0, 1)

1

ValueError: substring not found

In [39]:
# count() returns the number of non-overlapping occurrences,
# optionally limited to x[start:end].
x.count("ss")
x.count("i")
x.count("i", 2, 6)

2

4

1

In [40]:
# startswith() and endswith() test for a prefix or suffix and return a bool.
x = "Mississippi"
x.startswith("Miss")
x.startswith("Mist")
x.endswith("pi")
x.endswith("p")

True

False

True

False

In [41]:
# A tuple of candidate suffixes is accepted; a list is not and raises
# TypeError (the second call is an expected failure).
x.endswith(("i", "u"))
x.endswith(["i", "u"])

True

TypeError: endswith first arg must be str or a tuple of str, not list

In [42]:
# Three sample lines, then the direct way to test for a suffix: endswith().
x = "random line that does not with rejected end"
y = "this one ends with rejected"
z = "this and multiple rejected rejected"

x.endswith("rejected")
y.endswith("rejected")
z.endswith("rejected")

False

True

True

In [43]:
# Alternative: compare the final len("rejected") characters with the suffix.
x[-len("rejected"):] == "rejected"
y[-len("rejected"):] == "rejected"
z[-len("rejected"):] == "rejected"

False

True

True

In [44]:
# Alternative: compare the last whitespace-separated word with the suffix.
x.split()[-1] == "rejected"
y.split()[-1] == "rejected"
z.split()[-1] == "rejected"

False

True

True

In [45]:
# Alternative: the last occurrence must start exactly len("rejected")
# characters before the end of the string.
x.rfind("rejected") == len(x) - len("rejected")
y.rfind("rejected") == len(y) - len("rejected")
z.rfind("rejected") == len(z) - len("rejected")

False

True

True

### 6.4.5 Modifying strings

In [46]:
# replace() returns a copy with every occurrence of the first argument
# replaced by the second.
x = "Mississippi"
x.replace("ss", "+++")

'Mi+++i+++ippi'

In [47]:
# maketrans() builds a table mapping each character of the first string to the
# character at the same position in the second; translate() applies it.
# x itself is left unchanged.
x = "~x ^ (y % z)"
table = x.maketrans("~^()", "!&[]")
x
x.translate(table)

'~x ^ (y % z)'

'!x & [y % z]'

In [48]:
# Case-conversion methods; each returns a new string.
x = "hello, World!"
x
x.lower()
x.upper()
x.capitalize()   # first character upper-cased, the rest lower-cased
x.title()        # first letter of each word upper-cased
x.swapcase()     # upper and lower case exchanged

'hello, World!'

'hello, world!'

'HELLO, WORLD!'

'Hello, world!'

'Hello, World!'

'HELLO, wORLD!'

In [49]:
# expandtabs(8) replaces each tab with enough spaces to reach the next
# column that is a multiple of 8.
x = "This\thas\ttabs."
x
print(x)
x.expandtabs(8)

'This\thas\ttabs.'

This	has	tabs.


'This    has     tabs.'

In [50]:
# Pad with spaces to a total width of 20: left-, right-, and center-justified.
x = "cats"
x.ljust(20)
x.rjust(20)
x.center(20)

'cats                '

'                cats'

'        cats        '

In [51]:
# zfill() pads on the left with zeros up to the requested width.
x = "8"
x.zfill(9)

'000000008'

### 6.4.6 Modifying strings with list manipulations

In [52]:
# Strings are immutable, so edit a list of the characters and join it back.
text = "Hello, World"
print(text)
chars = [ch for ch in text]
print(chars)
# Keep only the first six characters ("Hello,"), then reverse their order.
del chars[6:]
chars = chars[::-1]
print(chars)
text = "".join(chars)
print(text)

Hello, World
['H', 'e', 'l', 'l', 'o', ',', ' ', 'W', 'o', 'r', 'l', 'd']
[',', 'o', 'l', 'l', 'e', 'H']
,olleH


In [53]:
# Map every punctuation character to a space with a translation table.
x = "check.that.punctuation.turns.to,spaces:"
puncts = ",.?;:'\""
print(puncts)
# The replacement string must be as long as puncts: one space per character.
table = x.maketrans(puncts, " "*len(puncts))
x
x.translate(table)

,.?;:'"


'check.that.punctuation.turns.to,spaces:'

'check that punctuation turns to spaces '

### 6.4.7 Useful methods and constants

In [54]:
# Character-class predicates return a bool describing the string's contents.
x = "123"
print(x)
x.isdigit()
x.isalpha()
x = "M"
print(x)
x.islower()
x.isupper()

123


True

False

M


False

True

In [55]:
# Digit constants from the string module (imported in an earlier cell).
print(string.digits)
print(string.hexdigits)
print(string.octdigits)

0123456789
0123456789abcdefABCDEF
01234567


In [56]:
# Letter constants from the string module (imported in an earlier cell).
print(string.ascii_lowercase)
print(string.ascii_uppercase)
print(string.ascii_letters)

abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


In [57]:
# Sample data: some entries are wrapped in double quotes, some are not.
x = ['"abc"', 'def', '"ghi"', '"klm"', 'nop']
x

['"abc"', 'def', '"ghi"', '"klm"', 'nop']

In [58]:
# Remove the surrounding double quotes from every entry of x;
# entries without quotes pass through unchanged.
x_new = [entry.strip('"') for entry in x]
x_new

['abc', 'def', 'ghi', 'klm', 'nop']

In [59]:
# Remove the last "p" by slicing around its index.
x = "Mississippi"
last_p = x.rfind('p')
x = x[:last_p] + x[last_p + 1:]

In [60]:
# Alternative for this particular string: collapse the double "p" with replace().
x = "Mississippi"
x.replace("pp", "p")

'Mississipi'

## 6.5 Converting from objects to strings

In [61]:
# repr() returns an object's string form, so the result can be concatenated
# to another string; print() converts its arguments to text by itself.
print([1, 2, 3])
repr([1, 2, 3])
x = [1]
x.append(2)
x.append([3, 4])
'the list x is ' + repr(x)
# print() separates its arguments with a space, hence the double space here.
print('the list x is ', x)

[1, 2, 3]


'[1, 2, 3]'

'the list x is [1, 2, [3, 4]]'

the list x is  [1, 2, [3, 4]]


In [62]:
# repr() also works on built-in functions and types.
repr(len)
repr(list)
repr(max)
repr(tuple)

'<built-in function len>'

"<class 'list'>"

'<built-in function max>'

"<class 'tuple'>"

In [63]:
# For these objects str() produces the same text as repr().
str(len)
str(list)
str(max)
str(tuple)

'<built-in function len>'

"<class 'list'>"

'<built-in function max>'

"<class 'tuple'>"

In [64]:
# x is still the nested list [1, 2, [3, 4]] built in an earlier cell.
str(x)

'[1, 2, [3, 4]]'

## 6.6 Using the format method

In [65]:
# Doubled braces in an f-string produce single literal braces.
print(f'{{ is a single one }}')

{ is a single one }


In [66]:
# Numbered fields are filled from the positional arguments of format().
"{0} is the {1} of {2}".format("Ambrosia", "food", "the gods")

'Ambrosia is the food of the gods'

In [67]:
# Doubled braces come out as literal braces and are not treated as a field.
"{{Ambrosia}} is the {0} of {1}".format("food", "the gods")

'{Ambrosia} is the food of the gods'

In [68]:
# Named fields are filled from keyword arguments.
"{food} is the food of {user}".format(food="Ambrosia", user="the gods")

'Ambrosia is the food of the gods'

In [69]:
# A field can index into its argument: user[1] picks the second list element.
"{0} is the food of {user[1]}".format("Ambrosia", user=["men", "the gods", "others"])

'Ambrosia is the food of the gods'

In [70]:
# The number after the colon is a minimum field width: strings are padded
# on the right with spaces and are never truncated to fit.
"{0:5} is the food of gods".format("Ambrosia")
"{0:10} is the food of gods".format("Ambrosia")
"{0:11} is the food of gods".format("Ambrosia")
"{0:12} is the food of gods".format("Ambrosia")
"{0:13} is the food of gods".format("Ambrosia")

'Ambrosia is the food of gods'

'Ambrosia   is the food of gods'

'Ambrosia    is the food of gods'

'Ambrosia     is the food of gods'

'Ambrosia      is the food of gods'

In [71]:
# The width can itself come from another argument through a nested field.
"{0:{1}} is the food of gods".format("Ambrosia", 5)
"{0:{1}} is the food of gods".format("Ambrosia", 10)
"{0:{1}} is the food of gods".format("Ambrosia", 11)
"{0:{1}} is the food of gods".format("Ambrosia", 12)
"{0:{1}} is the food of gods".format("Ambrosia", 13)

'Ambrosia is the food of gods'

'Ambrosia   is the food of gods'

'Ambrosia    is the food of gods'

'Ambrosia     is the food of gods'

'Ambrosia      is the food of gods'

In [72]:
# Nested fields work with keyword arguments as well.
"{food:{width}} is the food of gods".format(food="Ambrosia", width=10)

'Ambrosia   is the food of gods'

In [73]:
# '>' right-aligns the value within the field width.
"{0:>5} is the food of gods".format("Ambrosia")
"{0:>10} is the food of gods".format("Ambrosia")
"{0:>11} is the food of gods".format("Ambrosia")
"{0:>12} is the food of gods".format("Ambrosia")
"{0:>13} is the food of gods".format("Ambrosia")

'Ambrosia is the food of gods'

'  Ambrosia is the food of gods'

'   Ambrosia is the food of gods'

'    Ambrosia is the food of gods'

'     Ambrosia is the food of gods'

In [74]:
# A character placed before '>' is used as the fill instead of a space.
"{0:&>5} is the food of gods".format("Ambrosia")
"{0:&>10} is the food of gods".format("Ambrosia")
"{0:&>11} is the food of gods".format("Ambrosia")
"{0:&>12} is the food of gods".format("Ambrosia")
"{0:&>13} is the food of gods".format("Ambrosia")

'Ambrosia is the food of gods'

'&&Ambrosia is the food of gods'

'&&&Ambrosia is the food of gods'

'&&&&Ambrosia is the food of gods'

'&&&&&Ambrosia is the food of gods'

In [75]:
# Field 1 is formatted with the width taken from field 0 (3).
# Numbers are right-aligned by default, so the int 4 prints as "  4!" and
# does not match the "Expecting" text.
x = "{1:{0}}!".format(3,4)
print("Expecting\n4  !")
print(x)

# Strings are left-aligned by default, so passing "4" gives the expected text.
x = "{1:{0}}!".format(3,"4")
print("Expecting\n4  !")
print(x)

Expecting
4  !
  4!
Expecting
4  !
4  !


In [76]:
# Fill with '$', right-align, minimum width 5.
x = "{0:$>5}!".format(3)
print("Expecting\n$$$$3!")
print(x)

Expecting
$$$$3!
$$$$3!


In [77]:
# Same default-alignment effect with named fields: the int 1 is right-aligned
# and misses the "Expecting" text, the string "1" is left-aligned and matches.
x = "{a:{b}}!".format(a=1, b=5)
print("Expecting\n1    !")
print(x)
x = "{a:{b}}!".format(a="1", b=5)
print("Expecting\n1    !")
print(x)

Expecting
1    !
    1!
Expecting
1    !
1    !


In [78]:
# Positional and keyword arguments can be mixed; arguments the template never
# references (the positional 4 and keyword c) are ignored.
# As above, the int 1 is right-aligned and misses the "Expecting" text.
x = "{a:{b}}:{0:$>5}!".format(3, 4, a=1, b=5, c=10)
print("Expecting\n1    :$$$$3!")
print(x)

# With the string "1" the default left alignment gives the expected text.
x = "{a:{b}}:{0:$>5}!".format(3, 4, a="1", b=5, c=10)
print("Expecting\n1    :$$$$3!")
print(x)

Expecting
1    :$$$$3!
    1:$$$$3!
Expecting
1    :$$$$3!
1    :$$$$3!


## 6.7 Formatting strings with %

In [79]:
"%s is the %s of %s" % ("Ambrosia", "food", "the gods")
# Pattern: format string % tuple of values; each %s takes the next tuple item.

'Ambrosia is the food of the gods'

In [80]:
# The same format string can be reused with different value tuples.
"%s is the %s of %s" % ("Nectar", "drink", "gods")
"%s is the %s of %s" % ("Brussels Sprouts", "food", "foolish")

'Nectar is the drink of gods'

'Brussels Sprouts is the food of foolish'

In [81]:
# %s accepts any object, here a list, and inserts its string form.
x = [1, 2, "three"]
"the %s contains: %s" % ("list", x)

"the list contains: [1, 2, 'three']"

In [82]:
# %-6.2f: width 6, two decimals, left-justified; %6.2f: same but
# right-justified; %s: the float's plain string form.
"Pi is <%-6.2f>" % 3.14159
"Pi is <%6.2f>" % 3.14159
"Pi is <%s>" % 3.14159

'Pi is <3.14  >'

'Pi is <  3.14>'

'Pi is <3.14159>'

In [83]:
# With a dictionary on the right-hand side, %(key) selects the value by key;
# the same key may be used more than once.
num_dict = {'e': 2.718, 'pi': 3.14159}
print("%(pi).2f - %(pi).4f - %(e).2f" % num_dict)

3.14 - 3.1416 - 2.72


In [84]:
# print() writes its arguments separated by `sep` (a space by default) and
# followed by `end` (a newline by default).
print("a")
print("b")
print("a", "b", "c")
print("a", "b", "c", sep="|")
print("a", "b", "c", end="\n\n")
# Redirect the output to a file. The with-block guarantees the handle is
# flushed and closed instead of leaving that to garbage collection.
with open("testfile.txt", "w") as testfile:
    print("a", "b", "c", file=testfile)

a
b
a b c
a|b|c
a b c



In [85]:
# .2f rounds to two digits after the decimal point.
x="%.2f" % 1.1111
print("Expect\n1.11")
print(x)

Expect
1.11
1.11


In [86]:
# The same precision applied to a value looked up by dictionary key.
x="%(a).2f" % {'a': 1.1111}
print("Expect\n1.11")
print(x)

Expect
1.11
1.11


In [87]:
# The precision "08" is read as 8: eight digits after the decimal point.
x="%(a).08f" % {'a': 1.1111}
print("Expect\n1.11110000")
print(x)

Expect
1.11110000
1.11110000


## 6.8 String interpolation

In [88]:
# An f-string evaluates the expression inside the braces in place.
value = 42
message = f"The answer is {value}"
print(message)

The answer is 42


In [89]:
# Nested fields supply the width (10) and the precision (2). With no 'f' type
# code the precision counts significant digits, hence 3.1 rather than 3.14.
pi = 3.14159
print(f"pi is {pi:{10}.{2}}")

pi is        3.1


## 6.9 Bytes

In [90]:
# Start from a str holding a single non-ASCII character.
unicode_a_with_acute = '\N{LATIN SMALL LETTER A WITH ACUTE}'
unicode_a_with_acute

'á'

In [91]:
# encode() with no arguments uses UTF-8; the one character becomes two bytes.
xb = unicode_a_with_acute.encode()
xb

b'\xc3\xa1'

In [92]:
# Expected failure: a str cannot be concatenated to a bytes object.
xb += 'A'

TypeError: can't concat str to bytes

In [93]:
# decode() turns the bytes back into the original str.
xb.decode()

'á'

In [94]:
# Do the concatenation on the str side, then encode; decoding the bytes
# returns the same text.
unicode_a_with_acute = '\N{LATIN SMALL LETTER A WITH ACUTE}'
unicode_a_with_acute += 'A'
unicode_a_with_acute
xb = unicode_a_with_acute.encode()
xb
xb.decode()

'áA'

b'\xc3\xa1A'

'áA'

## Lab 6: Preprocessing text

In [13]:
# Lab 6: normalise the text to one lower-case word per line.

# Every character in `puncts` (punctuation plus the newline) is mapped to a
# space so that split() can then break the line into bare words.
# The table does not depend on the files, so it is built before opening them.
puncts = "!.,:;-?'\"\n"
punct_table = str.maketrans(puncts, " " * len(puncts))

with open("../qpbe3e/exercise_answers/moby_01.txt") as infile, \
        open("moby_01_clean.txt", "w") as outfile:
    for line in infile:
        # make all one case
        lower_line = line.lower()

        # remove punctuation
        clean_line = lower_line.translate(punct_table)

        # split into words
        words = clean_line.split()
        if not words:
            # A blank or punctuation-only line has no words; skip it so the
            # one-word-per-line output contains no empty lines.
            continue
        cleaned_words = "\n".join(words)

        # write all words for line
        print(cleaned_words)
        outfile.write(cleaned_words)
        # Terminate the group; without this the last word of one input line
        # and the first word of the next would run together.
        outfile.write("\n")

call
me
ishmael
some
years
ago
never
mind
how
long
precisely
having
little
or
no
money
in
my
purse
and
nothing
particular
to
interest
me
on
shore
i
thought
i
would
sail
about
a
little
and
see
the
watery
part
of
the
world
it
is
a
way
i
have
of
driving
off
the
spleen
and
regulating
the
circulation
whenever
i
find
myself
growing
grim
about
the
mouth
whenever
it
is
a
damp
drizzly
november
in
my
soul
whenever
i
find
myself
involuntarily
pausing
before
coffin
warehouses
and
bringing
up
the
rear
of
every
funeral
i
meet
and
especially
whenever
my
hypos
get
such
an
upper
hand
of
me
that
it
requires
a
strong
moral
principle
to
prevent
me
from
deliberately
stepping
into
the
street
and
methodically
knocking
people
s
hats
off
then
i
account
it
high
time
to
get
to
sea
as
soon
as
i
can
this
is
my
substitute
for
pistol
and
ball
with
a
philosophical
flourish
cato
throws
himself
upon
his
sword
i
quietly
take
to
the
ship
there
is
nothing
surprising
in
this
if
they
but
knew
it
almost
all
men
in
their
degr

In [5]:
# Show the built-in documentation for str.maketrans (used in the lab above).
help(str.maketrans)

Help on built-in function maketrans:

maketrans(x, y=None, z=None, /)
    Return a translation table usable for str.translate().
    
    If there is only one argument, it must be a dictionary mapping Unicode
    ordinals (integers) or characters to Unicode ordinals, strings or None.
    Character keys will be then converted to ordinals.
    If there are two arguments, they must be strings of equal length, and
    in the resulting dictionary, each character in x will be mapped to the
    character at the same position in y. If there is a third argument, it
    must be a string, whose characters will be mapped to None in the result.

