# More on Strings
#### For more information refer to the <a href = 'https://docs.python.org/3.7/library/string.html'>Python Documentation</a>

In [50]:
import math
import pandas as pd

pi = math.pi

In [3]:
pi

3.141592653589793

In [4]:
f'pi is equal to {pi}'

'pi is equal to 3.141592653589793'

In [5]:
f'pi is equal to {pi:.3f}'

'pi is equal to 3.142'

In [6]:
f'pi is equal to {pi:12.3f}'

'pi is equal to        3.142'

In [7]:
f'pi is equal to {pi:012.3f}'

'pi is equal to 00000003.142'

In [8]:
f'pi is equal to {pi:+.3f}'

'pi is equal to +3.142'

In [9]:
f'pi is equal to {pi:.3f}'

'pi is equal to 3.142'

In [10]:
f'pi is equal to {pi:10.3f}'

'pi is equal to      3.142'

In [11]:
f'pi is equal to {pi:<10.3f}'

'pi is equal to 3.142     '

In [12]:
f'pi is equal to {pi:^10.3f}'

'pi is equal to   3.142   '

In [13]:
f'pi is equal to {pi:=+10.3f}'

'pi is equal to +    3.142'

## Regular Expressions<br>
<b> For more information, go to <a href ='https://docs.python.org/3/howto/regex.html'> Python Docs </a>

<b>[aeiou]</b>	Match any vowel <br>
<b>[^aeiou]	</b> ^ inverts selection, this matches any consonant<br>
<b>[a-z]</b> Match any lowercase leter from a-z<br>
<b>\d</b> Matches any decimal digit; this is equivalent to the class [0-9]<br>
<b>\D</b> Matches any non-digit character; this is equivalent to the class [^0-9]<br>
<b>\s</b> Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v]<br>
<b>\S</b> Matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v]<br>
<b>\w</b> Matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_]<br>
<b>\W</b> Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_]

## Repetition 
<b>?</b> Match 0 or 1 of the preceeding group<br>
<b>*</b> Match 0 or more of the preceeding group<br>
<b>+</b> Match 1 or more of the preceeding group<br>
<b>{n}</b> Match exactly n of the preceeding group<br>
<b>^string</b> String must begin with <i>string</i><br>
<b>string$ </b> String must end with <i>string</i><br>

## re module in python 
### We first have to import the module and then compile a pattern

In [180]:
import re
pattern = re.compile(r'giles')
pattern

re.compile(r'giles', re.UNICODE)

In [182]:
pattern.match('giles').group()

'giles'

In [183]:
me = pattern.match('giles')
me.group()

'giles'

In [179]:
me_2 = pattern.match('hello giles')
me_2.group()

AttributeError: 'NoneType' object has no attribute 'group'

In [189]:
pattern = re.compile(r"\s(\w+)")

In [190]:
me_3 = pattern.search('hello giles')

In [192]:
me_3.group(1)

'giles'

In [256]:
pattern_2 = re.compile(r'g\D+')

In [254]:
pattern_2 = re.compile(r'g\w+')

In [258]:
pattern_2.match('giles1234')

<re.Match object; span=(0, 5), match='giles'>

In [23]:
string = '''image01.jpg,image01.png,image02.jpg,my_doc.doc,my_doc2.docx,london_bridge.gif, another_doc.doc,my_pdf.pdf'''
string1 = '''123.34 m3,23 m3,231.0 m3'''

In [24]:
pattern_3 = re.compile(r'(\w+)\.(jpg|png|gif)')

In [25]:
pattern_4 = re.compile(r'(\d+)\.?(\d*)\s(?=m3)')

In [26]:
for m in re.finditer(pattern_4, string1):
    print(m.group())

123.34 
23 
231.0 


In [27]:
for m in re.finditer(pattern_3, string):
    print(m.group())

image01.jpg
image01.png
image02.jpg
london_bridge.gif


# Type hints
For more information see <a href='https://docs.python.org/3/library/typing.html'>Python Docs</a>

In [28]:
def hello_1(name):
    return f'Hello {name}'

### With type hints

In [29]:
def hello_2(name: str) -> str:
    return f'Hello {name}'

In [30]:
hello_1('Giles')

'Hello Giles'

In [31]:
hello_2('Giles')

'Hello Giles'

In [32]:
list(range(2,5))

[2, 3, 4]

In [33]:
mylist = [['abc'], ['def', 'ghi']]
mylist[-1][-1][-1]

'i'

In [34]:
def eur_to_usd(euros, rate=0.8):
    return euros * rate
print(eur_to_usd(10))

8.0


In [37]:
weight = input("How many kg?")
price = weight * 2.5
print(price)

How many kg?1


TypeError: can't multiply sequence by non-int of type 'float'

In [38]:
for letter in 'abc':
    print(letter.upper())

A
B
C


In [39]:
for item in [[1, 2, 3], [4, 5, 6]]:
    print(item[0])

1
4


In [40]:
def foo(x):
    return x ** 2
print(foo("Hello"))

TypeError: unsupported operand type(s) for ** or pow(): 'str' and 'int'

In [41]:
def foo(a = 1, b = 'John'):
    return a + b
foo()

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [None]:
with open('articles.txt', 'w') as file:
        file.write('wiki_telephones')
 
with open('articles.txt', 'w') as file:
        file.write('wikipedia_telephones')
        file.readline()

In [None]:
 
with open('articles.txt', 'r') as file:
        print(file.readline())

### Regular Expression Tutorial

In [None]:
import re

In [70]:
a = "\tHello"
print(a)

b = r"\tHello"  # raw string
print(b)

	Hello
\tHello


In [86]:
test_string = "123abc456789abcABC"
pattern = re.compile(r"abc")

matches = re.finditer(pattern,test_string)

for match in matches :
    print(match)

<re.Match object; span=(3, 6), match='abc'>
<re.Match object; span=(12, 15), match='abc'>


In [88]:
matches = pattern.findall(test_string)
print(matches[0])

abc


### match(), search(), findall(), finditer()

In [73]:
# match only beginning of string
match = pattern.match(test_string)
print(match)

None


In [74]:
pattern_1 = re.compile(r"123")
match = pattern_1.match(test_string)
print(match)

<re.Match object; span=(0, 3), match='123'>


In [75]:
# search only first match
pattern_2 = re.compile(r"abc")
match = pattern_2.search(test_string)
print(match)

<re.Match object; span=(3, 6), match='abc'>


In [76]:
pattern = re.compile(r"abc")
matches = pattern.finditer(test_string)

# group, start, end, span
for match in matches :
    print(match.span(),match.start(),match.end())
    print(match.group())

(3, 6) 3 6
abc
(12, 15) 12 15
abc


### All meta characters: . ^ $ * + ? { } [ ] \ | ( )


In [77]:
# . Any character (except newline character) 
# ^ Starts with "^hello"
# $ Ends with "world\$"
# * Zero or more occurrences "aix*"
# + One or more occurrences "aix+"
# { } Exactly the specified number of occurrences "al{2}"
# [] A set of characters "[a-m]"
# \ Special sequence (or escape special characters) "\d"
# | Either of "fall|stays"
# () Capture and group

In [78]:
pattern = re.compile(r".")
matches = pattern.finditer(test_string)

# group, start, end, span
for match in matches :
#     print(match.span(),match.start(),match.end())
    print(match.group())

1
2
3
a
b
c
4
5
6
7
8
9
a
b
c
A
B
C


In [79]:
test_string = "123abc456789abcABC."
pattern = re.compile(r"\.")
pattern_1 = re.compile(r"^abc")
matches = pattern.finditer(test_string)
matches1 = pattern_1.finditer(test_string)
# group, start, end, span
for match in matches :
    print(match.group())
for match in matches1 :
    print(match.group())

.


In [80]:
test_string = "123abc456789abcABCsnv"
pattern = re.compile(r"snv$")

matches = pattern.finditer(test_string)
for match in matches :
    print(match)

<re.Match object; span=(18, 21), match='snv'>


<b>[aeiou]</b>	Match any vowel <br>
<b>[^aeiou]	</b> ^ inverts selection, this matches any consonant<br>
<b>[a-z]</b> Match any lowercase leter from a-z<br>
<b>\d</b> Matches any decimal digit; this is equivalent to the class [0-9]<br>
<b>\D</b> Matches any non-digit character; this is equivalent to the class [^0-9]<br>
<b>\s</b> Matches any whitespace character; this is equivalent to the class [ \t\n\r\f\v]<br>
<b>\S</b> Matches any non-whitespace character; this is equivalent to the class [^ \t\n\r\f\v]<br>
<b>\w</b> Matches any alphanumeric character; this is equivalent to the class [a-zA-Z0-9_]<br>
<b>\W</b> Matches any non-alphanumeric character; this is equivalent to the class [^a-zA-Z0-9_]

In [81]:
test_string = 'hello 123_ heyho hohey hoha'

pattern = re.compile(r'\bho')
matches = pattern.finditer(test_string)
for match in matches :
    print(match)

<re.Match object; span=(17, 19), match='ho'>
<re.Match object; span=(23, 25), match='ho'>


In [82]:
test_string = 'HELLOhello 123_'
# it's not looking for 'lo'
pattern = re.compile(r'[a-zA-Z]') # [a-z]
matches = pattern.finditer(test_string)
for match in matches :
    print(match)

<re.Match object; span=(0, 1), match='H'>
<re.Match object; span=(1, 2), match='E'>
<re.Match object; span=(2, 3), match='L'>
<re.Match object; span=(3, 4), match='L'>
<re.Match object; span=(4, 5), match='O'>
<re.Match object; span=(5, 6), match='h'>
<re.Match object; span=(6, 7), match='e'>
<re.Match object; span=(7, 8), match='l'>
<re.Match object; span=(8, 9), match='l'>
<re.Match object; span=(9, 10), match='o'>


### Quantifier:

<b>*</b> : 0 or more <br>
<b>+</b> : 1 or more <br>
<b>?</b> : 0 or 1, -> optional character <br>
<b>{4}</b> : exact number <br>
<b>{4,6}</b> : range numbers (min,max) <br>

In [None]:
test_string = 'hello_123'
# it's not looking for 'lo'
pattern = re.compile(r'\d+')
matches = pattern.finditer(test_string)
for match in matches :
    print(match)

In [42]:
date = """
01.04.2020
2020.04.03

2020-04-01
2020-05-23
2020-06-11
2020-07-11
2020-08-11

2020/04/02

2020_04_04
2020_o4_05

"""
pattern = re.compile(r'\d\d\d\d[-/]\d\d[-/]\d\d')
matches = pattern.finditer(date)

for match in matches :
    print(match)

<re.Match object; span=(24, 34), match='2020-04-01'>
<re.Match object; span=(35, 45), match='2020-05-23'>
<re.Match object; span=(46, 56), match='2020-06-11'>
<re.Match object; span=(57, 67), match='2020-07-11'>
<re.Match object; span=(68, 78), match='2020-08-11'>
<re.Match object; span=(80, 90), match='2020/04/02'>


In [43]:
pattern = re.compile(r'\d{4}[-/]0[567][-/]\d{2}')
matches = pattern.finditer(date)

for match in matches :
    print(match)

<re.Match object; span=(35, 45), match='2020-05-23'>
<re.Match object; span=(46, 56), match='2020-06-11'>
<re.Match object; span=(57, 67), match='2020-07-11'>


### Conditions

In [44]:
my_string = """
hello world
1223
2020-05-20
Mr Simpson
Mrs Simpson
Mr. Brown
Ms Smith
Mr. T
pythonengineer@gmail.com
Python-engineer@gmx.de
python-engineer123@my-domain.org
"""

#pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s\w+')
#pattern = re.compile(r'M\w+\.?\s\w+')
pattern = re.compile(r'([a-zA-Z0-9-]+)@([a-zA-Z-]+)\.([a-zA-Z]+)')
matches = pattern.finditer(my_string)

# Using group method
for match in matches :
#     print(match.group(0))
#     print(match.group(1))
#      print(match.group(2))
     print(match.group(3))

com
de
org


### Modifications 

In [45]:
test_string = "123abc456789abc123ABC"
test_string1 = 'hello world, you are the best world'
# split, sub
pattern = re.compile(r'abc')
sub_pattern = re.compile(r"world")
splitted = pattern.split(test_string)
subbed_string = sub_pattern.sub("planet",test_string1)
print(subbed_string)
# print(splitted)




hello planet, you are the best planet


In [46]:
print(splitted)

['123', '456789', '123ABC']


### Combine together

In [47]:
urls = """

http://python-engineer.com
https://www.python-engineer.com
http://www.pyeng.net
http://www.facebook.com

"""

pattern = re.compile(r"https?://(www\.)?([a-zA-Z-]+)(\.[a-zA-Z]+)")
matches = pattern.finditer(urls)
for match in matches:
    print(match.group(3))
    
subbed_urls = pattern.sub(r"\2\3",urls)
print(subbed_urls)

.com
.com
.net
.com


python-engineer.com
python-engineer.com
pyeng.net
facebook.com




### Compilation Flags:

In [48]:
my_string= "Hello World"
pattern = re.compile(r"hel",re.IGNORECASE)
matches = pattern.finditer(my_string)

for match in matches :
    print(match)

<re.Match object; span=(0, 3), match='Hel'>


### Using Regex with pandas

In [52]:
summer = pd.read_csv('summer.csv')
summer

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver
...,...,...,...,...,...,...,...,...,...
31160,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31161,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31162,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31163,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [245]:
wrestling = summer[summer.Sport=='Wrestling']
wrestling

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
148,1896,Athens,Wrestling,Wrestling Gre-R,"CHRISTOPOULOS, Stephanos",GRE,Men,Open Event,Bronze
149,1896,Athens,Wrestling,Wrestling Gre-R,"SCHUMANN, Carl",GER,Men,Open Event,Gold
150,1896,Athens,Wrestling,Wrestling Gre-R,"TSITAS, Georgios",GRE,Men,Open Event,Silver
1112,1904,St Louis,Wrestling,Wrestling Free.,"THIEFENTHALER, Gustav",USA,Men,- 47.6KG (Light-Flyweight),Bronze
1113,1904,St Louis,Wrestling,Wrestling Free.,"CURRY, Robert",USA,Men,- 47.6KG (Light-Flyweight),Gold
...,...,...,...,...,...,...,...,...,...
31160,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31161,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31162,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31163,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [237]:
data = {'Description':['Made payment on 20/03/2020','Meeting with clients (21/09/2020)','Christmas party on 24/12/2020','Valentine day is 14/02/2021','14/06/2020 is a hot day in Thailand'],
       'value':[200,100,150,250,300]}
df = pd.DataFrame(data)
df

Unnamed: 0,Description,value
0,Made payment on 20/03/2020,200
1,Meeting with clients (21/09/2020),100
2,Christmas party on 24/12/2020,150
3,Valentine day is 14/02/2021,250
4,14/06/2020 is a hot day in Thailand,300


In [284]:
df_copy = df.copy()
df_copy.loc[:,'Date'] = None
df_copy

Unnamed: 0,Description,value,Date
0,Made payment on 20/03/2020,200,
1,Meeting with clients (21/09/2020),100,
2,Christmas party on 24/12/2020,150,
3,Valentine day is 14/02/2021,250,
4,14/06/2020 is a hot day in Thailand,300,


In [234]:
# df_copy.loc[5] = ['Happy Mother Day',10,None]
# df_copy

Unnamed: 0,Description,value,Date
0,Made payment on 20/03/2020,200,
1,Meeting with clients (21/09/2020),100,
2,Christmas party on 24/12/2020,150,
3,Valentine day is 14/02/2021,250,
4,14/06/2020 is a hot day in Thailand,300,
5,Happy Mother Day,10,


In [285]:
index_description = df_copy.columns.get_loc('Description')
index_date = df_copy.columns.get_loc('Date')
print(index_description,index_date)

0 2


In [262]:
pattern = re.compile(r'\d{2}\/\d{2}\/\d{4}')
for row in range(0,len(df_copy)):
    date = pattern.search(df_copy.iat[row,index_description]).group()
    df_copy.iat[row,index_date] = date
    
df_copy

Unnamed: 0,Description,value,Date
0,Made payment on 20/03/2020,200,20/03/2020
1,Meeting with clients (21/09/2020),100,21/09/2020
2,Christmas party on 24/12/2020,150,24/12/2020
3,Valentine day is 14/02/2021,250,14/02/2021
4,14/06/2020 is a hot day in Thailand,300,14/06/2020


In [286]:
pattern = r'\d{2}\/(\d{2})\/(\d{4})'
for row in range(0,len(df_copy)) :
    date = re.search(pattern,df_copy.iat[row,index_description]).group(2)
    df_copy.iat[row,index_date] = date
    


In [287]:
df_copy

Unnamed: 0,Description,value,Date
0,Made payment on 20/03/2020,200,2020
1,Meeting with clients (21/09/2020),100,2020
2,Christmas party on 24/12/2020,150,2020
3,Valentine day is 14/02/2021,250,2021
4,14/06/2020 is a hot day in Thailand,300,2020


In [282]:
pattern_1 = re.compile(r'\d{2}\/\d{2}\/(\d{4})')
a = []
for row in df_copy.test :
    subed = pattern_1.sub(r'\1',row)
    a.append(int(subed))
df_copy['Year'] = a
df_copy


Unnamed: 0,Description,value,Date,test,Year
0,Made payment on 20/03/2020,200,20/03/2020,20/03/2020,2020
1,Meeting with clients (21/09/2020),100,21/09/2020,21/09/2020,2020
2,Christmas party on 24/12/2020,150,24/12/2020,24/12/2020,2020
3,Valentine day is 14/02/2021,250,14/02/2021,14/02/2021,2021
4,14/06/2020 is a hot day in Thailand,300,14/06/2020,14/06/2020,2020


In [283]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  5 non-null      object
 1   value        5 non-null      int64 
 2   Date         5 non-null      object
 3   test         5 non-null      object
 4   Year         5 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 328.0+ bytes


In [266]:
wl_copy = wrestling.copy()
new_wrestling = wl_copy[wl_copy.Event.str.contains(r"\w+\s(\d+)\sKG",case=False)]
new_wrestling

  return func(self, *args, **kwargs)


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
31097,2012,London,Wrestling,Wrestling Freestyle,"OBARA, Hitomi",JPN,Women,Wf 48 KG,Gold
31098,2012,London,Wrestling,Wrestling Freestyle,"STADNYK, Mariya",AZE,Women,Wf 48 KG,Silver
31099,2012,London,Wrestling,Wrestling Freestyle,"CHUN, Clarissa Kyoko Mei Ling",USA,Women,Wf 48 KG,Bronze
31100,2012,London,Wrestling,Wrestling Freestyle,"HUYNH, Carol",CAN,Women,Wf 48 KG,Bronze
31101,2012,London,Wrestling,Wrestling Freestyle,"OTARSULTANOV, Dzhamal",RUS,Men,Wf 55 KG,Gold
...,...,...,...,...,...,...,...,...,...
31160,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31161,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31162,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31163,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze


In [260]:
pattern = re.compile(r"\w+\s(\d+)\sKG")
new_wrestling['New Event1'] = new_wrestling.Event.apply(lambda x : pattern.findall(x)[0])
new_wrestling

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_wrestling['New Event1'] = new_wrestling.Event.apply(lambda x : pattern.findall(x)[0])


Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal,New Event,New Event1
31097,2012,London,Wrestling,Wrestling Freestyle,"OBARA, Hitomi",JPN,Women,Wf 48 KG,Gold,48,48
31098,2012,London,Wrestling,Wrestling Freestyle,"STADNYK, Mariya",AZE,Women,Wf 48 KG,Silver,48,48
31099,2012,London,Wrestling,Wrestling Freestyle,"CHUN, Clarissa Kyoko Mei Ling",USA,Women,Wf 48 KG,Bronze,48,48
31100,2012,London,Wrestling,Wrestling Freestyle,"HUYNH, Carol",CAN,Women,Wf 48 KG,Bronze,48,48
31101,2012,London,Wrestling,Wrestling Freestyle,"OTARSULTANOV, Dzhamal",RUS,Men,Wf 55 KG,Gold,55,55
...,...,...,...,...,...,...,...,...,...,...,...
31160,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze,84,84
31161,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold,96,96
31162,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver,96,96
31163,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze,96,96
