## Pandas with regex

In [1]:
import pandas as pd
# Sample corpus: one free-text schedule note per weekday, each mentioning at
# least one clock time in h:mm(am|pm) form.
time_sentence = [
    "Sunday: I start from home for work around 9:15am",
    "Monday: Usual break time is around 1:35pm",
    "Tuesday: Weekly meeting is around 3:25pm",
    "Wednesday: Zoom or teams meeting around 2:40pm",
    "Thursday: Site meetings at 10:30am as well as progress meeting 2:30pm",
]
                

In [2]:
# Wrap the sentences in a single-column frame so the vectorised `.str`
# accessor can be used on them below.
df = pd.DataFrame({"text": time_sentence})
df

Unnamed: 0,text
0,Sunday: I start from home for work around 9:15am
1,Monday: Usual break time is around 1:35pm
2,Tuesday: Weekly meeting is around 3:25pm
3,Wednesday: Zoom or teams meeting around 2:40pm
4,Thursday: Site meetings at 10:30am as well as ...


In [3]:
# Character count of every sentence (whitespace and punctuation included).
df["text"].str.len()

0    48
1    41
2    40
3    46
4    69
Name: text, dtype: int64

In [4]:
# Token count per sentence: split on runs of whitespace, then measure the
# length of each resulting list.
(
    df["text"]
    .str.split()
    .str.len()
)

0     9
1     7
2     6
3     7
4    11
Name: text, dtype: int64

### 1. Find strings containing a word and count digits

In [5]:
# Boolean mask: True for sentences containing the substring "meeting"
# (a substring match, so "meetings" qualifies too).
df["text"].str.contains("meeting")

0    False
1    False
2     True
3     True
4     True
Name: text, dtype: bool

In [6]:
# Number of digit characters (\d) found in each sentence.
df["text"].str.count(r"\d")

0    3
1    3
2    3
3    3
4    7
Name: text, dtype: int64

In [7]:
# List every individual digit character per sentence, in order of appearance.
df["text"].str.findall(r"\d")

0                [9, 1, 5]
1                [1, 3, 5]
2                [3, 2, 5]
3                [2, 4, 0]
4    [1, 0, 3, 0, 2, 3, 0]
Name: text, dtype: object

### 2. Find time strings and mask weekday names

In [8]:
# Collect all hour:minute pairs per sentence. Because the pattern has two
# capture groups, each match is returned as an (hour, minute) tuple.
df["text"].str.findall(r"(\d?\d):(\d?\d)")

0              [(9, 15)]
1              [(1, 35)]
2              [(3, 25)]
3              [(2, 40)]
4    [(10, 30), (2, 30)]
Name: text, dtype: object

In [9]:
# Mask the weekday name (any word ending in "day") with a "???" placeholder.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would silently leave every
# sentence unchanged.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

0        ???: I start from home for work around 9:15am
1               ???: Usual break time is around 1:35pm
2                 ???: Weekly meeting is around 3:25pm
3             ???: Zoom or teams meeting around 2:40pm
4    ???: Site meetings at 10:30am as well as progr...
Name: text, dtype: object

### 3. Abbreviate weekday names and extract time components

In [10]:
# Shorten each weekday name to its first three letters ("Sunday" -> "Sun").
# The callable receives the regex match object; group 1 is the captured
# weekday word. A callable replacement is only accepted in regex mode, so
# regex=True must be given explicitly (the default is literal matching since
# pandas 2.0, where this call would otherwise raise ValueError).
df['text'].str.replace(
    r'(\w+day\b)',
    lambda match: match.group(1)[:3],
    regex=True,
)

0        Sun: I start from home for work around 9:15am
1               Mon: Usual break time is around 1:35pm
2                 Tue: Weekly meeting is around 3:25pm
3             Wed: Zoom or teams meeting around 2:40pm
4    Thu: Site meetings at 10:30am as well as progr...
Name: text, dtype: object

In [11]:
# Pull only the FIRST hour:minute pair out of each sentence; each capture
# group becomes its own column (0 = hour, 1 = minute).
df["text"].str.extract(r"(\d?\d):(\d?\d)")

Unnamed: 0,0,1
0,9,15
1,1,35
2,3,25
3,2,40
4,10,30


In [12]:
# Extract EVERY clock time per sentence, one row per match (indexed by the
# original row and a `match` counter). Columns:
#   0 = full time string, 1 = hour, 2 = minute, 3 = am/pm marker.
# The minutes group is mandatory here: the previous pattern had `(\d?\d)?`,
# which made the whole minutes group optional and would accept malformed
# input such as "9:am" with a missing minute. The ` ?` instead tolerates an
# optional single space before the am/pm marker (e.g. "9:15 am").
df['text'].str.extractall(r'((\d?\d):(\d?\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,9:15am,9,15,am
1,0,1:35pm,1,35,pm
2,0,3:25pm,3,25,pm
3,0,2:40pm,2,40,pm
4,0,10:30am,10,30,am
4,1,2:30pm,2,30,pm
