# Regular expressions

Reference

1. parsing log files - https://remejy.com/method-to-parse-a-tomcat-access-log-file-with-python-regex/

In [1]:
import re
import pandas as pd

In [2]:
# Sample sentence used as the input string for the regex calls in this notebook
text = 'Around 2,500 patients are taking part in clinical trails #Coronavirus'

### FIND operation

**Extracting single characters, only lower case**


In [4]:

re.findall('[a-z]', text)[:10] # Find every single lower-case character, then keep only the first 10 matches

['r', 'o', 'u', 'n', 'd', 'p', 'a', 't', 'i', 'e']

**Extracting groups of consecutive characters (words), lower case only**

In [5]:
print(text)
# Upper-case letters are not part of [a-z], so a capitalised word is matched without its first letter
re.findall('[a-z]+', text) # Find all sequences of consecutive lower-case characters

Around 2,500 patients are taking part in clinical trails #Coronavirus


['round',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'oronavirus']

**Extracting groups of consecutive characters (words), both lower and upper case**

In [6]:
print(text)
# Digits and punctuation are not in [a-zA-Z], so '2,500' and '#' produce no match
re.findall('[a-zA-Z]+', text) # Find all sequences of consecutive lower- and upper-case letters

Around 2,500 patients are taking part in clinical trails #Coronavirus


['Around',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'Coronavirus']

**Extracting sequences of word characters (digits, lower- and upper-case letters, and underscores)**

In [8]:
print(text)
# Raw string: '\w' in a normal literal is an invalid escape sequence and triggers a warning.
# The comma is not a word character, so '2,500' is split into '2' and '500'.
re.findall(r'\w+', text)  # Find all sequences of word characters (letters, digits, underscore)

Around 2,500 patients are taking part in clinical trails #Coronavirus


['Around',
 '2',
 '500',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'Coronavirus']

### SUBSTITUTE operation

**Replace anything that is NOT a word character (i.e. not a letter, digit or underscore) with nothing — whitespace is removed as well**<br>
NOT is specified by placing the caret (`^`) symbol at the start of the character class.

In [9]:
print(text)
# Inside a character class '+' is a literal plus sign, not a quantifier, so the
# previous pattern '[^\w+]' silently kept '+' characters. The class now contains
# only \w, and the pattern is a raw string so '\w' is not an invalid escape.
re.sub(r'[^\w]', '', text)

Around 2,500 patients are taking part in clinical trails #Coronavirus


'Around2500patientsaretakingpartinclinicaltrailsCoronavirus'

**Replace anything that is neither a word character (letter, digit or underscore) nor whitespace with nothing — here the whitespace is retained**

In [10]:
print(text)
# Keep word characters and whitespace; drop everything else.
# '+' was removed from the class: inside [...] it is a literal plus sign, so the
# previous pattern '[^\w+\s]' left '+' characters in the result.
re.sub(r'[^\w\s]', '', text)

Around 2,500 patients are taking part in clinical trails #Coronavirus


'Around 2500 patients are taking part in clinical trails Coronavirus'

## Extracting hashtags

**Extract any sequence of word characters (letters/digits/underscores) that starts with a `#` symbol**

In [11]:
print(text)
# Raw string so that '\w' reaches the regex engine without an invalid-escape warning
print(re.findall(r'#\w+', text))

Around 2,500 patients are taking part in clinical trails #Coronavirus
['#Coronavirus']


In [28]:
# Load the tweets CSV into a DataFrame and show its shape and first rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# on the author's machine — consider a configurable data directory.
tweets = pd.read_csv('/Users/sylvia/Desktop/datasets/tweets_donald_trump.csv')
print(tweets.shape)
tweets.head()

(400, 5)


Unnamed: 0,created_at,language,likes,retweets,text
0,2020-06-17 03:27:56,en,123212.0,18568.0,96% Approval Rating in the Republican Party. T...
1,2020-06-17 02:45:33,und,0.0,7942.0,RT @TONYxTWO: @thejtlewis @JoeBiden https://t....
2,2020-06-17 02:38:20,en,0.0,23815.0,RT @thejtlewis: “Trump isn’t going to accept t...
3,2020-06-17 02:37:01,en,0.0,6781.0,"RT @thejtlewis: With the utmost respect, I tha..."
4,2020-06-17 02:31:11,en,56840.0,14231.0,A GREAT woman. Her son is looking down from he...


In [29]:
# List the hashtags found in each tweet and preview the first 50 rows.
# The vectorised .str accessor replaces apply + lambda and leaves missing values
# as NaN instead of raising; the pattern is a raw string so '\w' is a valid escape.
tweets['text'].str.findall(r'#\w+').head(50)
                                                          

0                        []
1                        []
2                        []
3                        []
4                        []
5                        []
6                        []
7                        []
8                        []
9                        []
10                       []
11                       []
12                       []
13                       []
14                       []
15                       []
16                       []
17                       []
18                       []
19                       []
20                       []
21                       []
22                       []
23                       []
24                       []
25                       []
26                       []
27                       []
28                       []
29                       []
30                       []
31                       []
32                       []
33                       []
34                       []
35                  

In [30]:
# Preview only: print the hashtags found in each of the first 10 tweets.
# Nothing is accumulated here, so the unused list initialisation was dropped.
for row in tweets['text'].head(10):
    print(re.findall(r'#\w+', row))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


**Find all hashtags in all rows & store in a list**

In [31]:
# Flatten the per-tweet matches into a single list holding every hashtag
# occurrence (duplicates kept) in order of appearance.
# Raw string so that '\w' is not treated as an invalid string escape.
all_hashtags = [
    hashtag
    for row in tweets['text']
    for hashtag in re.findall(r'#\w+', row)
]

In [32]:
# Total number of hashtag occurrences collected (duplicates included)
len(all_hashtags)

23

In [33]:
# First 10 collected hashtags, in the order they were found
all_hashtags[:10]

['#ArmyBda',
 '#HappyBirthdayTrump',
 '#GoArmy',
 '#USMA2020',
 '#WithVisionWeLead',
 '#JUSTI',
 '#BarelyThereBiden',
 '#MAGA',
 '#NHSEN',
 '#2A']

**Count how many times each hashtag appears in the data using `value_counts`**

In [27]:
# Count occurrences of each distinct hashtag; value_counts() lists the most frequent first.
# The comparison is case-sensitive, so e.g. '#JobsReport' and '#jobsreport' are counted separately.
freq_hashtags = pd.Series(all_hashtags).value_counts()
freq_hashtags.head(10)

#MAGA            4
#2A              2
#NH01            1
#FoxNews         1
#Democrats       1
#PPPworks        1
#JobsReport      1
#jobsreport      1
#AMERICAFirst    1
#WGDP            1
dtype: int64

## Groups

In [34]:
# Load the data-science job postings and preview the first two rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# on the author's machine — consider a configurable data directory.
jobs = pd.read_csv('/Users/sylvia/Desktop/datasets/datascience_jobs.csv')
jobs.head(2)

Unnamed: 0,title,location,experience,skills,company,salary,description,posted_date
0,Data Science,Mumbai,2-4 yrs,"Algorithms, Machine Learning, Python, Java, Da...",Netcore Solutions Pvt Ltd,"2,00,000 - 7,00,000 P.A.",At least 2 year of experience in data engineer...,1 day ago
1,Analyst / Sr. Analyst (data Science),Gurgaon,5-8 yrs,"predictive modeling, predictive analytics, mac...",Cvent India Pvt. Ltd.,"5,00,000 - 10,00,000 P.A.",Strong experience on providing predictive mode...,Today


In [37]:
# List the distinct raw salary strings to see which formats the extraction must handle
jobs['salary'].unique()

array(['    2,00,000 - 7,00,000 P.A.  ',
       '    5,00,000 - 10,00,000 P.A.  ',
       '    10,00,000 - 15,00,000 P.A.  ', '  Not disclosed ',
       '    2,00,000 - 4,25,000 P.A.  ', '    4,00,000 - 8,00,000 P.A.  ',
       '    1,00,000 - 3,00,000 P.A.  ', '    1,25,000 - 3,00,000 P.A.  ',
       '    1,25,000 - 4,00,000 P.A. Best salary as per Industry & ...  ',
       '    2,50,000 - 7,50,000 P.A.  ', '    4,00,000 - 7,50,000 P.A.  ',
       '    22,50,000 - 35,00,000 P.A.  ',
       '    12,00,000 - 18,00,000 P.A.  ',
       '    13,00,000 - 20,00,000 P.A.  ',
       '    3,00,000 - 8,00,000 P.A. 40%  ',
       '    8,00,000 - 10,00,000 P.A.  ',
       '    1,00,000 - 4,00,000 P.A.  ', '    50,000 - 2,00,000 P.A.  ',
       '    7,00,000 - 17,00,000 P.A.  ',
       '    6,00,000 - 12,00,000 P.A.  ',
       '    10,00,000 - 20,00,000 P.A.  ', '    Rs. 1.50 Lac/month  ',
       '    PB-3 Rs15600-39100 with minimum pay of Rs30000 + AGP of...  ',
       '    Fellowship amount is Rs

**Extract salary range**

In [38]:
salary = '5,00,000 - 10,00,000 P.A.'

# Remove the thousands-separator commas (replace them with an empty string) before matching.
# The dots in "P.A." are escaped: an unescaped '.' matches any character, not just a literal dot.
re.findall(r'[0-9]+ - [0-9]+ P\.A\.', salary.replace(',', ''))


['500000 - 1000000 P.A.']

**Extract minimum number/feature from range using grouping**

In [40]:
# The parentheses form a capturing group, so findall returns only the lower bound.
# Dots are escaped so that "P.A." is matched literally.
re.findall(r'([0-9]+) - [0-9]+ P\.A\.', salary.replace(',', ''))

['500000']

**Extract maximum number/feature from range using grouping**

In [41]:
# The capturing group is now around the second number, so findall returns only the upper bound.
# Dots are escaped so that "P.A." is matched literally.
re.findall(r'[0-9]+ - ([0-9]+) P\.A\.', salary.replace(',', ''))

['1000000']

**Extract salary from whole dataset**

In [42]:
def get_salary_min(row):
    """Return the lower bound of a '<min> - <max> P.A.' salary range.

    Parameters
    ----------
    row : str
        Raw salary text, e.g. '    5,00,000 - 10,00,000 P.A.  '. Commas used as
        digit-group separators are removed before matching.

    Returns
    -------
    str or None
        The digits of the minimum salary (e.g. '500000'), or None when the
        value is not a string (missing cells such as NaN/None) or contains no
        '<min> - <max> P.A.' range.
    """
    # Missing cells arrive as NaN/None and have no .replace(); treat them as "no salary"
    if not isinstance(row, str):
        return None

    cleaned = row.replace(',', '')
    # Dots are escaped so only a literal "P.A." matches (a bare '.' matches any character)
    match = re.search(r'([0-9]+) - [0-9]+ P\.A\.', cleaned)
    return match.group(1) if match else None

In [43]:
# Apply the extractor to every salary string; rows without a "<min> - <max> P.A." range get None
jobs['salary_min'] = jobs['salary'].apply(get_salary_min)
# Show the raw text next to the extracted value to check the result
jobs[['salary', 'salary_min']].head()

Unnamed: 0,salary,salary_min
0,"2,00,000 - 7,00,000 P.A.",200000.0
1,"5,00,000 - 10,00,000 P.A.",500000.0
2,"10,00,000 - 15,00,000 P.A.",1000000.0
3,Not disclosed,
4,Not disclosed,


**Convert Salary from String datatype to numeric column so mathematical operations can be performed**

In [45]:
# Convert the extracted values to a numeric dtype so arithmetic can be done on the column.
# errors='coerce' replaces every value that cannot be parsed as a number with NaN
# instead of raising an error.
jobs['salary_min'] = pd.to_numeric(jobs['salary_min'],
                                  errors='coerce')

# Display the converted column
jobs['salary_min']

0        200000.0
1        500000.0
2       1000000.0
3             NaN
4             NaN
          ...    
3995          NaN
3996          NaN
3997          NaN
3998          NaN
3999          NaN
Name: salary_min, Length: 4000, dtype: float64