In [1]:
# Colab-only setup: mount Google Drive into the runtime at /content/drive.
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Folder on the mounted Drive, presumably holding the workshop files.
# NOTE(review): no cell shown here reads PATH - confirm it is still needed.
PATH = '/content/drive/MyDrive/NLPWorkShopANPAOct2021/'

### Text data is growing fast!

* Data continues to grow exponentially
  * Estimated to be 2.5 Exabytes (2.5 million TB) a day
  * Grow to 40 Zettabytes (40 billion TB) by 2020 (50 times that of 2010)

* Approximately 80% of all the data is estimated to be unstructured, text-rich data






### So, what can be done with text?

* Parse text
* Find/Identify/Extract relevant information from the text
* Classify text documents
* Search for relevant text documents
* Sentiment analysis
* Topic modeling
* ***



# Natural Language Processing

* Any computation, manipulation of natural language

## Natural Languages Evolve

* new words get added
* old words lose popularity
* meanings of words change
* language rules themselves may change

## NLP Tasks:

* Counting words, counting frequency of words
* Finding sentence boundaries
* Part of speech tagging
* Parsing the sentence structure
* Identifying semantic role
* Identifying entities in a sentence
* Finding which pronoun refers to which entity



In [3]:
# Sample sentence used throughout the first examples (note the trailing space).
text1 = "Ethics are built right into the ideals and objectives of the United Nations "

# Number of characters in text1, spaces included
len(text1)


76

In [5]:
# Lowercased copy of text1; text1 itself is not modified.
text1.lower()

'ethics are built right into the ideals and objectives of the united nations '

In [23]:
# Split text1 into a list of words. Called without an argument, split()
# separates on runs of whitespace and ignores leading/trailing whitespace,
# so the trailing space in text1 does not produce an empty string.
text2 = text1.split()
text2



['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations']

### List comprehension allows us to find specific words:

In [8]:
# Words in text2 that are longer than 3 characters
[w for w in text2 if len(w)>3]

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [9]:
 # Title-cased words in text2 (first letter uppercase, remaining letters lowercase)
 [w for w in text2 if w.istitle()]

['Ethics', 'United', 'Nations']

In [10]:
# Lowercase every word in text2
[w.lower() for w in text2]

['ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'united',
 'nations']

In [24]:
# A short phrase: report its character count, then its word count.
text3 = 'To be or not to be'
text4 = text3.split()
print(len(text3))
print(len(text4))

18
6


In [25]:
# Number of distinct words in text4; the comparison is case-sensitive
len(set(text4))

5

In [26]:
# The distinct words themselves: 'To' and 'to' are separate entries
set(text4)


{'To', 'be', 'not', 'or', 'to'}

In [27]:
# Lowercase each word before deduplicating so that 'To' and 'to' count once.
text5 = {w.lower() for w in text4}
print(len(text5))

4


In [32]:
# Tweet-style text: a quoted sentence followed by a hashtag, a mention and a link.
text5 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @NY Society for Ethical Culture bit.ly/2guVelr'
)
# Whitespace-separated tokens of the tweet
text6 = text5.split()
text6

['"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations"',
 '#UNSG',
 '@NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr']

In [30]:
# Tokens of text6 that begin with '#' (hashtags)
[w for w in text6 if w.startswith('#')]

['#UNSG']

In [33]:
# Tokens of text6 that begin with '@' (mentions)
[w for w in text6 if w.startswith('@')]

['@NY']

# NLP Basics: Learning how to use regular expressions(REGEX)

### Using regular expressions in Python

Python's `re` package is the most commonly used regex resource. More details can be found [here](https://docs.python.org/3/library/re.html).

We can use regular expressions to help us with more complex parsing.

For example '@[A-Za-z0-9_]+' will return all words that:

* start with '@' and are followed by at least one:
* capital letter ('A-Z')
* lowercase letter ('a-z')
* number ('0-9')
* or underscore ('_')

![Meta Characters](https://drive.google.com/uc?id=1mPwKJUQI0KsZ7h3T7wCWQDxRmYbb1M0f)

![MetaSymbols](https://drive.google.com/uc?id=1amUCUTF_C6YM08iXIPm35yFdWajLdEdx)

![MetaRepetitions](https://drive.google.com/uc?id=13CSdKKBOXNKTQXoutG0PUMaX0DwFXwBW)








In [36]:

# Tweet with two handles plus a bare '@' that is separated from 'NY' by a space.
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text8 = text7.split()

In [40]:

import re 
# Keep tokens containing '@' followed by one or more letters, digits or underscores.
# The lone '@' token is rejected because '+' requires at least one such character.
[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]


['@UN', '@UN_Women']

In [41]:
# Hashtag words: tokens containing '#' followed by letters, digits or underscores
[w for w in text8 if re.search('#[A-Za-z0-9_]+', w)]

['#UNSG']

In [42]:

# Three variants of one sentence for comparing regex methods.
# Words separated by single spaces:
re_test = 'This is a made up string to test 2 different regex methods'
# Words separated by runs of several spaces:
re_test_messy = 'This      is a made up     string to test 2    different regex methods'
# Words separated by punctuation instead of whitespace:
re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

### Splitting a sentence into a list of words

In [43]:
# Split on each single whitespace character. The pattern is a raw string so the
# backslash reaches the regex engine without an invalid-escape warning.
re.split(r'\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [44]:
# Splitting on single whitespace characters yields an empty string for every
# extra space in a run (raw string avoids the invalid-escape warning).
re.split(r'\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [45]:
# '+' makes each run of whitespace a single separator, so no empty strings appear
# (raw string avoids the invalid-escape warning).
re.split(r'\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [46]:
# This string contains no whitespace, so splitting on whitespace returns it whole
# (raw string avoids the invalid-escape warning).
re.split(r'\s+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [48]:
# Split on runs of non-word characters, i.e. anything other than letters, digits
# or underscore (raw string avoids the invalid-escape warning).
re.split(r'\W+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [50]:
# Collect every run of non-whitespace characters
# (raw string avoids the invalid-escape warning).
re.findall(r'\S+', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [51]:
# Runs of non-whitespace characters; repeated spaces between words do not matter
# (raw string avoids the invalid-escape warning).
re.findall(r'\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [52]:
# With no whitespace in the string, the whole string is one non-whitespace run
# (raw string avoids the invalid-escape warning).
re.findall(r'\S+', re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [53]:
# Collect runs of word characters (letters, digits, underscore), which skips the
# punctuation separators (raw string avoids the invalid-escape warning).
re.findall(r'\w+', re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

### Replacing a specific string

In [54]:
# Three sentences that differ only in the token after "follow".
pep8_test = 'I try to follow PEP8 guidelines'
pep7_test = 'I try to follow PEP7 guidelines'
peep8_test = 'I try to follow 8PEEP8 guidelines'

In [55]:
# Runs of lowercase letters only, so 'I' and 'PEP8' are not returned
re.findall('[a-z]+', pep8_test)

['try', 'to', 'follow', 'guidelines']

In [56]:
# Capital letters followed by digits; the leading '8' is not part of the match
re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

In [57]:
# Digits, then capital letters, then digits: matches the whole '8PEEP8' token
re.findall('[0-9]+[A-Z]+[0-9]+', peep8_test)

['8PEEP8']

In [61]:
# Substitute the digits-capitals-digits token with the replacement text;
# re.sub returns a new string and leaves peep8_test unchanged.
re.sub('[0-9]+[A-Z]+[0-9]+', 'PEP8 Python Styleguide', peep8_test)

'I try to follow PEP8 Python Styleguide guidelines'

### Other examples of regex methods

- re.search()
- re.match()
- re.fullmatch()
- re.finditer()
- re.escape()

In [None]:
# The same phone number written in four different formats.
texts = ["(800)-800-1111 call for free goodies",
         "(800) 800-1111 call for free goodies",
         "8008001111 call for free goodies",
         "800-800-1111 call for free goodies",
        ]

In [None]:
# Pattern: '(' + 3 digits, up to 3 non-digits, 3 digits, up to 3 non-digits, 4 digits.
# Only numbers that open with a parenthesis can match. The pattern is a raw string
# so the regex escapes are not treated as (invalid) Python string escapes.
for text in texts:
  phones = re.findall(r'\(\d{3}\D{0,3}\d{3}\D{0,3}\d{4}', text)
  print(text, phones)

(800)-800-1111 call for free goodies ['(800)-800-1111']
(800) 800-1111 call for free goodies ['(800) 800-1111']
8008001111 call for free goodies []
800-800-1111 call for free goodies []


In [None]:
# Pattern: optional '+' prefix, a digit, at least 7 digits/hyphens/spaces, a final digit.
# Parentheses are not allowed, so the "(800)" formats do not match.
# Compiled once, outside the loop, because the pattern does not depend on `text`.
phone_regex = re.compile(r"(?:\+ *)?\d[\d\- ]{7,}\d")
for text in texts:
  phone = phone_regex.findall(text)
  print(text, phone)

(800)-800-1111 call for free goodies []
(800) 800-1111 call for free goodies []
8008001111 call for free goodies ['8008001111']
800-800-1111 call for free goodies ['800-800-1111']


# Working with Text Data in pandas

In [62]:
import pandas as pd

# One reminder per weekday, each mentioning at least one clock time.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame whose only column is named 'text'
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [63]:
import string
# The ASCII punctuation characters, as one string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [64]:
tweet = "@nltk Text analysis is awesome! #regex #pandas #python"

# Print the hashtags: whitespace-separated tokens that begin with '#'
print(list(filter(lambda token: token.startswith('#'), tweet.split())))

['#regex', '#pandas', '#python']


In [65]:
# Add a 'length' column: the number of characters in each string of df['text']
df['length']=df['text'].str.len()

In [66]:
# Preview the frame with the new 'length' column
df.head()

Unnamed: 0,text,length
0,Monday: The doctor's appointment is at 2:45pm.,46
1,Tuesday: The dentist's appointment is at 11:30...,50
2,"Wednesday: At 7:00pm, there is a basketball game!",49
3,Thursday: Be back home by 11:15 pm at the latest.,49
4,"Friday: Take the train at 08:10 am, arrive at ...",54


In [71]:
import string
#punctuation percent

def count_punct(text):
  """Return the percentage of non-space characters in `text` that are punctuation.

  Args:
    text: The string to analyse.

  Returns:
    The percentage as a float rounded to one decimal place, or 0.0 when
    `text` contains no non-space characters.
  """
  # Only ' ' is excluded from the denominator; other whitespace still counts.
  non_space = len(text) - text.count(" ")
  if non_space == 0:
    # Empty or all-space input would otherwise raise ZeroDivisionError.
    return 0.0
  count = sum(1 for char in text if char in string.punctuation)
  # Scale to a percentage before rounding: multiplying after rounding can
  # reintroduce float noise (e.g. 0.333 * 100 -> 33.300000000000004).
  return round(100 * count / non_space, 1)

# Add a 'punct%' column by applying count_punct to every string in df['text']
df['punct%']=df['text'].apply(count_punct)

df.head()




Unnamed: 0,text,length,punct%
0,Monday: The doctor's appointment is at 2:45pm.,46,10.0
1,Tuesday: The dentist's appointment is at 11:30...,50,9.3
2,"Wednesday: At 7:00pm, there is a basketball game!",49,9.5
3,Thursday: Be back home by 11:15 pm at the latest.,49,7.5
4,"Friday: Take the train at 08:10 am, arrive at ...",54,11.1


In [74]:
# Add a 'no_of_tokens' column: the number of whitespace-separated tokens
# in each string of df['text']
df['no_of_tokens']=df['text'].str.split().str.len()
df.head()

Unnamed: 0,text,length,punct%,no_of_tokens
0,Monday: The doctor's appointment is at 2:45pm.,46,10.0,7
1,Tuesday: The dentist's appointment is at 11:30...,50,9.3,8
2,"Wednesday: At 7:00pm, there is a basketball game!",49,9.5,8
3,Thursday: Be back home by 11:15 pm at the latest.,49,7.5,10
4,"Friday: Take the train at 08:10 am, arrive at ...",54,11.1,10


In [75]:

# Boolean Series: True for entries whose text contains the substring 'appointment'
df['text'].str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [76]:
# Count the digit characters in each string (each digit is counted separately)
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [78]:

# find all occurrences of digits; each match is a single digit character
df['text'].str.findall(r'\d')

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [79]:
# replace weekdays with '???'
# regex=True is passed explicitly: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave every row unchanged here.
df['text'].str.replace(r'\w+day\b', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [None]:
# replace weekdays with 3-letter abbreviations


In [None]:
# create new columns from first match of extracted groups


In [None]:
# extract the entire time, the hours, the minutes, and the period


In [None]:
# extract the entire time, the hours, the minutes, and the period with group names
