# Getting started with regular expressions

## Splitting spaces

In [3]:
import re

# re.split(pattern, string) cuts the string at every place the pattern matches.
# \s matches a single whitespace character (space, tab, newline, ...) and + means
# "one or more", so r'\s+' treats any run of whitespace as ONE separator.
# The pattern is a raw string (r'...') so Python passes the backslash through to the
# regex engine instead of trying to interpret '\s' as a string escape.

a = "this \tis \tsome \ttext        with\n\n lots of spaces"
b = "             this line\t has some \t space in the front"
print("this is string a: ", a, "\n")
print("this is string b: ", b, "\n")

clean_a = re.split(r'\s+', a)
print("This is string a after splitting the spaces:\n", clean_a)

# b begins with whitespace, so the first separator sits at index 0 and re.split
# reports the (empty) text in front of it as ''.
clean_b = re.split(r'\s+', b)
print("\nNotice that string b has some space in the front. Notice that there is an empty string in the 0th index\n", clean_b)

# str.split() with no arguments also splits on runs of whitespace, but it ignores
# leading and trailing whitespace, so no empty string shows up.
cleaner_b = b.split()
print("\n\nJust using split with no arguments", cleaner_b)


this is string a:  this 	is 	some 	text        with

 lots of spaces 

this is string b:               this line	 has some 	 space in the front 

This is string a after splitting the spaces:
 ['this', 'is', 'some', 'text', 'with', 'lots', 'of', 'spaces']

Notice that string b has some space in the front. Notice that there is an empty string in the 0th index
 ['', 'this', 'line', 'has', 'some', 'space', 'in', 'the', 'front']


Just using split with no arguments ['this', 'line', 'has', 'some', 'space', 'in', 'the', 'front']


## Decimal numbers

In [5]:
one = "Some of the numbers of e are: 2.7182818284590452353 the first few of pi are: 3.1415926535"
print("This is string one: ", one, "\n")

# Regex patterns are raw strings (r'...') so that \d reaches the regex engine unchanged.
# Inside the ordinary message strings the backslashes are doubled, which prints the
# same text without relying on Python tolerating an invalid escape sequence.

# \d+ = one or more digits. The decimal point is not a digit, so every number is cut in two.
num_one = re.findall(r'\d+', one)
print("\nUsing the regex '\\d+' all the digits found in the string are put int a list.\nNotice that the decimal was also taken out. We have to be careful of this when\ndealing with regexes and decimal numbers\n", num_one)

# \d\.\d+ = exactly one digit, a literal dot, then one or more digits.
# Note: with a single \d in front, only ONE integer digit is captured ("12.5" would
# yield "2.5"); that is enough for the two constants in this string.
with_dec = re.findall(r'\d\.\d+', one)
print("\nUsing the regex '\\d\\.\\d+' we were able to get the correct values we wanted as you can see here:\n", with_dec)




This is string one:  Some of the numbers of e are: 2.7182818284590452353 the first few of pi are: 3.1415926535 


Using the regex '\d+' all the digits found in the string are put int a list.
Notice that the decimal was also taken out. We have to be careful of this when
dealing with regexes and decimal numbers
 ['2', '7182818284590452353', '3', '1415926535']

Using the regex '\d\.\d+' we were able to get the correct values we wanted as you can see here:
 ['2.7182818284590452353', '3.1415926535']


## Finding words

In [6]:
# Sample strings used by the searches below
pattern = "a a a a a b b b b c c c d d e"
letters1 = "The quick brown fox jumps over the lazy dog"
letters2 = "PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS."

# re.search stops at the first place the pattern matches and returns a match object;
# .group() gives back the matched text.
find_a = re.search('a', pattern)
print(
    "\nUsing the search function lets us find only the first occurence of a matching\n",
    find_a.group(),
)

# re.findall collects every non-overlapping match into a list.
find_all_a = re.findall('a+', pattern)
print(
    "\nUsing the findall function, wecan find all occurrences of the letter a\n",
    find_all_a,
)

# 'qui' followed by any two characters ('.' = any character, {2} = exactly twice).
find_some_a = re.search('qui.{2}', letters1)
print(
    "\nIn the string, find a part where it starts with 'qui'and give us the next two letters\n",
    find_some_a.group(),
)

# The inline flag (?i) switches the whole pattern to case-insensitive matching.
pattern2 = r'(?i)liquor'
y = re.search(pattern2, letters2)
print("\nYou can search without having to consider cases\n", y.group())

# Starts at the first 'q'; the greedy .* then stretches as far as it can while still
# leaving one more character and an 'x' to finish the match.
z = re.search(r'q.*.x', letters1)
print("\nFinding a string based on what we want in the middle\n", z.group())


Using the search function lets us find only the first occurence of a matching
 a

Using the findall function, wecan find all occurrences of the letter a
 ['a', 'a', 'a', 'a', 'a']

In the string, find a part where it starts with 'qui'and give us the next two letters
 quick

You can search without having to consider cases
 LIQUOR

Finding a string based on what we want in the middle
 quick brown fox


# More in-depth tutorial

In [9]:
# Code for getting data out of a Google Sheet (see the write-up at the following link)
# https://socraticowl.com/post/integrate-google-sheets-and-jupyter-notebooks/
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
    

In [13]:
# API scope the service account asks for when it authenticates.
scope = ['https://spreadsheets.google.com/feeds']

# Build the service-account credentials from the JSON key file next to the notebook.
# `scope` must be passed in here: it was previously defined but never used, which
# creates credentials with no scopes attached.
# The key file is not part of the notebook; until it exists at this path the call
# raises FileNotFoundError.
credentials = ServiceAccountCredentials.from_json_keyfile_name('./cs184_presentation.json', scope)

# Authorised gspread client used by the following cells to open the spreadsheet.
gc = gspread.authorize(credentials)

FileNotFoundError: [Errno 2] No such file or directory: './cs184_presentation.json'

In [None]:
# Open the workbook by its spreadsheet key (needs the authorised client `gc`).
spreadsheet_key = '155KiZXYhOaPJkmg-KdoYOW497xMX13itjix3rkESjkg'
book = gc.open_by_key(spreadsheet_key)

# Pull every cell of the two worksheets used in this tutorial.
worksheet = book.worksheet("Sheet1")
table = worksheet.get_all_values()

worksheet2 = book.worksheet("area_codes")
table2 = worksheet2.get_all_values()

# Turn each table into a DataFrame: the first row supplies the column names,
# every remaining row becomes a record.
df = pd.DataFrame(data=table[1:], columns=table[0])
df2 = pd.DataFrame(data=table2[1:], columns=table2[0])

# Show the main sheet (use df.head() instead to see only the first five rows).
df

# Cleaning values

In [None]:
# Don't forget to import re for regular expressions. It's kinda the point of this tutorial
import re

# First column: just a name.
# Column "Unclean values": the first character is an ASCII value between 33 and 126, so it is
#   still a readable character like "!@#$%^&*()<>", but it can also be a letter or a digit.

# Case 1: weird characters (luckily) only showed up at the beginning or end.
# We know the result should look like xxx.xxxxxx, the way floats look, so the characters we
# care about are the digits on either side of the decimal point.

one_way = []
two_way = []
three_way = []
four_way = []

# One or more digits, then a literal decimal point (the \ escapes the dot, which would
# otherwise match any character), then ZERO or more digits - the * also accepts "12.".
one = r'[0-9]+\.[0-9]*'

# The same pattern written with \d, the shorthand for a digit (for plain ASCII text this is
# the same as [0-9]). Again \. stands for a literal dot and \d* allows zero or more digits.
two = r'\d+\.\d*'

# ^ (the caret) anchors the match to the start of the string.
# \D is basically (not)\d, so it catches everything that isn't a digit: weird characters,
# punctuation, letters, etc. Together: one non-digit character at the very beginning.
three = r'^\D'

for row in df['Unclean values']:
    # search: find the first place in the row where pattern "one" matches. The result is a
    # match object whose 0th element is the matched text, or None when nothing matches - in
    # that case None is stored instead of crashing on None[0].
    # The value stays a string; wrap it in float() if a number is needed.
    first_match = re.search(one, row)
    one_way.append(first_match[0] if first_match else None)

    # findall: collect every match of pattern "two" in the row as a list of strings and keep
    # the 0th one. An empty list (no match) is stored as None instead of raising IndexError.
    all_matches = re.findall(two, row)
    two_way.append(all_matches[0] if all_matches else None)

    # sub(stitute): wherever pattern "three" matches, replace it with the empty string. That
    # is, if the first character is not a digit it is removed; otherwise the row is unchanged.
    three_way.append(re.sub(three, '', row))

    # split: cut the row where pattern "three" matches. This only works because we know how
    # the unclean value is formatted!
    # If the first character is not a digit, the list is ['', <value we want>]; if it is a
    # digit, nothing is split and the list is [<value we want>]. Either way the last element
    # (index -1) is the one to keep.
    four_way.append(re.split(three, row)[-1])

df["one_way"] = one_way
df["two_way"] = two_way
df["three_way"] = three_way
df["four_way"] = four_way

# three_way and four_way only remove a LEADING non-digit; a stray character at the end of a
# value is left in place, whereas one_way and two_way keep just the digits and the dot.
# NOTE(review): an earlier note here said one_way/two_way display fewer digits than
# three_way/four_way. That would be pandas shortening floats for display, but as written all
# four columns hold strings - confirm against the rendered table.

df

In [None]:
# Case 2: characters we don't want are intermingled with ones we do.
# We know that we only want English letters.

# Column "More unclean values": this column, unlike the one before, has letters and weird
#   characters mixed in between. The weird characters are random unicode values, so they may
#   be what we want but also not.

# One English letter, upper or lower case.
zulu = r'[a-zA-Z]'
# One character that is NOT an English letter. The upper-case range has to end at capital Z:
# A-z would also cover the ASCII symbols [ \ ] ^ _ ` that sit between 'Z' and 'a', and those
# symbols would then survive the cleaning.
yankee = r'[^a-zA-Z]'

# One result list per technique, in the order: search, findall, sub, split

zu = []
ya = []
xr = []
wh = []

for row in df['More unclean values']:

    # search: fine in the last example because only one thing had to be found, but here there
    # are many letters and search returns just the first one. Rows without any letter give
    # None rather than crashing on None[0].
    first_letter = re.search(zulu, row)
    zu.append(first_letter[0] if first_letter else None)

    # findall: get every letter that matches zulu as a list of one-character strings, glue
    # them back together with '' and append the result to the ya list.
    yan = ''.join(re.findall(zulu, row))
    ya.append(yan)

    # sub: like in the previous example, replace everything we don't want (any non-letter)
    # with the empty string.
    xra = re.sub(yankee, '', row)
    xr.append(xra)

    # split: cutting at every non-letter leaves a list of letter runs (and empty strings), so
    # unlike the previous example the pieces have to be joined back together with nothing in
    # between before appending.
    whi = ''.join(re.split(yankee, row))
    wh.append(whi)

# The new columns are named after the technique order above, not after the regex used:
# "yankee" holds the findall results, which were produced with the zulu pattern.
df["zulu"] = zu
df["yankee"] = ya
df["xray"] = xr
df["whisky"] = wh
df[['Name', 'More unclean values', 'zulu', 'yankee', 'xray', 'whisky']]

In [None]:
# Column "phone numbers": a handful of hand-picked area codes with some numbers slapped on.

# ( and ) normally open and close a regex group, so they are escaped with \ to match the
# literal parentheses; {3} asks for exactly three digits between them.
find_area_code = r'\([0-9]{3}\)'

area_code = []
for row in df['phone numbers']:
    # First "(ddd)" found in the number; drop the surrounding parentheses and store it as int.
    string_val = re.findall(find_area_code, row)[0]
    digits_only = string_val.strip('()')
    area_code.append(int(digits_only))

print(area_code)
df["area code"] = area_code
df.loc[:, ['Name', 'phone numbers', 'area code']]

In [None]:
# Preview the raw emoji column next to the names before trying to match it.
df.loc[:, ['Name', 'emojis']]

In [None]:
# Alternatives in the emoticon pattern, tried left to right:
#   \(+[^a-zA-Z]+\)+       parenthesised face with no English letters inside, e.g. (^_^)
#   \:\)  \:\(             the plain smileys :) and :(
#   ORZ  OTZ  XD  OAO      letter emoticons (upper case only)
#   \(+[^a-zA-Z].*ω.*\)+   parenthesised face that starts with a non-letter and contains ω;
#                          the last two alternatives do the same for ᴗ and ᵕ
# The letter class has to end at capital Z. Written as A-z it also covers the ASCII symbols
# [ \ ] ^ _ ` between 'Z' and 'a', which wrongly rejects faces built from ^ or _.
find_emojis = r'\(+[^a-zA-Z]+\)+|\:\)|\:\(|ORZ|OTZ|XD|OAO|\(+[^a-zA-Z].*ω.*\)+|\(+[^a-zA-Z].*ᴗ.*\)+|\(+[^a-zA-Z].*ᵕ.*\)+'
is_emoji = []
emoji_is = []

for row in df['emojis']:
    # Every emoticon found in the row; an empty list means nothing matched.
    emoj = re.findall(find_emojis, row)
    if emoj:
        # The flag is stored as the string "true"/"false", not as a bool.
        is_emoji.append("true")
        emoji_is.append(emoj)
    else:
        is_emoji.append("false")
        emoji_is.append("couldn't find")
df["is_emoji"] = is_emoji
df["emoji_is"] = emoji_is
df[['Name','emojis','is_emoji', 'emoji_is']]