In [1]:
# Import the dependencies
import pandas as pd
import re
# Allow up to 200 characters per column when pandas renders a frame,
# so the long comma-joined order strings are shown in full.
pd.set_option("display.max_colwidth", 200)

In [2]:
# Load the grocery orders workbook into a DataFrame and display it.
grocery_orders_df = pd.read_excel(io="../Resources/grocery_orders.xlsx")
grocery_orders_df

Unnamed: 0,"upc12,item_name,cases_ordered,last_ordered"
0,"#1576031803,Pasta - Fusilli,5,1622507126"
1,"#6013764351,Cheese - Cottage Cheese,2,1637781492"
2,"#6305148899,Beef - Sushi Flat Iron Steak,6,1612314977"
3,"#1307609509,Chicken - Ground,4,1607654887"
4,"#7807970757,Pasta - Penne Rigate,1,1635053801"
5,"#1607928386,Chicken - Wings,10,1616511242"
6,"#5746909635,Beef - Texas Style Burger,15,1632542090"
7,"#7750071232,Shrimp - Jumbo Gulf,12,1627642089"
8,"#3050673785,Cheese - Mozzarella,2,1629086003"
9,"#1947489471,Shrimp - Argentina Red,8,1627642089"


In [3]:
# Convert the DataFrame's column index to a plain Python list,
# then show its first (and here only) entry.
grocery_orders_cols = grocery_orders_df.columns.tolist()
grocery_orders_cols[0]

'upc12,item_name,cases_ordered,last_ordered'

In [4]:
# Break the single header string apart at every comma.
column_names = re.compile(',').split(grocery_orders_cols[0])
print(column_names)

['upc12', 'item_name', 'cases_ordered', 'last_ordered']


In [5]:
# Split the header on any non-word character using the special sequence `\W`.
# In this header the only non-word characters are the commas; underscores count
# as word characters, so names such as `item_name` stay intact.
# The pattern is a raw string so the backslash reaches the regex engine without
# Python flagging `\W` as an invalid string escape sequence.
column_names = re.split(r'\W', grocery_orders_cols[0])
print(column_names)

['upc12', 'item_name', 'cases_ordered', 'last_ordered']


In [6]:
# Split the header on any non-word character (here, the commas) or an underscore.
# The pattern is a raw string because `\W` is not a valid Python string escape;
# a plain literal triggers an invalid-escape warning.
column_names = re.split(r'\W|_', grocery_orders_cols[0])
print(column_names)

['upc12', 'item', 'name', 'cases', 'ordered', 'last', 'ordered']


In [7]:
# Get each row into a list (one inner list per DataFrame row).
grocery_orders_list = grocery_orders_df.values.tolist()

# Split the first row's string on every non-word character.
# Each non-word character is its own split point, so a leading "#" or a run
# such as " - " produces empty strings in the result.
# Raw string: `\W` is not a valid Python string escape.
first_values = re.split(r'\W', grocery_orders_list[0][0])
print(first_values)

['', '1576031803', 'Pasta', '', '', 'Fusilli', '5', '1622507126']


### Matching digit characters

In [8]:
# Collect every individual digit found in the first row.
numbers_only = re.findall(pattern='[0-9]', string=grocery_orders_list[0][0])
print(numbers_only)

['1', '5', '7', '6', '0', '3', '1', '8', '0', '3', '5', '1', '6', '2', '2', '5', '0', '7', '1', '2', '6']


In [9]:
# Find runs of one or more consecutive digits.
numbers_only = re.compile('[0-9]+').findall(grocery_orders_list[0][0])
print(numbers_only)

['1576031803', '5', '1622507126']


In [10]:
# Find groups of ten consecutive digits.
ten_digit_numbers = re.findall(pattern='[0-9]{10}', string=grocery_orders_list[0][0])
print(ten_digit_numbers)

['1576031803', '1622507126']


### Matching non-digit characters

In [11]:
# Collect every single character that is not a digit.
non_digit_only = re.compile('[^0-9]').findall(grocery_orders_list[0][0])
print(non_digit_only)

['#', ',', 'P', 'a', 's', 't', 'a', ' ', '-', ' ', 'F', 'u', 's', 'i', 'l', 'l', 'i', ',', ',']


In [12]:
# Find runs of one or more consecutive non-digit characters.
# Raw string keeps `\D` from being read as an (invalid) Python string escape.
non_digit_only = re.findall(r'\D+', grocery_orders_list[0][0])
non_digit_only

['#', ',Pasta - Fusilli,', ',']

In [13]:
# To keep a match from starting on the hash sign, put [^#] in front.
# Note that [^#] matches any single character other than '#', digits included,
# so each match now begins with the character just before the non-digit run.
# Raw string: `\D` is not a valid Python string escape.
non_digit_only = re.findall(r'[^#]\D+', grocery_orders_list[0][0])
non_digit_only

['3,Pasta - Fusilli,', '5,']

In [14]:
# To keep digits out of the first position, use [^\d].
# [^\d] is equivalent to \D, so the whole pattern needs at least two
# consecutive non-digit characters; lone "#" and "," no longer match.
# Raw string: `\d` and `\D` are not valid Python string escapes.
non_digit_only = re.findall(r'[^\d]\D+', grocery_orders_list[0][0])
non_digit_only

[',Pasta - Fusilli,']

In [15]:
# To avoid a leading comma as well as digits, start the match with [^\d,]
# (any single character that is neither a digit nor a comma).
# Raw string: `\d` and `\D` are not valid Python string escapes.
non_digit_only = re.findall(r'[^\d,]\D+', grocery_orders_list[0][0])
non_digit_only

['Pasta - Fusilli,']

In [16]:
# To avoid both the leading and the trailing comma (and digits), use [^\d,]
# before and after the \D+ so the match must start and end on a character
# that is neither a digit nor a comma.
# Raw string: `\d` and `\D` are not valid Python string escapes.
non_digit_only = re.findall(r'[^\d,]\D+[^\d,]', grocery_orders_list[0][0])
non_digit_only

['Pasta - Fusilli']

In [17]:
# Find text that starts and ends on a word character that is not a digit.
# [^\d\W] excludes digits and non-word characters, leaving letters and the
# underscore; the \D+ in the middle may still contain spaces and hyphens.
# Raw string: `\d`, `\W` and `\D` are not valid Python string escapes.
non_digit_only = re.findall(r'[^\d\W]\D+[^\d\W]', grocery_orders_list[0][0])
non_digit_only

['Pasta - Fusilli']

In [18]:
# Use a for loop to print the item name extracted from each order row.
# `[0]` takes the first match in the row; it raises IndexError if a row
# contains no match for the pattern.
# Raw string: `\d` and `\D` are not valid Python string escapes.
for item in grocery_orders_list:
    print(re.findall(r'[^\d,]\D+[^\d,]', item[0])[0])

Pasta - Fusilli
Cheese - Cottage Cheese
Beef - Sushi Flat Iron Steak
Chicken - Ground
Pasta - Penne Rigate
Chicken - Wings
Beef - Texas Style Burger
Shrimp - Jumbo Gulf
Cheese - Mozzarella
Shrimp - Argentina Red


In [19]:
# Use a list comprehension to collect the item name from every order row.
# `[0]` takes the first match in the row; it raises IndexError if a row
# contains no match for the pattern.
# Raw string: `\d` and `\D` are not valid Python string escapes.
ordered_items = [re.findall(r'[^\d,]\D+[^\d,]', item[0])[0] for item in grocery_orders_list]
ordered_items

['Pasta - Fusilli',
 'Cheese - Cottage Cheese',
 'Beef - Sushi Flat Iron Steak',
 'Chicken - Ground',
 'Pasta - Penne Rigate',
 'Chicken - Wings',
 'Beef - Texas Style Burger',
 'Shrimp - Jumbo Gulf',
 'Cheese - Mozzarella',
 'Shrimp - Argentina Red']