In [1]:
import pandas
import numpy 

import seaborn

"""Vectorized String Operations"""

"""Pandas string operations"""

"""Example vectorized for number"""
x = numpy.array([2, 3, 5, 6, 11, 13])
x * 2

array([ 4,  6, 10, 12, 22, 26])

In [2]:
"""But numpy don't support string"""
data = ["peter", "Paul", "MARY", "gUIDO"]
[string.capitalize() for string in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [3]:
"""Nhưng cách phía trên vừa chậm vừa yếu ớt"""
"""Đặc biệt với dữ liệu trong thế giới thực """
data = ["peter", "Paul", None, "MARY", "gUIDO"]
[string.capitalize() for string in data]



AttributeError: 'NoneType' object has no attribute 'capitalize'

In [4]:
"""Pandas có nhiều features để xử lý cả hai vấn đề trên: vectorized string operations và xử lý missing data"""
"""Thông qua thuộc tính .str của Series và Index objects của Pandas."""
names = pandas.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [5]:
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

In [6]:
monte = pandas.Series([
    'Graham Chapman', 'John Cleese', 'Terry Gilliam',
    'Eric Idle', 'Terry Jones', 'Michael Palin'     
])

monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [7]:
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [8]:
monte.str.startswith("T")

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [9]:
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

In [13]:
"""Methods sử dụng regular expressions"""
"""Mapping tới các method trong module re của Python"""

monte.str.extract("([A-Za-z]+) ([A-Za-z]+)")

Unnamed: 0,0,1
0,Graham,Chapman
1,John,Cleese
2,Terry,Gilliam
3,Eric,Idle
4,Terry,Jones
5,Michael,Palin


In [14]:
monte.str.findall(r"^[^AEIOU].*[^aeiou]$")

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

In [15]:

"""MISCELLANEOUS methods"""

"""Vectorized item access và slicing"""

monte.str[0: 3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [17]:
monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [18]:
"""Indicator variables"""

fullMonte = pandas.DataFrame({
    "name": monte,
    "info": ["B|C|D", "B|D", "A|C", "B|D", "B|C", "B|C|D"]
})

fullMonte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [19]:
fullMonte["info"].str.get_dummies("|")

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [20]:
"""Example: recipe database"""

try:
    recipes = pandas.read_json("recipeitems-latest.json")
except ValueError as exception:
    print("Value error: ", exception)

Value error:  Expected object or value


In [27]:
with open("20170107-061401-recipeitems.json", "r") as file:
    data = (line.strip() for line in file)
    dataJson = "[{}]".format(",".join(data))
    
recipes = pandas.read_json(dataJson)
recipes.shape

(173278, 17)

In [28]:
recipes.iloc[0]

_id                                {'$oid': '5160756b96cc62079cc2db15'}
name                                    Drop Biscuits and Sausage Gravy
ingredients           Biscuits\n3 cups All-purpose Flour\n2 Tablespo...
url                   http://thepioneerwoman.com/cooking/2013/03/dro...
image                 http://static.thepioneerwoman.com/cooking/file...
ts                                             {'$date': 1365276011104}
cookTime                                                          PT30M
source                                                  thepioneerwoman
recipeYield                                                          12
datePublished                                                2013-03-11
prepTime                                                          PT10M
description           Late Saturday afternoon, after Marlboro Man ha...
totalTime                                                           NaN
creator                                                         

In [29]:
recipes.ingredients.str.len().describe()

count    173278.000000
mean        244.617926
std         146.705285
min           0.000000
25%         147.000000
50%         221.000000
75%         314.000000
max        9067.000000
Name: ingredients, dtype: float64

In [30]:
recipes.name[numpy.argmax(recipes.ingredients.str.len())]

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return getattr(obj, method)(*args, **kwds)


'Carrot Pineapple Spice &amp; Brownie Layer Cake with Whipped Cream &amp; Cream Cheese Frosting and Marzipan Carrots'

In [31]:
recipes.description.str.contains("[Bb]reakfast").sum()

3524

In [32]:
recipes.ingredients.str.contains("[Cc]innamon").sum()

10526

In [33]:
"""Tạo một recipe recommender đơn giản"""
"""Cung cấp một danh sách ingredients, tìm một recipe sử dụng tất cả các ingredients ấy"""

spiceList = [
    'salt', 'pepper', 'oregano', 'sage', 'parsley', 'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin'
]
import re
spiceDataFrame = pandas.DataFrame(
    dict((spice, recipes.ingredients.str.contains(spice, re.IGNORECASE)) for spice in spiceList)
)
spiceDataFrame.head()

Unnamed: 0,salt,pepper,oregano,sage,parsley,rosemary,tarragon,thyme,paprika,cumin
0,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,True,True,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False


In [34]:
selection = spiceDataFrame.query("parsley & paprika & tarragon")
len(selection)

10

In [35]:
recipes.name[selection.index]

2069      All cremat with a Little Gem, dandelion and wa...
74964                         Lobster with Thermidor butter
93768      Burton's Southern Fried Chicken with White Gravy
113926                     Mijo's Slow Cooker Shredded Beef
137686                     Asparagus Soup with Poached Eggs
140530                                 Fried Oyster Po’boys
158475                Lamb shank tagine with herb tabbouleh
158486                 Southern fried chicken in buttermilk
163175            Fried Chicken Sliders with Pickles + Slaw
165243                        Bar Tartine Cauliflower Salad
Name: name, dtype: object