## 處理欄位資料同時有文字與數字情況

In [1]:
import pandas
df = pandas.DataFrame([['mary', 30000], ['Joe', 'NULL']])

In [4]:
df.columns = ['name', 'salary']

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
name      2 non-null object
salary    2 non-null object
dtypes: object(2)
memory usage: 112.0+ bytes


In [6]:
df

Unnamed: 0,name,salary
0,mary,30000.0
1,Joe,


In [7]:
import numpy as np
df['salary'] = df['salary'].map(lambda e: np.nan if e == 'NULL' else e)

In [9]:
df['salary'].describe()

count        1.0
mean     30000.0
std          NaN
min      30000.0
25%      30000.0
50%      30000.0
75%      30000.0
max      30000.0
Name: salary, dtype: float64

## 正規表達法

In [10]:
a = 'my phone number is 0912345678'
'0912345678' in a

True

In [11]:
import re
s = 'w'
re.search('w', s)

<_sre.SRE_Match object; span=(0, 1), match='w'>

In [12]:
re.search('p', s)

### 比對[]任一字元

In [13]:
# [] => 比對[]任一字元
re.search('[abcdefghijklmnopqrstuvwxyz]', s)

<_sre.SRE_Match object; span=(0, 1), match='w'>

### 使用-表示連續

In [14]:
# -  => continuous
re.search('[a-z]', s)

<_sre.SRE_Match object; span=(0, 1), match='w'>

In [15]:
s = 'W'
re.search('[a-z]', s)

In [16]:
re.search('[a-zA-Z]', s)

<_sre.SRE_Match object; span=(0, 1), match='W'>

### 使用 \w 表示字串與數字

In [17]:
# \w => [a-zA-Z0-9]
re.search('\w', s)

<_sre.SRE_Match object; span=(0, 1), match='W'>

### 使用\d 表示數字

In [18]:
# \d => [0-9]
n = '5'
re.search('\d', s)

In [19]:
import re
s = 'apple'
re.search('\w', s)

<_sre.SRE_Match object; span=(0, 1), match='a'>

### 使用{} 比對多個字元

In [20]:
# {n} : 比對n 個字元
re.search('\w{5}', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

In [21]:
# {n,m} : 比對至少n 個字元, 至多m 個字元
re.search('\w{3,6}', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

In [22]:
# {0,} : 比對至少0 個以上字元
re.search('\w{0,}', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

In [23]:
# {1,} : 比對至少1 個以上字元
re.search('\w{1,}', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

### 使用 * 代表 比對0 個以上字元

In [24]:
# * => {0, }
re.search('\w*', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

### 使用 + 代表 比對1 個以上字元

In [25]:
# + => {1, }
re.search('\w+', s)

<_sre.SRE_Match object; span=(0, 5), match='apple'>

### 比對電話號碼

In [30]:
phones = ['0912345678','0912-345-678','0912-345678','091234567881903891089084908403']
for p in phones:
    #m = re.search('09\d{8}',p)
    #m = re.search('09\d{2}-{0,1}\d{3}-{0,1}\d{3}',p)
    
    #? => {0,1}
    #m = re.search('09\d{2}-?\d{3}-?\d{3}',p)
    
    # ^ 從頭開始比對, $ 確定以該樣式做結尾
    m = re.search('^09\d{2}-?\d{3}-?\d{3}$',p)
    print(m)

<_sre.SRE_Match object; span=(0, 10), match='0912345678'>
<_sre.SRE_Match object; span=(0, 12), match='0912-345-678'>
<_sre.SRE_Match object; span=(0, 11), match='0912-345678'>
None


## 字典 Dictionary

In [1]:
a = [50,90,60,70,52]
a[3] 

70

In [2]:
for idx, ele in enumerate(a):
    if ele == 70:
        print(idx)

3


In [3]:
dic = {'1': 50, '2':90, '3':60, '4': 70, '5':52}
dic.get('4')

70

In [7]:
qa = {
 '早安': '你也早安',
 '中午了':'要一起吃中餐嗎?',
 '再見': '掰掰'   
}

qa.get('再見')



'掰掰'

## BeautifulSoup 

In [8]:
page = '''

<html>
<head>

</head>


<body>

        <h1> Hello World</h1>
        <a href="https://www.largitdata.com"> LargitData</a>
        <a href="https://www.google.com"> Google</a>

</body>


</html>
'''

In [9]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page, 'lxml')

In [10]:
soup.select_one('h1')

<h1> Hello World</h1>

In [11]:
soup.select_one('h1').text

' Hello World'

In [12]:
soup.select_one('a')

<a href="https://www.largitdata.com"> LargitData</a>

In [13]:
soup.select('a')

[<a href="https://www.largitdata.com"> LargitData</a>,
 <a href="https://www.google.com"> Google</a>]

In [15]:
for link in soup.select('a'):
    print(link.get('href'))

https://www.largitdata.com
https://www.google.com
