## 自由時報爬蟲

### 使用requests 抓取資料

In [38]:
import requests
res = requests.get('http://news.ltn.com.tw/list/breakingnews')

In [2]:
res

<Response [200]>

In [4]:
#res.text

### 使用BeautifulSoup 解析資料

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, 'html.parser')

In [7]:
news = soup.select('.imm li')

In [29]:
newsary = []
for li in news:
    #print(li)
    title = li.select_one('p').text.strip()
    time  = li.select_one('span').text.strip()
    link  = li.select_one('a').get('href')
    tags  = ','.join([tag.text.strip() for tag in li.select('.immtag')])
    res   = {'time':time, 'title':title, 'link':link, 'tags':tags}
    newsary.append(res)
    #print('=====================================')

### 使用Pandas 整理出結構化資料

In [31]:
import pandas
newsdf = pandas.DataFrame(newsary)

In [33]:
newsdf.head()

Unnamed: 0,link,tags,time,title
0,http://news.ltn.com.tw/news/world/breakingnews...,國際,09:20,傳統超商小心！ 亞馬遜無人商店開張了
1,http://news.ltn.com.tw/news/politics/breakingn...,政治,09:14,不選台北市長 蔣萬安、柯建銘隔空交火互薦讀「書」
2,http://news.ltn.com.tw/news/life/breakingnews/...,"生活,基隆市",09:12,天天吃好料》基隆「流氓滷味攤」 藝人謝昕回家必吃
3,http://sports.ltn.com.tw/news/breakingnews/231...,體育,09:12,羽球》最難打的對手！泰國一姐奪冠後謙虛讚美戴資穎
4,http://news.ltn.com.tw/news/politics/breakingn...,政治,09:11,中國直升機1小時襲台？美飛官：台灣比阿富汗戰區可怕！


In [36]:
newsdf.to_excel('news.xlsx')

## BeautifulSoup4 說明文檔
- https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
- https://www.crummy.com/software/BeautifulSoup/bs4/doc/

## 抓取證交所資料

### 測出目標伺服器容忍值(rate limit)
- https://www.nginx.com/blog/rate-limiting-nginx/

In [None]:
import requests
import time
for i in range(1,100):
    res = requests.get('http://www.tse.com.tw/exchangeReport/MI_5MINS_INDEX?response=json&date=&_=1516587389044')
    time.sleep(2)
    print(res)

### 不斷更換IP 位置 (使用Proxy)

In [47]:
import requests
proxies = {
    'http':'http://137.74.168.174:8080',
    'https':'https://137.74.168.174:8080'
}
res = requests.get('http://www.tse.com.tw/exchangeReport/MI_5MINS_INDEX?response=json&date=&_=1516587389044', proxies = proxies)
res

<Response [200]>

### 使用Tor 

## Numpy

In [50]:
# Python List
a = [1,2, 'hello', 4, 5]

In [53]:
for ele in a:
    if isinstance(ele, int):
        print(ele + 3)

4
5
7
8


In [54]:
# Numpy array
a = [1,2,3,4,5]

import numpy
na = numpy.array(a)

In [55]:
na

array([1, 2, 3, 4, 5])

In [57]:
na + 3

array([4, 5, 6, 7, 8])

In [58]:
na[0]

1

In [60]:
na[1:3]

array([2, 3])

In [62]:
na > 3

array([False, False, False,  True,  True], dtype=bool)

In [63]:
na[na > 3]

array([4, 5])

In [64]:
na = numpy.array([[1,2,3,4,5],[6,7,8,9,10]])

In [65]:
na

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [66]:
na[ 0 , 0 ]

1

In [68]:
na[ 0 , : ]

array([1, 2, 3, 4, 5])

In [69]:
na[ : ,  0 ]

array([1, 6])

In [70]:
na.T

array([[ 1,  6],
       [ 2,  7],
       [ 3,  8],
       [ 4,  9],
       [ 5, 10]])

In [72]:
na = numpy.array([['frank', 'M', 29], ['mary', 'F', 23], ['tom', 'M',
35], ['ted', 'M', 33], ['jean', 'F', 21], ['lisa', 'F', 20]])
na

array([['frank', 'M', '29'],
       ['mary', 'F', '23'],
       ['tom', 'M', '35'],
       ['ted', 'M', '33'],
       ['jean', 'F', '21'],
       ['lisa', 'F', '20']],
      dtype='<U5')

In [74]:
na = numpy.array([['name', 'gender', 'age'], ['frank', 'M', 29], ['mary', 'F', 23],
['tom', 'M', 35], ['ted', 'M', 33], ['jean', 'F', 21], ['lisa', 'F', 20]]) 

In [77]:
na[ 1: 7 ,  1 ]

array(['M', 'F', 'M', 'M', 'F', 'F'],
      dtype='<U6')

In [80]:
import pandas
df = pandas.DataFrame([['frank', 'M', 29], ['mary', 'F', 23],
['tom', 'M', 35], ['ted', 'M', 33], ['jean', 'F', 21], ['lisa', 'F', 20]])

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
0    6 non-null object
1    6 non-null object
2    6 non-null int64
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


In [83]:
df

Unnamed: 0,0,1,2
0,frank,M,29
1,mary,F,23
2,tom,M,35
3,ted,M,33
4,jean,F,21
5,lisa,F,20


In [85]:
df.columns

RangeIndex(start=0, stop=3, step=1)

In [86]:
df.columns = ['name', 'gender', 'age']

In [87]:
df

Unnamed: 0,name,gender,age
0,frank,M,29
1,mary,F,23
2,tom,M,35
3,ted,M,33
4,jean,F,21
5,lisa,F,20


## Pandas Series

In [88]:

s = pandas.Series([21000,18900,18000])

In [89]:
s

0    21000
1    18900
2    18000
dtype: int64

In [90]:
pandas.Series([21000,18900,18000], index=['Iphone','Edge S7','HTC M1'])

Iphone     21000
Edge S7    18900
HTC M1     18000
dtype: int64

In [92]:
s = pandas.Series({'Iphone':21000,'Edge S7':18900, 'HTC M1':18000})

In [93]:
s

Edge S7    18900
HTC M1     18000
Iphone     21000
dtype: int64

In [95]:
s[2]

21000

In [96]:
s[1:3]

HTC M1    18000
Iphone    21000
dtype: int64

In [98]:
s[s < 20000]

Edge S7    18900
HTC M1     18000
dtype: int64

In [99]:
cheap = s < 20000

In [100]:
s[cheap]

Edge S7    18900
HTC M1     18000
dtype: int64

In [102]:
s

Edge S7    18900
HTC M1     18000
Iphone     21000
dtype: int64

In [101]:
s[2]

21000

In [103]:
s['Iphone']

21000

In [104]:
s['Iphone'] = 20000

In [105]:
s

Edge S7    18900
HTC M1     18000
Iphone     20000
dtype: int64

In [109]:
s[s < 20000]  = s[s < 20000] * 0.8

In [110]:
s

Edge S7    15120
HTC M1     14400
Iphone     20000
dtype: int64

In [111]:
'Iphone' in s

True

In [112]:
'mi' in s

False

In [113]:
s * 0.8

Edge S7    12096.0
HTC M1     11520.0
Iphone     16000.0
dtype: float64

In [114]:
numpy.sqrt(s)

Edge S7    122.963409
HTC M1     120.000000
Iphone     141.421356
dtype: float64

In [115]:
s.max()

20000

In [116]:
s.min()

14400

In [117]:
s.mean()

16506.666666666668

In [118]:
s.describe()

count        3.000000
mean     16506.666667
std       3046.659373
min      14400.000000
25%      14760.000000
50%      15120.000000
75%      17560.000000
max      20000.000000
dtype: float64

In [119]:
s[['Iphone', 'HTC M1']]

Iphone    20000
HTC M1    14400
dtype: int64

In [120]:
s[['mi', 'HTC M1']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


mi            NaN
HTC M1    14400.0
dtype: float64

In [124]:
s2 = s[['Iphone', 'HTC M1']] + s[['mi', 'HTC M1']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


In [125]:
s2

HTC M1    28800.0
Iphone        NaN
mi            NaN
dtype: float64

In [127]:
s2.isnull()

HTC M1    False
Iphone     True
mi         True
dtype: bool

In [128]:
s2.notnull()

HTC M1     True
Iphone    False
mi        False
dtype: bool