# Pandas

- 命名的起源: 三個新的資料結構
    - `Panel`
    - `DataFrame`
    - `Series`
    
<https://github.com/pandas-dev/pandas>

> Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more.

- Installation

```bash
# run in command line
pip install pandas
```
- Import

```python
import pandas as pd
```

- pandas 在資料科學的應用
    - 提供 `DataFrame` 資料結構
    - 解決表格式資料的載入, 處理與分析
        - `.txt`, `.csv`
        - `.json` Array of Objects
        - SQL 資料庫表格
    - 視覺化: `matplotlib.pyplot` 的 high-level API

In [1]:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://tw.stock.yahoo.com/d/i/rank.php?t=pri&e=tse&n=100")
html_str = r.text
soup = BeautifulSoup(html_str)
table_contents = soup.find_all("td")[2].find_all("td")[3:]
n_data = len(table_contents)
ticker_names = [table_contents[i].text for i in range(n_data) if i % 10 == 1]
prices = [float(table_contents[i].text) for i in range(n_data) if i % 10 == 2]
volumes = [int(table_contents[i].text.replace(",", "")) for i in range(n_data) if i % 10 == 8]
mkt_values = [float(table_contents[i].text)*100000000 for i in range(n_data) if i % 10 == 9]

In [2]:
import pandas as pd 

stock_df = pd.DataFrame()

In [3]:
print(type(stock_df))

<class 'pandas.core.frame.DataFrame'>


In [4]:
stock_df["ticker_name"] = ticker_names
stock_df["price"] = prices
stock_df["volume"] = volumes
stock_df["mkt_value"] = mkt_values

In [5]:
stock_df[stock_df["ticker_name"].str.contains("KY")]

Unnamed: 0,ticker_name,price,volume,mkt_value
4,6415 矽力-KY,466.0,236,110510000.0
9,1590 亞德客-KY,336.0,911,306650000.0
14,4137 麗豐-KY,275.0,578,160080000.0
26,6452 康友-KY,228.5,365,83680000.0
35,3665 貿聯-KY,197.0,715,140870000.0
36,2723 美食-KY,186.0,117,21710000.0
38,6666 羅麗芬-KY,180.5,105,19230000.0
40,1256 鮮活果汁-KY,175.5,4,700000.0
53,4763 材料-KY,150.0,248,37340000.0
56,5288 豐祥-KY,143.5,131,18760000.0


In [6]:
stock_df[~stock_df["ticker_name"].str.contains("KY")]

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,328,1.341500e+09
1,6409 旭隼,607.0,3,1.820000e+06
2,2207 和泰車,495.5,635,3.170800e+08
3,5269 祥碩,470.5,577,2.706900e+08
5,1476 儒鴻,425.0,380,1.603900e+08
6,3563 牧德,372.5,635,2.381800e+08
7,6669 緯穎,371.5,2668,9.857500e+08
8,3406 玉晶光,365.0,11221,4.131240e+09
10,2912 統一超,309.5,807,2.502400e+08
11,2059 川湖,309.0,71,2.194000e+07


In [7]:
stock_df.head()

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,328,1341500000.0
1,6409 旭隼,607.0,3,1820000.0
2,2207 和泰車,495.5,635,317080000.0
3,5269 祥碩,470.5,577,270690000.0
4,6415 矽力-KY,466.0,236,110510000.0


In [8]:
stock_df[["ticker_name","volume"]].head()

Unnamed: 0,ticker_name,volume
0,3008 大立光,328
1,6409 旭隼,3
2,2207 和泰車,635
3,5269 祥碩,577
4,6415 矽力-KY,236


In [9]:
stock_df.sort_values("volume")

Unnamed: 0,ticker_name,price,volume,mkt_value
42,3130 一零四,170.0,1,1.700000e+05
1,6409 旭隼,607.0,3,1.820000e+06
40,1256 鮮活果汁-KY,175.5,4,7.000000e+05
45,6504 南六,164.0,6,9.900000e+05
64,8480 泰昇-KY,129.0,13,1.670000e+06
17,8454 富邦媒,262.5,16,4.200000e+06
47,1537 廣隆,163.0,19,3.110000e+06
61,1723 中碳,133.5,19,2.530000e+06
43,8422 可寧衛,168.5,20,3.370000e+06
94,6581 鋼聯,96.3,22,2.120000e+06


In [10]:
type(stock_df["ticker_name"]) #往下break down一層

pandas.core.series.Series

In [11]:
stock_df["volume"].index

RangeIndex(start=0, stop=100, step=1)

In [12]:
list(stock_df["volume"].index)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

In [13]:
stock_df["volume"].values

array([  328,     3,   635,   577,   236,   380,   635,  2668, 11221,
         911,   807,    71,  2802,   668,   578,   285,   257,    16,
        3957,   581,  1449,   218, 23952,   792,   298,   449,   365,
         965,  2151,   592,   909,   270,  4970,   617,  2758,   715,
         117,   565,   105,  9253,     4,   953,     1,    20,  5655,
           6,  1869,    19,   144,   304,    72,   155,  2503,   248,
         120,  5316,   131,   166,  7349,  4157,   126,    19,   248,
        4425,    13,  5622,  2222,    23,  2549,  1143,   111,   635,
        1166,   108,    30,    28,  2863,  2260,   185,  2906,  5822,
        2412,   517,   359,  1872,    39,   120,   633,   181,    61,
         515,   744,  2612,  1836,    22,   128,   466,    57,  5899,
          35], dtype=int64)

In [14]:
type(stock_df["volume"].values)

numpy.ndarray

In [15]:
stock_df.index

RangeIndex(start=0, stop=100, step=1)

In [16]:
stock_df.columns

Index(['ticker_name', 'price', 'volume', 'mkt_value'], dtype='object')

In [17]:
stock_df.dtypes

ticker_name     object
price          float64
volume           int64
mkt_value      float64
dtype: object

## 手動創建 DataFrame

- `pd.Series()`
- `pd.DataFrame()`

In [18]:
import pandas as pd

In [19]:
cities = ["Taipei", "New York", "London", "Reykjavik", "Tokyo", "Honolulu"]
current_temps = [26, 19, 9, 0, 27, 24]
city_ser = pd.Series(cities)
print(city_ser)

0       Taipei
1     New York
2       London
3    Reykjavik
4        Tokyo
5     Honolulu
dtype: object


In [20]:
city_ser[5]

'Honolulu'

In [21]:
city_ser[::2]

0    Taipei
2    London
4     Tokyo
dtype: object

In [22]:
city_ser[city_ser == "Taipei"]

0    Taipei
dtype: object

In [23]:
city_ser[[1,5]]

1    New York
5    Honolulu
dtype: object

In [24]:
temp_ser = pd.Series(current_temps)
temp_ser.index = cities
temp_ser

Taipei       26
New York     19
London        9
Reykjavik     0
Tokyo        27
Honolulu     24
dtype: int64

In [25]:
temp_ser["New York"]

19

In [26]:
temp_ser[temp_ser == temp_ser.min()].index[0]

'Reykjavik'

In [27]:
temp_ser.idxmin()

'Reykjavik'

In [28]:
temp_ser.idxmax()

'Tokyo'

In [29]:
temp_ser[["New York", "Honolulu"]]

New York    19
Honolulu    24
dtype: int64

In [30]:
import pandas as pd

cities = ["Taipei", "New York", "London", "Reykjavik", "Tokyo", "Honolulu"]
current_temps = [26, 19, 9, 0, 27, 24]
countries = ["Taiwan", "United States", "United Kingdom", "Iceland", "Japan", "United States"]
continents = ["Asia", "North America", "Europe", "Europe", "Japan", "North America"]
city_dict = {
    "city": cities,
    "current_temp": current_temps,
    "country": countries,
    "continent": continents
}
city_df = pd.DataFrame(city_dict, columns=['current_temp', 'continent', 'country', 'city'])
city_df # 不建議把 index 改成從 1 開始編碼

Unnamed: 0,current_temp,continent,country,city
0,26,Asia,Taiwan,Taipei
1,19,North America,United States,New York
2,9,Europe,United Kingdom,London
3,0,Europe,Iceland,Reykjavik
4,27,Japan,Japan,Tokyo
5,24,North America,United States,Honolulu


In [31]:
city_df.set_index("city")

Unnamed: 0_level_0,current_temp,continent,country
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Taipei,26,Asia,Taiwan
New York,19,North America,United States
London,9,Europe,United Kingdom
Reykjavik,0,Europe,Iceland
Tokyo,27,Japan,Japan
Honolulu,24,North America,United States


In [32]:
city_df = city_df.set_index("city") #要存起來，不然上面的結果就只是 return 結果

In [33]:
city_df

Unnamed: 0_level_0,current_temp,continent,country
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Taipei,26,Asia,Taiwan
New York,19,North America,United States
London,9,Europe,United Kingdom
Reykjavik,0,Europe,Iceland
Tokyo,27,Japan,Japan
Honolulu,24,North America,United States


In [34]:
city_df["current_temp"].idxmax()

'Tokyo'

In [35]:
city_df["current_temp"].idxmin()

'Reykjavik'

In [36]:
city_df["current_temp"] * (9/5) + 32

city
Taipei       78.8
New York     66.2
London       48.2
Reykjavik    32.0
Tokyo        80.6
Honolulu     75.2
Name: current_temp, dtype: float64

## 寫入 .csv 檔
- 從連結
- 從本機: 如果跟 .py 檔在同一資料夾，就不用設定位址

In [37]:
xlsx_url = "https://storage.googleapis.com/ds_data_import/fav_nba_teams.xlsx"
excel_df = pd.read_excel(xlsx_url)
excel_df.head()

Unnamed: 0,No.,Player,Pos,Ht,Wt,Birth Date,College
0,0,Randy Brown,PG,6-2,190,"May 22, 1968","University of Houston, New Mexico State Univer..."
1,30,Jud Buechler,SF,6-6,220,"June 19, 1968",University of Arizona
2,35,Jason Caffey,PF,6-8,255,"June 12, 1973",University of Alabama
3,53,James Edwards,C,7-0,225,"November 22, 1955",University of Washington
4,54,Jack Haley,C,6-10,240,"January 27, 1964","University of California, Los Angeles"


In [38]:
boston = pd.read_excel(xlsx_url, sheet_name=1) #sheet_name 預設為 0
# boston = pd.read_excel(xlsx_url, sheet_name="boston_celtics_2007_2008")
boston

Unnamed: 0,No.,Player,Pos,Ht,Wt,Birth Date,College
0,20,Ray Allen,SG,6-5,205,"July 20, 1975",University of Connecticut
1,42,Tony Allen,SG,6-4,213,"January 11, 1982","Butler County Community College, Oklahoma Stat..."
2,93,P.J. Brown,PF,6-11,225,"October 14, 1969",Louisiana Tech University
3,28,Sam Cassell,PG,6-3,185,"November 18, 1969",Florida State University
4,11,Glen Davis,C,6-9,289,"January 1, 1986",Louisiana State University
5,5,Kevin Garnett,PF,6-11,240,"May 19, 1976",
6,50,Eddie House,PG,6-1,180,"May 14, 1978",Arizona State University
7,43,Kendrick Perkins,C,6-10,270,"November 10, 1984",
8,34,Paul Pierce,SF,6-7,235,"October 13, 1977",University of Kansas
9,66,Scot Pollard,C,6-11,265,"February 12, 1975",University of Kansas


## 基本的 DataFrame 方法、屬性與操作

- 方法 / 屬性
    - `.head()`
    - `.tail()`
    - `.shape`
    - `.dtypes`
    - `.info()`
    - `.index`
    - `.columns`
- 操作
    - filter
    - select
    - arrange
    - mutate
    - summarise
    - groupby

In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

r = requests.get("https://tw.stock.yahoo.com/d/i/rank.php?t=pri&e=tse&n=100")
html_str = r.text
soup = BeautifulSoup(html_str)
table_contents = soup.find_all("td")[2].find_all("td")[3:]
n_data = len(table_contents)
ticker_names = [table_contents[i].text for i in range(n_data) if i % 10 == 1]
prices = [float(table_contents[i].text) for i in range(n_data) if i % 10 == 2]
volumes = [int(table_contents[i].text.replace(",", "")) for i in range(n_data) if i % 10 == 8]
mkt_values = [float(table_contents[i].text)*100000000 for i in range(n_data) if i % 10 == 9]
stock_df = pd.DataFrame()
stock_df["ticker_name"] = ticker_names
stock_df["price"] = prices
stock_df["volume"] = volumes
stock_df["mkt_value"] = mkt_values

In [40]:
stock_df.head()

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1374160000.0
1,6409 旭隼,605.0,4,2430000.0
2,2207 和泰車,495.5,643,321040000.0
3,5269 祥碩,472.5,584,273990000.0
4,6415 矽力-KY,463.5,237,110970000.0


In [41]:
stock_df.tail()

Unnamed: 0,ticker_name,price,volume,mkt_value
95,6581 鋼聯,96.0,23,2210000.0
96,3044 健鼎,95.3,478,45440000.0
97,3617 碩天,94.8,65,6190000.0
98,6213 聯茂,94.2,5987,563260000.0
99,2373 震旦行,94.1,35,3300000.0


In [42]:
stock_df.shape

(100, 4)

In [43]:
stock_df.dtypes

ticker_name     object
price          float64
volume           int64
mkt_value      float64
dtype: object

In [44]:
stock_df["price"].values.dtype

dtype('float64')

In [45]:
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
ticker_name    100 non-null object
price          100 non-null float64
volume         100 non-null int64
mkt_value      100 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.2+ KB


In [46]:
# filter - WHERE
stock_df.head()

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1374160000.0
1,6409 旭隼,605.0,4,2430000.0
2,2207 和泰車,495.5,643,321040000.0
3,5269 祥碩,472.5,584,273990000.0
4,6415 矽力-KY,463.5,237,110970000.0


In [47]:
stock_df.loc[0,"ticker_name"] # 取出的是 string

'3008 大立光'

In [48]:
stock_df.loc[[0,2,4],"ticker_name"] # 取出的是 Series

0      3008 大立光
2      2207 和泰車
4    6415 矽力-KY
Name: ticker_name, dtype: object

In [49]:
stock_df.loc[[0, 2, 4], ["ticker_name"]]

Unnamed: 0,ticker_name
0,3008 大立光
2,2207 和泰車
4,6415 矽力-KY


In [50]:
stock_df.loc[[0, 1, 4], :]
#stock_df.loc[[0, 1, 4], ] 也可以

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1374160000.0
1,6409 旭隼,605.0,4,2430000.0
4,6415 矽力-KY,463.5,237,110970000.0


In [51]:
stock_df[stock_df["price"] > 300]

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1374160000.0
1,6409 旭隼,605.0,4,2430000.0
2,2207 和泰車,495.5,643,321040000.0
3,5269 祥碩,472.5,584,273990000.0
4,6415 矽力-KY,463.5,237,110970000.0
5,1476 儒鴻,423.5,390,164630000.0
6,3563 牧德,374.0,638,239300000.0
7,6669 緯穎,370.5,2731,1009130000.0
8,3406 玉晶光,363.5,11352,4178900000.0
9,1590 亞德客-KY,337.0,925,311370000.0


In [52]:
stock_df[(stock_df["price"] > 300) | (stock_df["price"] < 100)]
# stock_df["price"] > 300 外面要小括號，運算符的順序才會是對的

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1374160000.0
1,6409 旭隼,605.0,4,2430000.0
2,2207 和泰車,495.5,643,321040000.0
3,5269 祥碩,472.5,584,273990000.0
4,6415 矽力-KY,463.5,237,110970000.0
5,1476 儒鴻,423.5,390,164630000.0
6,3563 牧德,374.0,638,239300000.0
7,6669 緯穎,370.5,2731,1009130000.0
8,3406 玉晶光,363.5,11352,4178900000.0
9,1590 亞德客-KY,337.0,925,311370000.0


In [53]:
# select: SELECT
stock_df[["ticker_name", "price", "volume"]]

Unnamed: 0,ticker_name,price,volume
0,3008 大立光,4085.0,336
1,6409 旭隼,605.0,4
2,2207 和泰車,495.5,643
3,5269 祥碩,472.5,584
4,6415 矽力-KY,463.5,237
5,1476 儒鴻,423.5,390
6,3563 牧德,374.0,638
7,6669 緯穎,370.5,2731
8,3406 玉晶光,363.5,11352
9,1590 亞德客-KY,337.0,925


In [54]:
# filter + select: SELECT ... WHERE
stock_df[stock_df["price"] > 300]["ticker_name"]

0        3008 大立光
1         6409 旭隼
2        2207 和泰車
3         5269 祥碩
4      6415 矽力-KY
5         1476 儒鴻
6         3563 牧德
7         6669 緯穎
8        3406 玉晶光
9     1590 亞德客-KY
10        2059 川湖
11       2912 統一超
Name: ticker_name, dtype: object

In [55]:
# arrange: ORDER BY
# sort_values
stock_df.sort_values(["price", "volume"], ascending= [False, False]) # 默認是 ASC 升冪

Unnamed: 0,ticker_name,price,volume,mkt_value
0,3008 大立光,4085.0,336,1.374160e+09
1,6409 旭隼,605.0,4,2.430000e+06
2,2207 和泰車,495.5,643,3.210400e+08
3,5269 祥碩,472.5,584,2.739900e+08
4,6415 矽力-KY,463.5,237,1.109700e+08
5,1476 儒鴻,423.5,390,1.646300e+08
6,3563 牧德,374.0,638,2.393000e+08
7,6669 緯穎,370.5,2731,1.009130e+09
8,3406 玉晶光,363.5,11352,4.178900e+09
9,1590 亞德客-KY,337.0,925,3.113700e+08


In [56]:
# sort_index
stock_df.sort_index(ascending=False).head()

Unnamed: 0,ticker_name,price,volume,mkt_value
99,2373 震旦行,94.1,35,3300000.0
98,6213 聯茂,94.2,5987,563260000.0
97,3617 碩天,94.8,65,6190000.0
96,3044 健鼎,95.3,478,45440000.0
95,6581 鋼聯,96.0,23,2210000.0


In [57]:
# mutate: SELECT ... AS new_column
stock_df["ticker"] = stock_df["ticker_name"].str.split(expand=True)[0]
stock_df["company_name"] = stock_df["ticker_name"].str.split(expand=True)[1]
# Series.str.split(pat=None, n=-1, expand=False)
stock_df.head()

Unnamed: 0,ticker_name,price,volume,mkt_value,ticker,company_name
0,3008 大立光,4085.0,336,1374160000.0,3008,大立光
1,6409 旭隼,605.0,4,2430000.0,6409,旭隼
2,2207 和泰車,495.5,643,321040000.0,2207,和泰車
3,5269 祥碩,472.5,584,273990000.0,5269,祥碩
4,6415 矽力-KY,463.5,237,110970000.0,6415,矽力-KY


In [58]:
stock_df.dtypes

ticker_name      object
price           float64
volume            int64
mkt_value       float64
ticker           object
company_name     object
dtype: object

In [59]:
stock_df = stock_df[["ticker", "company_name", "price", "volume", "mkt_value"]]
stock_df.head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value
0,3008,大立光,4085.0,336,1374160000.0
1,6409,旭隼,605.0,4,2430000.0
2,2207,和泰車,495.5,643,321040000.0
3,5269,祥碩,472.5,584,273990000.0
4,6415,矽力-KY,463.5,237,110970000.0


In [60]:
stock_df.drop("company_name",axis = 1).head()

Unnamed: 0,ticker,price,volume,mkt_value
0,3008,4085.0,336,1374160000.0
1,6409,605.0,4,2430000.0
2,2207,495.5,643,321040000.0
3,5269,472.5,584,273990000.0
4,6415,463.5,237,110970000.0


In [61]:
stock_df.head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value
0,3008,大立光,4085.0,336,1374160000.0
1,6409,旭隼,605.0,4,2430000.0
2,2207,和泰車,495.5,643,321040000.0
3,5269,祥碩,472.5,584,273990000.0
4,6415,矽力-KY,463.5,237,110970000.0


In [62]:
stock_df["listed_in_ky"] = stock_df["company_name"].str.contains("KY")
stock_df.head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
4,6415,矽力-KY,463.5,237,110970000.0,True


In [63]:
stock_df.sort_values(["listed_in_ky", "price"], ascending = [True, False]).head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
5,1476,儒鴻,423.5,390,164630000.0,False


In [64]:
# 解決跳號問題，並把舊的 index 刪掉
stock_df.sort_values(["listed_in_ky", "price"], ascending = [True, False]).reset_index(drop = True).head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
4,1476,儒鴻,423.5,390,164630000.0,False


In [65]:
new_stock_df = stock_df.sort_values(["listed_in_ky", "price"], ascending = [True, False])
new_stock_df.head()

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
5,1476,儒鴻,423.5,390,164630000.0,False


In [66]:
new_stock_df.loc[range(5), ]

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
4,6415,矽力-KY,463.5,237,110970000.0,True


In [67]:
new_stock_df.iloc[range(5), ]

Unnamed: 0,ticker,company_name,price,volume,mkt_value,listed_in_ky
0,3008,大立光,4085.0,336,1374160000.0,False
1,6409,旭隼,605.0,4,2430000.0,False
2,2207,和泰車,495.5,643,321040000.0,False
3,5269,祥碩,472.5,584,273990000.0,False
5,1476,儒鴻,423.5,390,164630000.0,False


In [68]:
new_stock_df.head()[["company_name", "price"]]

Unnamed: 0,company_name,price
0,大立光,4085.0
1,旭隼,605.0
2,和泰車,495.5
3,祥碩,472.5
5,儒鴻,423.5


In [69]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

r = requests.get("https://tw.stock.yahoo.com/d/i/rank.php?t=pri&e=tse&n=100")
html_str = r.text
soup = BeautifulSoup(html_str)
table_contents = soup.find_all("td")[2].find_all("td")[3:]
n_data = len(table_contents)
ticker_names = [table_contents[i].text for i in range(n_data) if i % 10 == 1]
prices = [float(table_contents[i].text) for i in range(n_data) if i % 10 == 2]
volumes = [int(table_contents[i].text.replace(",", "")) for i in range(n_data) if i % 10 == 8]
mkt_values = [float(table_contents[i].text)*100000000 for i in range(n_data) if i % 10 == 9]
stock_df = pd.DataFrame()
stock_df["ticker_name"] = ticker_names
stock_df["price"] = prices
stock_df["volume"] = volumes
stock_df["mkt_value"] = mkt_values
stock_df["ticker"] = stock_df["ticker_name"].str.split(expand=True)[0]
stock_df["company_name"] = stock_df["ticker_name"].str.split(expand=True)[1]
stock_df["listed_in_ky"] = stock_df["company_name"].str.contains("KY")
stock_df = stock_df[["ticker", "company_name", "listed_in_ky", "price", "volume", "mkt_value"]]
stock_df.head()

Unnamed: 0,ticker,company_name,listed_in_ky,price,volume,mkt_value
0,3008,大立光,False,4085.0,336,1374160000.0
1,6409,旭隼,False,605.0,4,2430000.0
2,2207,和泰車,False,495.5,643,321040000.0
3,5269,祥碩,False,472.5,584,273990000.0
4,6415,矽力-KY,True,463.5,237,110970000.0


In [70]:
# summarise: SUM(), AVG(), DISTINCT(), COUNT(), MAX(), MIN()...
print(stock_df["price"].max())
print(stock_df["price"].idxmin())
print(stock_df["mkt_value"].max())
print(stock_df["mkt_value"].idxmax())
print(stock_df.loc[6, :])

4085.0
99
5797360000.0
22
ticker               3563
company_name           牧德
listed_in_ky        False
price                 374
volume                638
mkt_value       2.393e+08
Name: 6, dtype: object


In [71]:
stock_df["listed_in_ky"].unique()

array([False,  True])

In [72]:
stock_df["price"].unique()

array([4085. ,  605. ,  495.5,  472.5,  463.5,  423.5,  374. ,  370.5,
        363.5,  337. ,  309. ,  308. ,  288.5,  280. ,  275.5,  275. ,
        264.5,  262.5,  256.5,  254. ,  248. ,  245. ,  240. ,  233.5,
        231. ,  229.5,  228.5,  221. ,  218.5,  218. ,  217.5,  213.5,
        210. ,  207.5,  197. ,  186. ,  181.5,  180.5,  180. ,  175.5,
        171.5,  170. ,  168.5,  166.5,  164. ,  163. ,  162.5,  162. ,
        161.5,  161. ,  159.5,  157. ,  150.5,  148.5,  147.5,  143.5,
        143. ,  141.5,  140. ,  133. ,  131. ,  129.5,  129. ,  123. ,
        120.5,  119.5,  117. ,  115.5,  115. ,  114. ,  113.5,  111.5,
        110. ,  109. ,  107.5,  105.5,  105. ,  104.5,  103. ,  102.5,
        100.5,  100. ,   97.8,   97.6,   96.6,   96.3,   96.2,   96. ,
         95.3,   94.8,   94.2,   94.1])

In [73]:
stock_df["price"].unique().size

92

In [74]:
stock_df[stock_df["listed_in_ky"]].shape[0]

20

In [75]:
stock_df[~stock_df["listed_in_ky"]].shape[0]

80

In [76]:
# group by: GROUP BY
stock_df.groupby("listed_in_ky")["company_name"].count()

listed_in_ky
False    80
True     20
Name: company_name, dtype: int64

In [77]:
stock_df.groupby("listed_in_ky")["price"].max()

listed_in_ky
False    4085.0
True      463.5
Name: price, dtype: float64

In [78]:
stock_df.groupby("listed_in_ky")["price"].min()

listed_in_ky
False    94.1
True     96.2
Name: price, dtype: float64

In [79]:
# 移除重複值
rgbs = ["Blue", "Blue", "Red", "Red", "Green", "Green"]
set(rgbs)

{'Blue', 'Green', 'Red'}

In [80]:
import numpy as np

rgb_arr = np.array(rgbs)
np.unique(rgb_arr)

array(['Blue', 'Green', 'Red'], dtype='<U5')