# Pandas tutorial for Asia University

## Pandas introduction
- 使用Python 進行資料分析中幾乎必備的套件
- 提供DataFrame資料格式，可視化與操作簡易度高
- 用於資料處理、簡易統計分析與作圖

In [1]:
# 載入套件
import pandas as pd

## 建立/讀取資料

### 1. 自行建立DataFrame資料

In [2]:
data_dict = {'ID':[1,2,3], 'Name':['A','B','C']}

dat = pd.DataFrame(data_dict)
print(type(dat))
dat

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,ID,Name
0,1,A
1,2,B
2,3,C


In [3]:
data_list = [[1,'A'],[2,'B']] # list中每個元素為一筆資料的所有欄位數值

dat = pd.DataFrame(data_list, columns = ['ID','Name'])
dat

Unnamed: 0,ID,Name
0,1,A
1,2,B


### 2.將JSON格式轉為DataFrame

In [4]:
data_json = '{"columns":["col 1","col 2"],  "index":["row 1","row 2"],  "data":[["a","b"],["c","d"]]}'

dat = pd.read_json(data_json)
dat

Unnamed: 0,columns,index,data
0,col 1,row 1,"[a, b]"
1,col 2,row 2,"[c, d]"


### 3. 從檔案讀取csv檔

In [7]:
dat = pd.read_csv('train.csv')
dat.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### 動手試試看

請讀取： Pokemon_g1.csv檔案

[data source](https://www.kaggle.com/abcsds/pokemon)

---

## 觀察資料

In [2]:
dat = pd.read_csv('Pokemon/Pokemon_g1.csv', encoding = 'Windows-1252')

In [16]:
dat.shape

(166, 11)

In [11]:
dat.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,,309,39,52,43,60,50,65


In [12]:
dat.index

RangeIndex(start=0, stop=166, step=1)

In [13]:
dat.columns

Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed'],
      dtype='object')

In [17]:
dat.dtypes

#           int64
Name       object
Type 1     object
Type 2     object
Total       int64
HP          int64
Attack      int64
Defense     int64
Sp. Atk     int64
Sp. Def     int64
Speed       int64
dtype: object

In [14]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 11 columns):
#          166 non-null int64
Name       166 non-null object
Type 1     165 non-null object
Type 2     79 non-null object
Total      166 non-null int64
HP         166 non-null int64
Attack     166 non-null int64
Defense    166 non-null int64
Sp. Atk    166 non-null int64
Sp. Def    166 non-null int64
Speed      166 non-null int64
dtypes: int64(8), object(3)
memory usage: 14.3+ KB


In [15]:
dat.describe()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
count,166.0,166.0,166.0,166.0,166.0,166.0,166.0,166.0
mean,75.819277,425.445783,65.885542,76.536145,70.861446,71.819277,69.150602,72.289157
std,45.153246,119.001528,28.119962,30.838709,28.641336,34.439106,25.447895,30.141899
min,1.0,46.0,10.0,5.0,5.0,15.0,20.0,1.0
25%,36.25,325.0,48.5,52.75,50.0,45.0,50.0,50.0
50%,76.5,436.5,62.0,75.0,66.0,65.0,70.0,70.0
75%,115.0,500.0,80.0,95.0,85.0,95.0,85.0,92.25
max,151.0,780.0,250.0,190.0,180.0,194.0,130.0,150.0


---

# Change data type

In [3]:
dat.dtypes

#           int64
Name       object
Type 1     object
Type 2     object
Total       int64
HP          int64
Attack      int64
Defense     int64
Sp. Atk     int64
Sp. Def     int64
Speed       int64
dtype: object

In [6]:
dat['#'].astype('str')[0]

'1'

---

# Missing value

In [19]:
dat['Type 2'].isna().head()

0    False
1    False
2    False
3    False
4     True
Name: Type 2, dtype: bool

In [25]:
print(dat['Type 2'][4])
print(type(dat['Type 2'][4]))
dat.head()

nan
<class 'float'>


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80
4,4,Charmander,Fire,,309,39,52,43,60,50,65


In [26]:
dat_ = dat.copy()
dat_['Type 2'] = dat_['Type 2'].fillna('Non') # 可以直接用 dat_['Type 2'].fillna('None', inplace = True) 取代

print(dat_['Type 2'][4])
print(type(dat_['Type 2'][4]))

Non
<class 'str'>
