<font size=17>Pandas 学习</font> 

# 数据与结构 Series

## Series
一个数列有<br>
name<br>index<br>value


In [1]:
# 引入pandas
import pandas as pd
import numpy as np

### 构建Series

#### 不同构建Series的方法

In [2]:
#list 构建
pd.Series(range(10,20))

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [3]:
# np.array 构建
pd.Series(np.random.rand(5))

0    0.004080
1    0.191747
2    0.044686
3    0.742520
4    0.123080
dtype: float64

In [4]:
#字典构建
d = {'a' : 3,'b' :8, 'c' : 7}
pd.Series(d)

a    3
b    8
c    7
dtype: int64

#### 构建时候指定索引

In [5]:
pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

a    0.018852
b    0.239506
c    0.681131
d    0.543540
e    0.482721
dtype: float64

### 数据预览

In [6]:
# 创建一个object
ser_obj = pd.Series(np.random.rand(100))

#### head()
head(sth)<br>
tail(sth)
sth = 需要浏览的行数

In [7]:
ser_obj.head(10)

0    0.381542
1    0.276199
2    0.077552
3    0.123350
4    0.306590
5    0.167666
6    0.957597
7    0.526043
8    0.736365
9    0.077445
dtype: float64

In [8]:
ser_obj.tail(5)

95    0.124510
96    0.587640
97    0.889619
98    0.551502
99    0.809366
dtype: float64

#### 获取索引

In [9]:
ser_obj.index

RangeIndex(start=0, stop=100, step=1)

In [10]:
ser_obj.values

array([0.38154189, 0.27619936, 0.07755173, 0.12334967, 0.30659019,
       0.16766555, 0.9575968 , 0.52604315, 0.73636544, 0.07744483,
       0.54107184, 0.8392352 , 0.68450141, 0.32043312, 0.65997867,
       0.33157566, 0.80290738, 0.38828806, 0.89216018, 0.69644638,
       0.1668053 , 0.66603102, 0.71769444, 0.93793638, 0.37619893,
       0.86585365, 0.73525919, 0.99520836, 0.76062578, 0.80346108,
       0.22297253, 0.55468586, 0.93684865, 0.45894065, 0.82702633,
       0.07658282, 0.45017969, 0.15152949, 0.49824436, 0.17202465,
       0.63623935, 0.50576472, 0.06041632, 0.74243177, 0.55804356,
       0.09963399, 0.24221247, 0.87869831, 0.70722429, 0.14648702,
       0.21669111, 0.47550499, 0.69097835, 0.12987254, 0.84135429,
       0.87630501, 0.66328284, 0.09770164, 0.36948052, 0.41083193,
       0.85518558, 0.99268638, 0.04956746, 0.63867511, 0.9043176 ,
       0.83960831, 0.1570925 , 0.99748922, 0.41849355, 0.4050944 ,
       0.15374593, 0.01077383, 0.69833196, 0.70459006, 0.44005

#### name 属性

In [11]:
# 构建一个Series 有名字
ser_obj = pd.Series(np.random.rand(100),name='rand_num')

In [12]:
ser_obj.head()

0    0.689932
1    0.841417
2    0.500986
3    0.644449
4    0.339937
Name: rand_num, dtype: float64

In [13]:
# 给索引去一个名字
ser_obj.index.name = 'index'

In [14]:
ser_obj.head()

index
0    0.689932
1    0.841417
2    0.500986
3    0.644449
4    0.339937
Name: rand_num, dtype: float64

### 通过索引获得数据

#### 通过索引获得对应的value

In [15]:
# 创建一个Series
ser_obj2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

In [16]:
# 通过索引获得对应的value
ser_obj2['b']

0.424677310559441

In [17]:
ser_obj2.loc['b']

0.424677310559441

#### Series 可以被看成定长有序的字典， 通过 in 来判断数据是否存在**

In [18]:
'a' in ser_obj2

True

#### iloc
**通过位置获得数据而不是索引**

In [19]:
ser_obj2[0]

0.9241329862416869

In [20]:
ser_obj.iloc[0]

0.6899323544517559

### 处理缺失的数据

In [21]:
countries = ['China','US','Japan',None]
pd.Series(countries)

0    China
1       US
2    Japan
3     None
dtype: object

In [22]:
numbers = [4,5,6,None]
pd.Series(numbers)

0    4.0
1    5.0
2    6.0
3    NaN
dtype: float64

# 数据与结构 DataFrame

类似于多维数组<br>
每列数据可以是不同的类型<br>
索引包含**行索引**和**列索引**


## DataFrame

In [23]:
# 引入pandas 和 numpy 包
import pandas as pd
import numpy as np

### 构建DataFrame
(1)通过numpy.array构建<br>
(2)通过python dictionary 构建

In [24]:
array = np.random.randn(5,4)

In [25]:
df_obj = pd.DataFrame(array)
df_obj

Unnamed: 0,0,1,2,3
0,0.542818,1.079415,-0.438936,0.693506
1,0.467106,1.988644,-1.296429,-0.832734
2,0.136454,0.364781,-0.955587,-1.506235
3,0.491959,0.024313,-0.624217,-1.182561
4,0.770325,0.337392,0.575021,-0.542268


In [26]:
# 通过dict
dict_data = {'a':1,
             'b': pd.Timestamp('20190101'),
             'c': pd.Series(1, index = list(range(4)),dtype ='float32'),
             'd':np.array([3]*4,dtype = 'int32'), # 3 重复了4次
             'e':['Python','Java','C++','C#'],
             'f':'something'
    
}

In [27]:
dict_data

{'a': 1, 'b': Timestamp('2019-01-01 00:00:00'), 'c': 0    1.0
 1    1.0
 2    1.0
 3    1.0
 dtype: float32, 'd': array([3, 3, 3, 3], dtype=int32), 'e': ['Python',
  'Java',
  'C++',
  'C#'], 'f': 'something'}

In [28]:
df_obj2 = pd.DataFrame(dict_data)
df_obj2

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


**数据大小不一样，可以生成DataFrame**<br>
哪个数据小，会被扩展成为最大的<br>
**DataFrame 的数据类型，是按照列来形成的*

### 获得DataFrame的属性

#### 获得列名
与Series一样

In [29]:
df_obj2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

#### 获得索引

In [30]:
df_obj2.index

Int64Index([0, 1, 2, 3], dtype='int64')

#### 获得值

##### 获得所有的值

In [31]:
df_obj2.values

array([[1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Python',
        'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Java', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C++', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C#', 'something']],
      dtype=object)

##### 获得某一列的值

In [32]:
df_obj2['e']

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [33]:
df_obj2.e #如果列名有空格或者其他字符，可能会返回错误

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [34]:
type(df_obj2.e)

pandas.core.series.Series

#### head & tail

In [35]:
df_obj2.head(3)

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something


In [36]:
df_obj2.tail(3)

Unnamed: 0,a,b,c,d,e,f
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


## 数据清洗

### 修改数据

#### 增加数据列

In [37]:
df_obj2['g'] = range(4)

In [38]:
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


#### 删除数据


##### drop 返回值是操作结果，原数据不会改变

In [39]:
df_obj2.drop(columns=['b','c']) 

Unnamed: 0,a,d,e,f,g
0,1,3,Python,something,0
1,1,3,Java,something,1
2,1,3,C++,something,2
3,1,3,C#,something,3


In [40]:
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


##### del 对原数据进行修改

In [41]:
del df_obj2['a']
df_obj2

Unnamed: 0,b,c,d,e,f,g
0,2019-01-01,1.0,3,Python,something,0
1,2019-01-01,1.0,3,Java,something,1
2,2019-01-01,1.0,3,C++,something,2
3,2019-01-01,1.0,3,C#,something,3


### 处理缺失数据

In [126]:
#读取缺失文件
df = pd.read_csv('log.csv')
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


#### 判断是否存在缺失值 isnull()

In [127]:
df.isnull()

Unnamed: 0,time,user,video,playback position,paused,volume
0,False,False,False,False,False,False
1,False,False,False,False,True,True
2,False,False,False,False,True,True
3,False,False,False,False,True,True
4,False,False,False,False,True,True
5,False,False,False,False,True,True
6,False,False,False,False,True,True
7,False,False,False,False,True,True
8,False,False,False,False,True,True
9,False,False,False,False,True,True


In [129]:
#结合all()来使用isnull（）
df.isnull().any(axis = 0)

time                 False
user                 False
video                False
playback position    False
paused                True
volume                True
dtype: bool

any（）可以按照列或者行 <br>
**axis = 0**返回每一列是否有空值<br>
**axis = 1**返回每一行是否有空值

#### 丢弃缺失数据，注意inplace参数

In [131]:
df.dropna()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
13,1469974424,sue,advanced.html,23,False,10.0
24,1469977424,bob,intro.html,1,True,10.0


In [132]:
df.dropna(axis = 1)

Unnamed: 0,time,user,video,playback position
0,1469974424,cheryl,intro.html,5
1,1469974454,cheryl,intro.html,6
2,1469974544,cheryl,intro.html,9
3,1469974574,cheryl,intro.html,10
4,1469977514,bob,intro.html,1
5,1469977544,bob,intro.html,1
6,1469977574,bob,intro.html,1
7,1469977604,bob,intro.html,1
8,1469974604,cheryl,intro.html,11
9,1469974694,cheryl,intro.html,14


#### 选择特定某一列dropna

In [133]:
df.dropna(subset = ['volume'])

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
13,1469974424,sue,advanced.html,23,False,10.0
16,1469974654,sue,advanced.html,28,,5.0
24,1469977424,bob,intro.html,1,True,10.0


### 填充缺失数据
fillna() <br>
df.ffill() 按照前面数据填充 <br>
df.bfill() 按照后面数据填充 <br>

In [137]:
df.ffill()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,False,10.0
2,1469974544,cheryl,intro.html,9,False,10.0
3,1469974574,cheryl,intro.html,10,False,10.0
4,1469977514,bob,intro.html,1,False,10.0
5,1469977544,bob,intro.html,1,False,10.0
6,1469977574,bob,intro.html,1,False,10.0
7,1469977604,bob,intro.html,1,False,10.0
8,1469974604,cheryl,intro.html,11,False,10.0
9,1469974694,cheryl,intro.html,14,False,10.0


In [135]:
df.bfill()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,False,10.0
2,1469974544,cheryl,intro.html,9,False,10.0
3,1469974574,cheryl,intro.html,10,False,10.0
4,1469977514,bob,intro.html,1,False,10.0
5,1469977544,bob,intro.html,1,False,10.0
6,1469977574,bob,intro.html,1,False,10.0
7,1469977604,bob,intro.html,1,False,10.0
8,1469974604,cheryl,intro.html,11,False,10.0
9,1469974694,cheryl,intro.html,14,False,10.0


### 处理重复数据

In [138]:
data = pd.DataFrame({'age':[28,31,27,28],
                    'gender':['M','M','M','F'],
                    'surname':['Liu','Li','Chen','Liu']})

data

Unnamed: 0,age,gender,surname
0,28,M,Liu
1,31,M,Li
2,27,M,Chen
3,28,F,Liu


In [139]:
data.duplicated()

0    False
1    False
2    False
3    False
dtype: bool

In [142]:
data.duplicated(subset=['age','surname'])

0    False
1    False
2    False
3     True
dtype: bool

##### 删除重复值

duplicated(subset) --返回布尔型series表示每行是否为重复行

drop_duplicates(subset,keep)

    默认判断全部列，可以通过subset参数来指定列 <br>
    keep，默认保留第一次出现的数据 (keep = 'first' or 'last)

In [143]:
data.drop_duplicates(subset =['age','surname'],keep = 'last')

Unnamed: 0,age,gender,surname
1,31,M,Li
2,27,M,Chen
3,28,F,Liu


### 替换数据
df.replace(to_replace)，参数to_replace为可以是：

数值，字符串：需要替换的值，新的值，比如to_replace=23，45，表示将df数据中的23替换为45

列表：第一个列表中的元素是需要被替换掉的值，第二个列表中的元素是新的值。两个列表需要一一对应。

字典，键是需要被替换掉的值，值为新的值

**两个参数**
（需要换掉的值，新的值）

In [144]:
data

Unnamed: 0,age,gender,surname
0,28,M,Liu
1,31,M,Li
2,27,M,Chen
3,28,F,Liu


In [146]:
data.replace(28,29, inplace = True)
data

Unnamed: 0,age,gender,surname
0,29,M,Liu
1,31,M,Li
2,27,M,Chen
3,29,F,Liu


In [147]:
data.replace([29,31,27,29],[28,31,27,28], inplace = True)
data

Unnamed: 0,age,gender,surname
0,28,M,Liu
1,31,M,Li
2,27,M,Chen
3,28,F,Liu


In [148]:
data.replace({28:29,31:40}, inplace = True)
data

Unnamed: 0,age,gender,surname
0,29,M,Liu
1,40,M,Li
2,27,M,Chen
3,29,F,Liu


有DataFrame结构的data数据，对df_obj中的A到Z之间的数据替换为99

**“A-Z”表示A到Z的所有大写字母，采用了正则表达式，regex表示是否采用正则表达式。**

In [149]:
data.replace('[A-Z]',99,regex=True ,inplace=True)
data

Unnamed: 0,age,gender,surname
0,29,99,99
1,40,99,99
2,27,99,99
3,29,99,99


# 数据与结构 Index

索引**不可被改变**<br>
<br>
***索引的种类***<br>(1)index<br>(2)int64index以整形作为index<br>(3)MultiIndex层级索引， 年份与月份<br>(4)DatatimeIndex 时间序列<br>

In [42]:
# 构建Series
ser_obj = pd.Series(range(10,20,2),index = ['a','b','c','d','e'])
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64

#### 查看index

In [43]:
ser_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [44]:
ser_obj.index[2] #index 是无法单个修改

'c'

#### 重置索引
reset_index()

In [45]:
ser_obj.reset_index(drop = True)
# 返回修改数据，而非在原数据上修改

0    10
1    12
2    14
3    16
4    18
dtype: int64

In [46]:
ser_obj.reset_index(drop = False)
# drop = False 会保留index 这个column

Unnamed: 0,index,0
0,a,10
1,b,12
2,c,14
3,d,16
4,e,18


In [47]:
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64

In [48]:
ser_obj = pd.Series([2,4,5,9,1,6])

ser_obj.index = ['a','b','c','d','e','f']

In [49]:
ser_obj[ser_obj > 3]

b    4
c    5
d    9
f    6
dtype: int64

In [50]:
ser_obj['a':'c']

a    2
b    4
c    5
dtype: int64

## DataFrame索引操作

In [51]:
# 构建dataframe
country1 = pd.Series({'Name':'China','Language':'Chinese','Area':'9.597M km2','Happiness Rank': 79})
country2 = pd.Series({'Name':'U.S','Language':'English','Area':'9.834M km2','Happiness Rank': 14})
country3 = pd.Series({'Name':'Australia','Language':'English','Area':'7.692M km2','Happiness Rank': 9})

df = pd.DataFrame([country1,country2,country3], index = ['CH','US','AU'])

In [52]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


### 列索引
df_obj['label'] <br>
**不连续列索引** <br>
df_obj[['lable1','lable2']] <br>
**连续索引** <br>
df_obj[['lable1':'lable3']] # lable 必须是有顺序的数字

In [53]:
df['Name']

CH        China
US          U.S
AU    Australia
Name: Name, dtype: object

In [54]:
df[['Area','Name']]

Unnamed: 0,Area,Name
CH,9.597M km2,China
US,9.834M km2,U.S
AU,7.692M km2,Australia


### 行索引

df_obj.loc[&emsp;] &emsp; #<br> 
df_obj.iloc[&emsp;] &emsp; 

In [55]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


In [56]:
df.loc['CH'] 

Name                   China
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

In [57]:
df.iloc[0]

Name                   China
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Name: CH, dtype: object

### 先行后列

In [58]:
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])

9.597M km2
9.597M km2


### 先列后行

In [59]:
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

9.597M km2
9.597M km2
9.597M km2


### inplace 参数
默认 inplace = False, 表示将操作后的结果进行返回，对原始数据**不会产生影响** <br>
inplace = True, 没有返回值，在原始数据上进行操作，对原数据**会产生影响**

In [60]:
#复制一个 Data Frame
df2 = df.copy()

In [61]:
df.drop('Area',axis = 1)

Unnamed: 0,Name,Language,Happiness Rank
CH,China,Chinese,79
US,U.S,English,14
AU,Australia,English,9


In [62]:
df

Unnamed: 0,Name,Language,Area,Happiness Rank
CH,China,Chinese,9.597M km2,79
US,U.S,English,9.834M km2,14
AU,Australia,English,7.692M km2,9


**上面对于df 没有修改，只是返回了一个新的DF，需要一个新的变量名字去接收**

In [63]:
df.drop('Area',axis = 1,inplace = True)

In [64]:
df

Unnamed: 0,Name,Language,Happiness Rank
CH,China,Chinese,79
US,U.S,English,14
AU,Australia,English,9


**inplace = True, 没有返回值，在原始数据上进行操作，对原数据会产生影响** <br>
就算有新的变量去接收，也会对原值进行修改

## 布尔遮罩
条件索引 <br>
设置布尔值，再放入df[]里面

### 条件索引（值）

In [65]:
df[df['Happiness Rank'] <= 20]

Unnamed: 0,Name,Language,Happiness Rank
US,U.S,English,14
AU,Australia,English,9


### 条件索引（字符包含）

In [66]:
# 生成布尔mask
df['Language'].str.contains('English')
condition = df['Language'].str.contains('English')

In [67]:
#返回值
df[condition]

Unnamed: 0,Name,Language,Happiness Rank
US,U.S,English,14
AU,Australia,English,9


In [68]:
df_obj = pd.DataFrame([[1,2,3],[2,1,1],[4,3,1],[5,1,2]],columns=['a','b','c'])
df_obj

Unnamed: 0,a,b,c
0,1,2,3
1,2,1,1
2,4,3,1
3,5,1,2


In [69]:
df_obj.loc[df_obj.loc[:,"c"]>1,:]

Unnamed: 0,a,b,c
0,1,2,3
3,5,1,2


## 排序
按索引排序，sort_index() <br>
-按值排序，sort_values(by, ascending) <br>
    -按**单列的值**排序<br>
    -by = ‘label’<br>
    -ascending： True 升序， False 降序

### 按索引排序

In [116]:
# 读取文件生成Dataframe
df = pd.read_csv("2016_happiness.csv",usecols = ['Country','Region','Happiness Rank','Happiness Score'],index_col = 'Happiness Rank')
df.head()

Unnamed: 0_level_0,Country,Region,Happiness Score
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Denmark,Western Europe,7.526
2,Switzerland,Western Europe,7.509
3,Iceland,Western Europe,7.501
4,Norway,Western Europe,7.498
5,Finland,Western Europe,7.413


In [117]:
df.sort_index().head(10)
# 默认升序排列

Unnamed: 0_level_0,Country,Region,Happiness Score
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Denmark,Western Europe,7.526
2,Switzerland,Western Europe,7.509
3,Iceland,Western Europe,7.501
4,Norway,Western Europe,7.498
5,Finland,Western Europe,7.413
6,Canada,North America,7.404
7,Netherlands,Western Europe,7.339
8,New Zealand,Australia and New Zealand,7.334
9,Australia,Australia and New Zealand,7.313
10,Sweden,Western Europe,7.291


In [118]:
df.sort_index(ascending = False).head(10)
#降序排列

Unnamed: 0_level_0,Country,Region,Happiness Score
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
157,Burundi,Sub-Saharan Africa,2.905
156,Syria,Middle East and Northern Africa,3.069
155,Togo,Sub-Saharan Africa,3.303
154,Afghanistan,Southern Asia,3.36
153,Benin,Sub-Saharan Africa,3.484
152,Rwanda,Sub-Saharan Africa,3.515
151,Guinea,Sub-Saharan Africa,3.607
150,Liberia,Sub-Saharan Africa,3.622
149,Tanzania,Sub-Saharan Africa,3.666
148,Madagascar,Sub-Saharan Africa,3.695


### 按列名排序

In [121]:
df.sort_index(axis = 1, ascending = True).head()

Unnamed: 0_level_0,Country,Happiness Score,Region
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Denmark,7.526,Western Europe
2,Switzerland,7.509,Western Europe
3,Iceland,7.501,Western Europe
4,Norway,7.498,Western Europe
5,Finland,7.413,Western Europe


### 按多列的值排序
by = []

In [124]:
df = df.sort_index(axis = 1)
df.sort_values(by = ['Region','Country'],ascending = [True,False]).head(10)

Unnamed: 0_level_0,Country,Happiness Score,Region
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,New Zealand,7.334,Australia and New Zealand
9,Australia,7.313,Australia and New Zealand
49,Uzbekistan,5.987,Central and Eastern Europe
123,Ukraine,4.324,Central and Eastern Europe
65,Turkmenistan,5.658,Central and Eastern Europe
100,Tajikistan,4.996,Central and Eastern Europe
63,Slovenia,5.768,Central and Eastern Europe
45,Slovakia,6.078,Central and Eastern Europe
86,Serbia,5.177,Central and Eastern Europe
56,Russia,5.856,Central and Eastern Europe


# 运算与对齐

## Series 运算
pandas 可以对不同索引的对象进行算术计算 <br>
索引与数据的对应关系仍保持在数组运算的结果中 <br>
如果没对齐的位置补NaN

In [70]:
ser1 = pd.Series(range(10))
ser2 = pd.Series(range(5))
ser1

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [71]:
ser1 + ser2

0    0.0
1    2.0
2    4.0
3    6.0
4    8.0
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: float64

## DataFrame 运算
相同位置的数据进行计算**重合的地方计算**

### DataFrame间计算

In [72]:
df1 = pd.DataFrame(np.ones((3,3)),columns=['a','b','c'])
df1

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [73]:
df2 = pd.DataFrame(np.ones((2,2)),columns=['a','b'])
df2

Unnamed: 0,a,b
0,1.0,1.0
1,1.0,1.0


In [74]:
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


### Series 和 DataFrame 运算
Series 和 DataFrame 操作时， Series 被看作行数据（Index 被看作列）， 和 DataFrame中的每行数据进行计算

In [75]:
ser2 + df1
# 因为列不相同，不相加

Unnamed: 0,a,b,c,0,1,2,3,4
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,


In [76]:
ser3 = pd.Series([1,2,3],index = ['a','b','c'])
ser3

a    1
b    2
c    3
dtype: int64

In [77]:
df1

Unnamed: 0,a,b,c
0,1.0,1.0,1.0
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [78]:
ser3 + df1

Unnamed: 0,a,b,c
0,2.0,3.0,4.0
1,2.0,3.0,4.0
2,2.0,3.0,4.0


In [79]:
ser1 = pd.Series([10,20,30,40,50,60])

df1=pd.DataFrame([[1,1,1]]*3)

In [80]:
df = ser1+df1
df

Unnamed: 0,0,1,2,3,4,5
0,11,21,31,,,
1,11,21,31,,,
2,11,21,31,,,


In [81]:
df = df1.add(ser1)
df

Unnamed: 0,0,1,2,3,4,5
0,11,21,31,,,
1,11,21,31,,,
2,11,21,31,,,


### 填补缺失值

#### fill_value 

In [82]:
ser1 = pd.Series([10,20,30,40,50,60])
ser2 = pd.Series([1,2,3])
ser1.add(ser2,fill_value=4)

0    11.0
1    22.0
2    33.0
3    44.0
4    54.0
5    64.0
dtype: float64

#### DataFrame.fillna 

In [83]:
df1=pd.DataFrame([ [1,1,1] ]*3)
df2=pd.DataFrame([ [2,2] ]*2)
df3 = df1 + df2
df3

Unnamed: 0,0,1,2
0,3.0,3.0,
1,3.0,3.0,
2,,,


In [84]:
df3 = df3.fillna(4)
df3

Unnamed: 0,0,1,2
0,3.0,3.0,4.0
1,3.0,3.0,4.0
2,4.0,4.0,4.0


# 函数应用

## map（）
将函数作用于一个**Series**的每一个元素<br>
类似于Python的高阶函数map()<br>

### Python 的高阶函数map()

In [85]:
lis= list(range(10))
lis

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [86]:
import math
result = map(math.sqrt,lis)
print(result)

<map object at 0x11d7586d0>


**返回一个object，操作未进行**，直到取出结果，才会执行操作

In [87]:
list(result)

[0.0,
 1.0,
 1.4142135623730951,
 1.7320508075688772,
 2.0,
 2.23606797749979,
 2.449489742783178,
 2.6457513110645907,
 2.8284271247461903,
 3.0]

### Pandas 中的map()函数
**只运用于Series**

In [88]:
ser = pd.Series(lis)
ser

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [89]:
ser.map(np.sqrt)

0    0.000000
1    1.000000
2    1.414214
3    1.732051
4    2.000000
5    2.236068
6    2.449490
7    2.645751
8    2.828427
9    3.000000
dtype: float64

### 自定义函数

#### 简短lambda

In [90]:
ser.map(lambda x:x **2 + 1)

0     1
1     2
2     5
3    10
4    17
5    26
6    37
7    50
8    65
9    82
dtype: int64

#### 复杂传入新的function name

In [91]:
# 定义一个函数
def func(x):
    x1 = x**3
    x2 = x1*2 +1
    return x2

In [92]:
ser.map(func)

0       1
1       3
2      17
3      55
4     129
5     251
6     433
7     687
8    1025
9    1459
dtype: int64

#### 拼接

In [93]:
ser = pd.Series(list("abcde"))
ser

0    a
1    b
2    c
3    d
4    e
dtype: object

##### 拼接方法一

In [94]:
ser.map(lambda x: ".".join([x,"com"]))

0    a.com
1    b.com
2    c.com
3    d.com
4    e.com
dtype: object

##### 拼接方法二

In [95]:
result = map(lambda x: x + ".com", ser)
list(result)

['a.com', 'b.com', 'c.com', 'd.com', 'e.com']

## apply() & applymap()
可以运用到DataFrame

apply <br>
axis = 0 每一列求，返回一行 <br>
axis = 1 每一行求，返回一列 <br>
applymap() <br>
对每一个数据处理

### apply()

In [96]:
df = pd.DataFrame(np.arange(10).reshape(5,2),columns = ['col_1','col_2'])
df

Unnamed: 0,col_1,col_2
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9


In [97]:
df.apply(np.sum)

col_1    20
col_2    25
dtype: int64

In [98]:
df.apply(np.sum,axis = 1)

0     1
1     5
2     9
3    13
4    17
dtype: int64

### applymap()

In [99]:
df.applymap(lambda x : x**2)

Unnamed: 0,col_1,col_2
0,0,1
1,4,9
2,16,25
3,36,49
4,64,81


In [100]:
staff_df = pd.DataFrame([{'姓名': '张三', '部门': '研发部'},
                         {'姓名': '李四', '部门': '财务部'},
                         {'姓名': '赵六', '部门': '市场部'}])
staff_df

Unnamed: 0,姓名,部门
0,张三,研发部
1,李四,财务部
2,赵六,市场部


In [101]:
staff_df['姓名'].apply(lambda x:x[0])

0    张
1    李
2    赵
Name: 姓名, dtype: object

In [102]:
staff_df["姓"] = staff_df['姓名'].apply(lambda x:x[0])
staff_df

Unnamed: 0,姓名,部门,姓
0,张三,研发部,张
1,李四,财务部,李
2,赵六,市场部,赵


In [103]:
df = pd.DataFrame(np.arange(15).reshape(5,3), columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14


In [104]:
df.loc[2,:].apply(lambda x: x**2)

a    36
b    49
c    64
Name: 2, dtype: int64

# 文件读写

## 读文件
pd.read_csv(filepath,usecols,index_col)<br>
filepath: 文件路径 <br>
usecols: 指定需要读取的列（默认全部读取）<br>
index_col: 指定某列为索引列，默认会生成一列索引0，1，2...

In [106]:
df = pd.read_csv("2016_happiness.csv",usecols = ['Country','Region','Happiness Rank','Happiness Score'])
df.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score
0,Denmark,Western Europe,1,7.526
1,Switzerland,Western Europe,2,7.509
2,Iceland,Western Europe,3,7.501
3,Norway,Western Europe,4,7.498
4,Finland,Western Europe,5,7.413


In [108]:
df = pd.read_csv("2016_happiness.csv",usecols = ['Country','Region','Happiness Rank','Happiness Score'],index_col = 'Country')
df.head()

Unnamed: 0_level_0,Region,Happiness Rank,Happiness Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


**country 是index**

## 快速查看数据基本信息
df.info()

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 4 columns):
Country            157 non-null object
Region             157 non-null object
Happiness Rank     157 non-null int64
Happiness Score    157 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 5.0+ KB


## 保存数据
df.to_csv(filepath, index)<br>
filepath: 保存的路径 <br>
index： 是否将索引列保存，默认为True

In [109]:
df['int_score'] = df['Happiness Score'].apply(np.around)
df

Unnamed: 0_level_0,Region,Happiness Rank,Happiness Score,int_score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Denmark,Western Europe,1,7.526,8.0
Switzerland,Western Europe,2,7.509,8.0
Iceland,Western Europe,3,7.501,8.0
Norway,Western Europe,4,7.498,7.0
Finland,Western Europe,5,7.413,7.0
...,...,...,...,...
Benin,Sub-Saharan Africa,153,3.484,3.0
Afghanistan,Southern Asia,154,3.360,3.0
Togo,Sub-Saharan Africa,155,3.303,3.0
Syria,Middle East and Northern Africa,156,3.069,3.0


In [113]:
df.to_csv('index.csv')
df.to_csv('no_index.csv',index = False)

In [112]:
df_index = pd.read_csv('index.csv')
df_index.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,int_score
0,Denmark,Western Europe,1,7.526,8.0
1,Switzerland,Western Europe,2,7.509,8.0
2,Iceland,Western Europe,3,7.501,8.0
3,Norway,Western Europe,4,7.498,7.0
4,Finland,Western Europe,5,7.413,7.0


In [114]:
df_noindex = pd.read_csv('no_index.csv')
df_noindex.head()

Unnamed: 0,Region,Happiness Rank,Happiness Score,int_score
0,Western Europe,1,7.526,8.0
1,Western Europe,2,7.509,8.0
2,Western Europe,3,7.501,8.0
3,Western Europe,4,7.498,7.0
4,Western Europe,5,7.413,7.0


# 常用统计方法
describe()：快速查看每列数据的统计信息，以下是可以输出的统计指标

count，数据个数（非空数据）

mean，均值

std，标准差

min，最小值

25%，第1四分位数，即第25百分位数

50%，第2四分位数，即第50百分位数

75%，第3四分位数，即第75百分位数

max，最大值

quantile(q)：

输出指定位置的百分位数，默认q=0.5，q的范围是[0,1]

In [150]:
data.describe()

Unnamed: 0,age,gender,surname
count,4.0,4.0,4.0
mean,31.25,99.0,99.0
std,5.909033,0.0,0.0
min,27.0,99.0,99.0
25%,28.5,99.0,99.0
50%,29.0,99.0,99.0
75%,31.75,99.0,99.0
max,40.0,99.0,99.0
