<font size=17>Pandas 学习</font> 

# 数据与结构 Series

## Series
一个数列有<br>
name<br>index<br>value


In [1]:
# Import pandas and NumPy under their conventional aliases
import pandas as pd
import numpy as np
# Seed NumPy's global RNG once so that every np.random.rand / np.random.randn
# example in this notebook yields the same numbers on Restart & Run All
np.random.seed(42)

### 构建Series

#### 不同构建Series的方法

In [2]:
# Build a Series from a sequence of integers (here range 10..19); the index defaults to 0..9
pd.Series(range(10,20))

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [3]:
# Build a Series from a NumPy array of 5 random floats
pd.Series(np.random.rand(5))

0    0.057185
1    0.811203
2    0.807556
3    0.011090
4    0.171594
dtype: float64

In [4]:
# Build a Series from a dict: the keys become the index labels, the values become the data
d = {'a' : 3,'b' :8, 'c' : 7}
pd.Series(d)

a    3
b    8
c    7
dtype: int64

#### 构建时候指定索引

In [5]:
# Random data with explicit string labels a-e supplied as the index
pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

a    0.017655
b    0.458761
c    0.148998
d    0.099902
e    0.777339
dtype: float64

### 数据预览

In [6]:
# Create a 100-element Series of random floats to preview in the cells below
ser_obj = pd.Series(np.random.rand(100))

#### head()
`head(sth)`<br>
`tail(sth)`<br>
sth = 需要浏览的行数（不传参数时默认为 5 行）

In [7]:
# First 10 rows
ser_obj.head(10)

0    0.919971
1    0.443245
2    0.509363
3    0.684076
4    0.325076
5    0.931150
6    0.187366
7    0.302792
8    0.275224
9    0.369558
dtype: float64

In [8]:
# Last 5 rows
ser_obj.tail(5)

95    0.574543
96    0.308492
97    0.631395
98    0.536278
99    0.138368
dtype: float64

#### 获取索引和值

In [9]:
# Index labels; with no index given at construction this is a RangeIndex
ser_obj.index

RangeIndex(start=0, stop=100, step=1)

In [10]:
# Underlying data as a NumPy array (prints all 100 values)
ser_obj.values

array([0.91997145, 0.44324535, 0.5093631 , 0.68407617, 0.32507564,
       0.93114951, 0.18736617, 0.3027922 , 0.2752244 , 0.36955825,
       0.96113547, 0.23033145, 0.15887868, 0.09584305, 0.94099179,
       0.98396632, 0.68718141, 0.58314912, 0.96877197, 0.36292943,
       0.16297761, 0.64168722, 0.51573958, 0.77501806, 0.51275546,
       0.90159063, 0.98369569, 0.93387071, 0.72452352, 0.73267772,
       0.52160817, 0.69118363, 0.70690393, 0.76314034, 0.28719912,
       0.16858007, 0.29856142, 0.241091  , 0.64272195, 0.51639193,
       0.67632998, 0.96921087, 0.26833232, 0.81917518, 0.18961408,
       0.7126533 , 0.2374855 , 0.36104166, 0.17227814, 0.71469034,
       0.09888176, 0.95726426, 0.68221487, 0.6683518 , 0.19530746,
       0.64359642, 0.85083467, 0.89648256, 0.45624511, 0.09178201,
       0.62971237, 0.83255155, 0.80113013, 0.44904946, 0.21481291,
       0.15440754, 0.73015927, 0.76450739, 0.07684082, 0.88185358,
       0.84047895, 0.25917198, 0.84954241, 0.67975332, 0.85369

#### name 属性

In [11]:
# Build a Series that carries a name ('rand_num'); note this rebinds ser_obj to new random data
ser_obj = pd.Series(np.random.rand(100),name='rand_num')

In [12]:
# The Series name is shown in the footer of the output
ser_obj.head()

0    0.674105
1    0.790460
2    0.688416
3    0.824570
4    0.680837
Name: rand_num, dtype: float64

In [13]:
# Give the index itself a name
ser_obj.index.name = 'index'

In [14]:
# The index name now appears as a header line above the rows
ser_obj.head()

index
0    0.674105
1    0.790460
2    0.688416
3    0.824570
4    0.680837
Name: rand_num, dtype: float64

### 通过索引获得数据

#### 通过索引获得对应的value

In [15]:
# Create a small Series of random floats with string labels a-e
ser_obj2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])

In [16]:
# Look up a value by its index label
ser_obj2['b']

0.23048753367641073

In [17]:
# .loc performs the same label-based lookup explicitly
ser_obj2.loc['b']

0.23048753367641073

#### Series 可以被看成定长有序的字典，通过 in 来判断索引标签是否存在

In [18]:
# Membership test against the index labels, as with dict keys
'a' in ser_obj2

True

#### iloc
**通过位置获得数据而不是索引**

In [19]:
# Positional access must go through .iloc. Plain ser_obj2[0] on a string-labelled
# index relies on the deprecated integer-as-position fallback of Series.__getitem__,
# which newer pandas versions warn about and eventually treat as a (missing) label.
ser_obj2.iloc[0]

0.8751581535645485

In [20]:
# First element by position. Note: this reads ser_obj (the Series named 'rand_num'), not ser_obj2
ser_obj.iloc[0]

0.6741046577229347

### 处理缺失的数据

In [21]:
# A None among strings is kept as None; the resulting dtype is object
countries = ['China','US','Japan',None]
pd.Series(countries)

0    China
1       US
2    Japan
3     None
dtype: object

In [22]:
# A None among numbers becomes NaN and the integers are stored as float64
numbers = [4,5,6,None]
pd.Series(numbers)

0    4.0
1    5.0
2    6.0
3    NaN
dtype: float64

# 数据与结构 DataFrame

类似于多维数组<br>
每列数据可以是不同的类型<br>
索引包含**行索引**和**列索引**


## DataFrame

In [23]:
# 引入pandas 和 numpy 包
import pandas as pd
import numpy as np

### 构建DataFrame
(1)通过numpy.array构建<br>
(2)通过python dictionary 构建

In [24]:
# 5x4 NumPy array of random samples, used as DataFrame input below
array = np.random.randn(5,4)

In [25]:
# Wrap the array in a DataFrame; row and column labels default to integers
df_obj = pd.DataFrame(array)
df_obj

Unnamed: 0,0,1,2,3
0,-0.796777,-1.695386,0.662187,0.090593
1,-0.969008,0.035826,-1.56202,0.593529
2,-0.252309,0.123815,-1.309434,0.960504
3,-1.324124,-0.68371,-0.699364,0.107232
4,-0.064402,-1.222414,0.910014,-1.06411


In [26]:
# Column specification for a DataFrame built from a dict.
# Scalars (a, b, f) are mixed with length-4 sequences (c, d, e).
dict_data = {
    'a': 1,
    'b': pd.Timestamp('20190101'),
    'c': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'd': np.full(4, 3, dtype='int32'),  # the value 3 repeated 4 times
    'e': ['Python', 'Java', 'C++', 'C#'],
    'f': 'something',
}

In [27]:
# Inspect the raw dict before converting it to a DataFrame
dict_data

{'a': 1,
 'b': Timestamp('2019-01-01 00:00:00'),
 'c': 0    1.0
 1    1.0
 2    1.0
 3    1.0
 dtype: float32,
 'd': array([3, 3, 3, 3]),
 'e': ['Python', 'Java', 'C++', 'C#'],
 'f': 'something'}

In [28]:
# Build the DataFrame; the scalar entries (a, b, f) are repeated on every row
df_obj2 = pd.DataFrame(dict_data)
df_obj2

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


**标量与序列可以混合构建 DataFrame**<br>
标量（如上面的 `a`、`b`、`f`）会被广播到与最长的列相同的行数；但长度不同的列表/数组不会被自动扩展，会抛出 `ValueError`<br>
**DataFrame 的数据类型是按列确定的**

### 获得DataFrame的属性

#### 获得列名
与Series一样

In [29]:
# Column labels
df_obj2.columns

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

#### 获得索引

In [30]:
# Row index
df_obj2.index

Int64Index([0, 1, 2, 3], dtype='int64')

#### 获得值

##### 获得所有的值

In [31]:
# All cell values as a 2-D NumPy array (object dtype here; the columns hold mixed types)
df_obj2.values

array([[1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Python',
        'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'Java', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C++', 'something'],
       [1, Timestamp('2019-01-01 00:00:00'), 1.0, 3, 'C#', 'something']],
      dtype=object)

##### 获得某一列的值

In [32]:
# Select a single column by its label
df_obj2['e']

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [33]:
df_obj2.e # Attribute access; may raise an error if the column name contains spaces or other special characters

0    Python
1      Java
2       C++
3        C#
Name: e, dtype: object

In [34]:
# A single column is a pandas Series
type(df_obj2.e)

pandas.core.series.Series

#### head & tail

In [35]:
# First 3 rows
df_obj2.head(3)

Unnamed: 0,a,b,c,d,e,f
0,1,2019-01-01,1.0,3,Python,something
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something


In [36]:
# Last 3 rows
df_obj2.tail(3)

Unnamed: 0,a,b,c,d,e,f
1,1,2019-01-01,1.0,3,Java,something
2,1,2019-01-01,1.0,3,C++,something
3,1,2019-01-01,1.0,3,C#,something


### 修改数据

#### 增加数据列

In [37]:
# Add a new column 'g' by assignment; this modifies df_obj2 itself
df_obj2['g'] = range(4)

In [38]:
# Show the frame, now including column g
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


#### 删除数据


##### drop 返回值是操作结果，原数据不会改变

In [39]:
# drop returns a new frame without columns b and c
df_obj2.drop(columns=['b','c']) 

Unnamed: 0,a,d,e,f,g
0,1,3,Python,something,0
1,1,3,Java,something,1
2,1,3,C++,something,2
3,1,3,C#,something,3


In [40]:
# df_obj2 itself still contains columns b and c
df_obj2

Unnamed: 0,a,b,c,d,e,f,g
0,1,2019-01-01,1.0,3,Python,something,0
1,1,2019-01-01,1.0,3,Java,something,1
2,1,2019-01-01,1.0,3,C++,something,2
3,1,2019-01-01,1.0,3,C#,something,3


##### del 对原数据进行修改

In [41]:
# del removes column 'a' from df_obj2 itself.
# NOTE: not idempotent - running this cell a second time fails because 'a' no longer exists.
del df_obj2['a']
df_obj2

Unnamed: 0,b,c,d,e,f,g
0,2019-01-01,1.0,3,Python,something,0
1,2019-01-01,1.0,3,Java,something,1
2,2019-01-01,1.0,3,C++,something,2
3,2019-01-01,1.0,3,C#,something,3


# 数据与结构 Index

索引**不可被改变**<br>
<br>
***索引的种类***<br>(1)`Index`<br>(2)`Int64Index` 以整型作为 index<br>(3)`MultiIndex` 层级索引，例如年份与月份<br>(4)`DatetimeIndex` 时间序列<br>

In [52]:
# Build a Series with values 10, 12, ..., 18 and string labels a-e (rebinds ser_obj)
ser_obj = pd.Series(range(10,20,2),index = ['a','b','c','d','e'])
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64

#### 查看index

In [53]:
# Index labels of the Series
ser_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [54]:
ser_obj.index[2] # Index entries can be read by position, but a single entry cannot be modified

'c'

#### 重置索引
reset_index()

In [57]:
ser_obj.reset_index(drop = True)
# Returns a new Series with a fresh 0..n-1 index (old labels discarded); the original is not modified

0    10
1    12
2    14
3    16
4    18
dtype: int64

In [59]:
ser_obj.reset_index(drop = False)
# drop = False keeps the old index as a column named 'index'

Unnamed: 0,index,0
0,a,10
1,b,12
2,c,14
3,d,16
4,e,18


In [58]:
# ser_obj still has its original labels a-e
ser_obj

a    10
b    12
c    14
d    16
e    18
dtype: int64