In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas import Series, DataFrame

# `1)Series`

### `1.1)Series的创建`

`Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False) `

`One-dimensional ndarray with axis labels (including time series).`

In [7]:
data = np.array(list(range(10)))
# 由ndarray创建的series是引用而不是副本，对series元素的改变也会改变ndarray的元素
# series存放的数据必须是1维的
data_series = Series(data, index=(list("pythonjava")))
display(data_series)

p    0
y    1
t    2
h    3
o    4
n    5
j    6
a    7
v    8
a    9
dtype: int64

### `1.2)Series的索引与切片`
#### `1.2.1)显式索引`

In [12]:
print(data_series.loc['y'])

1


#### `1.2.2)隐式索引`

In [15]:
print(data_series.iloc[1])

1


#### `1.2.3)切片`

In [18]:
result_series = data_series.loc['p':'n']
print(result_series)
result_series = data_series.iloc[0:6]
print(result_series)

p    0
y    1
t    2
h    3
o    4
n    5
dtype: int64
p    0
y    1
t    2
h    3
o    4
n    5
dtype: int64


### `1.3)Series的基本概念`

In [20]:
print(data_series.shape, data_series.size, data_series.index, data_series.values)

(10,) 10 Index(['p', 'y', 't', 'h', 'o', 'n', 'j', 'a', 'v', 'a'], dtype='object') [0 1 2 3 4 5 6 7 8 9]


#### `1.3.1)data_series.head(n=5)`

`Return the first n rows.`

In [21]:
print(data_series.head())

p    0
y    1
t    2
h    3
o    4
dtype: int64


#### `1.3.2)data_series.tail(n=5)`

`Return the last n rows.`

In [22]:
print(data_series.tail())

n    5
j    6
a    7
v    8
a    9
dtype: int64


#### `1.3.3)数据缺失`

In [28]:
data_series = Series([1, 23, None, np.nan], index=(list("java")))
print(data_series)
print(type(None), type(np.nan))

j     1.0
a    23.0
v     NaN
a     NaN
dtype: float64
<class 'NoneType'> <class 'float'>


`data_series.isnull()`

`Return a boolean same-sized object indicating if the values are NA.`

In [32]:
isnull_series = data_series.isnull()
print(isnull_series)

j    False
a    False
v     True
a     True
dtype: bool


`data_series.notnull()`

`Return a boolean same-sized object indicating if the values are not NA.`

In [34]:
notnull_series = data_series.notnull()
print(notnull_series)
# 剔除空数据
result_series = data_series[notnull_series]
print(result_series)

j     True
a     True
v    False
a    False
dtype: bool
j     1.0
a    23.0
dtype: float64


#### `1.3.4)name`

In [35]:
data_series.name = 'java'
print(data_series.name)

java


### `1.4)Series的运算`

In [40]:
print(data_series)
# fill_value: Fill missing (NaN) values with this value.
print(data_series.add(10, fill_value=0))

j     1.0
a    23.0
v     NaN
a     NaN
Name: java, dtype: float64
j    11.0
a    33.0
v    10.0
a    10.0
Name: java, dtype: float64


In [45]:
s1 = Series([5, 6, 7, 8, 9], index=[0, 1, 2, 3, 4])
s2 = Series([5, 6, 7, 8, 9], index=[2, 3, 4, 5, 6])

In [47]:
# 索引相同的进行相加，索引不同的置为NaN
print(s1 + s2)

0     NaN
1     NaN
2    12.0
3    14.0
4    16.0
5     NaN
6     NaN
dtype: float64


In [49]:
# 使用add方法保留索引不同的值
print(s1.add(s2, fill_value=0))

0     5.0
1     6.0
2    12.0
3    14.0
4    16.0
5     8.0
6     9.0
dtype: float64


# `2)DataFrame`

### `2.1)DataFrame的创建`
`DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`
  
`Two-dimensional size-mutable, potentially heterogeneous tabular data`

In [53]:
df = DataFrame({'height':[170, 175, 168, 172], 'age':np.random.randint(18, 25, size=4), 'sex':['f', 'm', 'm', 'f']}
               , index=list('ABCD'), columns=['age', 'sex', 'height'])
print(df)

   age sex  height
A   24   f     170
B   23   m     175
C   20   m     168
D   19   f     172


### `2.2)DataFrame的基本信息`

In [55]:
print(df.shape, df.index, df.columns, df.values, sep='\n')

(4, 3)
Index(['A', 'B', 'C', 'D'], dtype='object')
Index(['age', 'sex', 'height'], dtype='object')
[[24 'f' 170]
 [23 'm' 175]
 [20 'm' 168]
 [19 'f' 172]]


### `2.3)DataFrame的索引`
#### `2.3.1)列索引`

In [78]:
df_age = df['age']
print(df_age, type(df_age), sep='\n')
# 列是属性
df_age = df.age
print(df_age, type(df_age), sep='\n')

A    24
B    23
C    20
D    19
Name: age, dtype: int64
<class 'pandas.core.series.Series'>
A    24
B    23
C    20
D    19
Name: age, dtype: int64
<class 'pandas.core.series.Series'>


#### `2.3.2)行索引`

In [71]:
print(df.loc['A'], type(df.loc['A']), sep='\n')

age        24
sex         f
height    170
Name: A, dtype: object
<class 'pandas.core.series.Series'>


In [73]:
print(df.loc[['A']], type(df.loc[['A']]), sep='\n')

   age sex  height
A   24   f     170
<class 'pandas.core.frame.DataFrame'>


In [84]:
print(df.loc[['A', 'C']], type(df.loc[['A', 'C']]), sep='\n')

   age sex  height
A   24   f     170
C   20   m     168
<class 'pandas.core.frame.DataFrame'>


In [80]:
# 只有行切片，没有列切片
# 左闭右闭
print(df.loc['A':'C'])

   age sex  height
A   24   f     170
B   23   m     175
C   20   m     168


In [83]:
# 左闭右开
print(df.iloc[0:2])

   age sex  height
A   24   f     170
B   23   m     175


In [90]:
print(df['height']['C'])

168


In [85]:
print(df.loc['C']['height'])

168


### `2.4)DataFrame的运算`
#### `2.4.1)DaraFrame之间的运算`

In [91]:
score_1 = DataFrame(np.random.randint(60,150, size=(4, 4)), index=['张三', '李四', '王五', '赵六']
                    , columns=['语文', '数学', '英语', 'python'])
print(score_1)

     语文   数学   英语  python
张三  139  123  111     129
李四  118   89   80     112
王五  102  126   67     128
赵六  146  111   95      65


In [97]:
score_2 = DataFrame(np.random.randint(60,150, size=(5, 3)), index=['张三', '李四', '王五', '赵六', '钱七']
                    , columns=['数学', '英语', 'python'])
print(score_2)

     数学  英语  python
张三   77  91     138
李四  136  77      75
王五  126  74     123
赵六   97  68     126
钱七   89  72      79


In [98]:
print(score_1 + score_2)

    python     数学     英语  语文
张三   267.0  200.0  202.0 NaN
李四   187.0  225.0  157.0 NaN
王五   251.0  252.0  141.0 NaN
赵六   191.0  208.0  163.0 NaN
钱七     NaN    NaN    NaN NaN


In [99]:
print(score_1.add(score_2, fill_value=0))

    python     数学     英语     语文
张三   267.0  200.0  202.0  139.0
李四   187.0  225.0  157.0  118.0
王五   251.0  252.0  141.0  102.0
赵六   191.0  208.0  163.0  146.0
钱七    79.0   89.0   72.0    NaN


#### `2.4.2)Series与DaraFrame之间的运算`

In [104]:
score_1 = DataFrame(np.random.randint(60,150, size=(4, 4)), index=['张三', '李四', '王五', '赵六']
                    , columns=['语文', '数学', '英语', 'python'])
print(score_1)

    语文   数学   英语  python
张三  82  148  114     107
李四  72  128  125      65
王五  87  144  120      62
赵六  77   84  107      85


In [105]:
score_2= DataFrame(np.random.randint(60,150, size=(4, 4)), index=['张三', '李四', '王五', '赵六']
                    , columns=['语文', '数学', '英语', 'python'])
print(score_2)

     语文   数学   英语  python
张三  140  106  104     149
李四  103   70  127     121
王五   72   65   93      72
赵六  137  115  128      61


In [112]:
zhangsan_1 = score_1.loc['张三']
print(zhangsan_1)
# 对所有列生效
print(score_2.add(zhangsan_1, axis='columns'))

语文         82
数学        148
英语        114
python    107
Name: 张三, dtype: int64
     语文   数学   英语  python
张三  222  254  218     256
李四  185  218  241     228
王五  154  213  207     179
赵六  219  263  242     168


# `3)利用pandas进行数据清洗`

In [130]:
score = DataFrame({'age':[13, 24, 32, 45], 'height':[170, 168, 172, 167]}, 
                  index=list("ABCD"), columns=['age', 'height', 'sex'])
print(score)

   age  height  sex
A   13     170  NaN
B   24     168  NaN
C   32     172  NaN
D   45     167  NaN


In [132]:
score.sex['B':'C'] = 'f'

   age  height  sex
A   13     170  NaN
B   24     168    f
C   32     172    f
D   45     167  NaN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [133]:
print(score)

   age  height  sex
A   13     170  NaN
B   24     168    f
C   32     172    f
D   45     167  NaN


#### `3.1)返回含有空数据的行`

In [141]:
score_isnull = score.isnull().any(axis=1)

In [142]:
print(score[score_isnull])

   age  height  sex
A   13     170  NaN
D   45     167  NaN


#### `3.2)返回非空数据的行`

In [148]:
score_notnull = score.notnull().all(axis=1)
print(score_notnull)

A    False
B     True
C     True
D    False
dtype: bool


In [149]:
print(score[score_notnull])

   age  height sex
B   24     168   f
C   32     172   f


#### `3.3)过滤空数据`

In [150]:
print(score)

   age  height  sex
A   13     170  NaN
B   24     168    f
C   32     172    f
D   45     167  NaN


In [151]:
print(score.dropna())

   age  height sex
B   24     168   f
C   32     172   f


In [152]:
print(score.dropna(how='all'))

   age  height  sex
A   13     170  NaN
B   24     168    f
C   32     172    f
D   45     167  NaN


#### `3.4)填充空数据`
`score.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs)`

`Fill NA/NaN values using the specified method`

In [153]:
print(score.fillna(value=0))

   age  height sex
A   13     170   0
B   24     168   f
C   32     172   f
D   45     167   0


In [156]:
print(score.fillna(method='backfill'))

   age  height  sex
A   13     170    f
B   24     168    f
C   32     172    f
D   45     167  NaN


In [159]:
print(score.fillna(method='ffill', axis=1))

  age height  sex
A  13    170  170
B  24    168    f
C  32    172    f
D  45    167  167


In [163]:
score.fillna(method='backfill', inplace=True)
print(score)

   age  height  sex
A   13     170    f
B   24     168    f
C   32     172    f
D   45     167  NaN


In [164]:
score.fillna(method='ffill', inplace=True)
print(score)

   age  height sex
A   13     170   f
B   24     168   f
C   32     172   f
D   45     167   f


# `4)pandas多层索引`

### `4.1)多层行索引`

In [3]:
df1 = DataFrame(data=np.random.randint(60, 150, size=(6, 3)), 
               index=pd.MultiIndex.from_product([['A', 'B', 'C'], ['期中', '期末']]), 
               columns=['pytohn', 'java', 'database'])
display(df1)

Unnamed: 0,Unnamed: 1,pytohn,java,database
A,期中,97,148,84
A,期末,73,110,146
B,期中,61,105,129
B,期末,148,145,60
C,期中,64,89,146
C,期末,67,108,68


### `4.2)多层列索引`

In [6]:
df2 = DataFrame(data=np.random.randint(60, 150, size=(3, 6)), 
               index=['A', 'B', 'C'], 
               columns=pd.MultiIndex.from_product([['pytohn', 'java', 'database'], ['期中', '期末']]))
display(df2)

Unnamed: 0_level_0,pytohn,pytohn,java,java,database,database
Unnamed: 0_level_1,期中,期末,期中,期末,期中,期末
A,138,111,121,75,77,68
B,90,121,99,117,102,87
C,79,108,87,86,93,141


# `5)pandas数据合并`

In [2]:
# 定义dataframe生成函数
def make_df(index, columns):
    data = {c:[c + str(i) for i in index] for c in columns}
    return DataFrame(data, index=index, columns=columns)

In [3]:
make_df([1, 2], ['A', 'B'])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


### `5.1)pd.concat`
`pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)`

`Concatenate pandas objects along a particular axis with optional set logic`

In [4]:
df1 = make_df([1, 2], ['A', 'B'])
df2 = make_df([3, 4], ['A', 'B'])
pd.concat((df1, df2), axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [5]:
# index可重复
df1 = make_df([1, 2], ['A', 'B'])
df2 = make_df([2, 3], ['A', 'B'])
pd.concat((df1, df2), axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
2,A2,B2
3,A3,B3


### `5.2)忽略索引、重新排列索引`

In [7]:
pd.concat((df1, df2), ignore_index=True)

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A2,B2
3,A3,B3


### `5.3)使用多层索引`

In [12]:
df1 = make_df(['a', 'b'], ['X', 'Y'])
df2 = make_df(['A', 'B'], ['X', 'Y'])
# keys使合并后的数据更加清晰
pd.concat((df1, df2), keys=['small', 'big'], axis=0)

Unnamed: 0,Unnamed: 1,X,Y
small,a,Xa,Ya
small,b,Xb,Yb
big,A,XA,YA
big,B,XB,YB
