# DataFrame
### DataFrame的每一列是一个Series，或者说多个Series拼起来就是DataFrame

## 1. 创建dataframe

In [None]:
import numpy as np
import pandas as pd

# 1. Build a DataFrame from Series objects: each Series becomes one column.
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([6, 7, 8, 9, 10])
series_by_column = {'第1列': s1, '第2列': s2}
df = pd.DataFrame(series_by_column)
print(type(df))
print(type(df['第1列']))  # a single column of a DataFrame is a Series

# 2. Build a DataFrame from a dict that maps column name -> list of values.
records = {
    'id': [1, 2, 3, 4, 5],
    'name': ['tom', 'jack', 'alice', 'bob', 'allen'],
    'age': [15, 17, 20, 26, 30],
    'score': [60.5, 80, 30.6, 70, 83.5],
}
row_labels = [1, 2, 3, 4, 5]  # explicit index instead of the default 0..n-1
column_order = ['id', 'name', 'age', 'score']  # fixes the order of the columns
df = pd.DataFrame(records, index=row_labels, columns=column_order)
print(df)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   id   name  age  score
1   1    tom   15   60.5
2   2   jack   17   80.0
3   3  alice   20   30.6
4   4    bob   26   70.0
5   5  allen   30   83.5


## 2. 属性

In [None]:
# Labels and raw contents of the frame.
row_index = df.index
print(row_index)
col_index = df.columns
print(col_index)
cell_values = df.values  # 2-D array; each inner element is one row of the frame
print(cell_values)

# Number of dimensions, per-column dtypes, (rows, columns) and total cell count.
for summary in (df.ndim, df.dtypes, df.shape, df.size):
    print(summary)

transposed = df.T  # after transposing, index, columns and values change as well
print(transposed)
separator = '-' * 50
print(separator)

# Indexing and slicing
# 1. Select a row
row_by_label = df.loc[4]  # by index label
print(row_by_label)
row_by_position = df.iloc[3]  # by position; the same row here since labels are 1..5
print(row_by_position)
# 2. Select a column
column_by_label = df.loc[:, 'name']
print(column_by_label)
column_by_position = df.iloc[:, 1]
print(column_by_position)
# 3. Slices work the same way
# 4. Select a single value
value_by_label = df.at[3, 'score']
print(value_by_label)
value_by_position = df.iat[2, 3]  # equivalent to the label lookup above
print(value_by_position)
# Both loc and at can be used to fetch a single element

Index([1, 2, 3, 4, 5], dtype='int64')
Index(['id', 'name', 'age', 'score'], dtype='object')
[[1 'tom' 15 60.5]
 [2 'jack' 17 80.0]
 [3 'alice' 20 30.6]
 [4 'bob' 26 70.0]
 [5 'allen' 30 83.5]]
2
id         int64
name      object
age        int64
score    float64
dtype: object
(5, 4)
20
          1     2      3     4      5
id        1     2      3     4      5
name    tom  jack  alice   bob  allen
age      15    17     20    26     30
score  60.5  80.0   30.6  70.0   83.5
--------------------------------------------------
id          4
name      bob
age        26
score    70.0
Name: 4, dtype: object
id          4
name      bob
age        26
score    70.0
Name: 4, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
30.6
30.6


## 3. 访问数据的方式

In [40]:
# 1. Select a single column
name_series = df['name']  # bracket access with one label returns a Series
print(name_series)
print(type(name_series))
# Alternatively use attribute access with the column name
print(df.name)  # convenience: no brackets and no quotes needed

name_frame = df[['name']]  # a one-element list yields a single-column DataFrame
print(name_frame)
print(type(name_frame))

# 2. Select several columns
wanted_columns = ['name', 'score']
print(df[wanted_columns])

# 3. Peek at part of the data
print(df.head(n=2))
print(df.tail(n=2))

# 4. Filter rows with a boolean mask
mask_via_attributes = (df.score > 70) & (df.age < 20)
print(df[mask_via_attributes])
mask_via_brackets = (df['score'] > 70) & (df['age'] < 20)  # bracket access instead of attributes
print(df[mask_via_brackets])

# 5. Random sampling
print(df.sample(n=3))

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
    name
1    tom
2   jack
3  alice
4    bob
5  allen
<class 'pandas.core.frame.DataFrame'>
    name  score
1    tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5
   id  name  age  score
1   1   tom   15   60.5
2   2  jack   17   80.0
   id   name  age  score
4   4    bob   26   70.0
5   5  allen   30   83.5
   id  name  age  score
2   2  jack   17   80.0
   id  name  age  score
2   2  jack   17   80.0
   id   name  age  score
2   2   jack   17   80.0
4   4    bob   26   70.0
5   5  allen   30   83.5


## 4. 常用方法

In [67]:
print(df.head(n=2))
print(df.tail(n=1))

lookup_values = ['jack', 20]
print(df.isin(lookup_values))  # element-wise: is each cell one of the listed values
print(df.isna())

print(df.sum())  # every column is summed on its own; strings end up concatenated
score = df['score']  # usually a single column is picked first and then aggregated
print(score.sum())
print(score.max())
print(score.mean())
print(score.median())
print(score.mode())  # recap: when no value repeats, every value is returned
print(score.std())
print(score.var())
print(score.quantile(0.25))
print(df.describe())  # computed per column, numeric columns only; string columns are skipped
print(df.count())  # non-missing values per column
print(df.value_counts())  # counts whole records; rows must match in every column to be grouped
print(df.drop_duplicates())  # remove duplicate rows

print(df.duplicated())  # boolean flag per row: is it a duplicate
print(df.duplicated(subset=['score']))  # duplicates can also be judged on a subset of columns
print(df.duplicated(subset=['score', 'age']))  # several columns work too

print(df.sample(n=2))
print(df.replace(to_replace=15, value=30))  # turn every 15 into 30

print(df.cumsum())  # strings are concatenated here as well
print(df.cummax())  # running maximum: the largest value seen up to the current row
# cummax works down the columns by default; axis=1 can be set by hand,
# but then every column has to be numeric.
print(df.sort_index(ascending=False))
sort_keys = ['score', 'age']
print(df.sort_values(by=sort_keys, ascending=[True, False]))  # multi-column sort, one direction per key

print(df.nlargest(n=2, columns=['score', 'age']))  # ranked by several columns
print(df.nsmallest(n=2, columns=['score', 'age']))

   id  name  age  score
1   1   tom   15   60.5
2   2  jack   17   80.0
   id   name  age  score
5   5  allen   30   83.5
      id   name    age  score
1  False  False  False  False
2  False   True  False  False
3  False  False   True  False
4  False  False  False  False
5  False  False  False  False
      id   name    age  score
1  False  False  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False  False
5  False  False  False  False
id                         15
name     tomjackaliceboballen
age                       108
score                   324.6
dtype: object
324.6
83.5
64.92
70.0
0    30.6
1    60.5
2    70.0
3    80.0
4    83.5
Name: score, dtype: float64
21.188605428390044
448.957
60.5
             id        age      score
count  5.000000   5.000000   5.000000
mean   3.000000  21.600000  64.920000
std    1.581139   6.268971  21.188605
min    1.000000  15.000000  30.600000
25%    2.000000  17.000000  60.500000
50%    3.000000  20.0000