# 创建`dataframe`

## 创建空的DataFrame

In [5]:
import pandas as pd
pd.DataFrame(columns=['numbers'])

Unnamed: 0,numbers


## 创建非空的DataFrame

In [6]:
import pandas as pd
import numpy as np

classes = ["101", "202", "303"]
df = pd.DataFrame(
    {
        "class":[classes[x % len(classes)] for x in np.random.randint(0, len(classes), 5)],
        "math":np.random.randint(0, 100, 5),
        "physics":np.random.randint(0, 100, 5)  
    })
df

Unnamed: 0,class,math,physics
0,101,48,40
1,101,75,96
2,303,67,3
3,202,44,5
4,303,68,24


## 指定`index`

In [7]:
import numpy as np
import pandas as pd

# 从列表创建，指定行、列索引
df = pd.DataFrame([10, 20, 30, 40],
                  columns=['numbers'],
                  index=['a', 'b', 'c', 'd'])
print(df)
print(df.columns[0])

   numbers
a       10
b       20
c       30
d       40
numbers


## 从array创建

In [8]:
import numpy as np
import pandas as pd

dates = pd.date_range('20231201', periods=6)
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

Unnamed: 0,A,B,C,D
2023-12-01,-0.702564,0.41385,0.777789,1.425069
2023-12-02,-3.407933,1.362174,-0.565449,0.049359
2023-12-03,-0.273912,0.615177,0.696786,-0.746629
2023-12-04,1.538439,0.062885,-1.329661,1.183733
2023-12-05,-1.015053,0.056822,1.7001,-1.811353
2023-12-06,0.481599,-0.721072,-0.435208,-0.287081


## 从字典创建
使用字典创建时默认以key做为列索引，但有个要求，值不能是标量值（），比如下面这样会出错。这是因为字典的值作为列时需要根据它来生成DataFrame的index，如果你传入的是整数，那么无法生成index。

In [26]:
# 下面这样创建会出错
#pd.DataFrame(
#    {
#        'A':24492,
#        'B':24493
#    }
#)

# 指定index可以解决问题
pd.DataFrame(
    {
        'A':24492,
        'B':24493
    },
    index = [0]
)


Unnamed: 0,A,B
0,24492,24493


下面这样也没有问题：

In [28]:
pd.DataFrame(
    {
        'A':[24492],
        'B':24493
    }
)



Unnamed: 0,A,B
0,24492,24493


将key作为index

如果我要将字典的键作为index, 值作为column呢？那么传入`items()`这样就将key和value作为DataFrame的列，然后重设index就可以了。


In [31]:
my_dict = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}
#pd.DataFrame(my_dict)
pd.DataFrame(my_dict.items())
#df=pd.DataFrame(dict.items(), columns=['Case', 'First Fail Build'])
#df.set_index('Case', inplace=True)


Unnamed: 0,0,1
0,a,"[1, 2, 3]"
1,b,"[4, 5, 6]"
2,c,"[7, 8, 9]"


## 从列表创建

- 创建时可以用`index`指定行标签，`columns`来指定列标签，不指定默认为0。
- 当传入一个单维列表的，默认将该列表作为DataFrame中的第一列
- 当传入一个二维列表时(`np.random.standard_normal((9, 4)).round(6)`)，那么每行对应DataFrame中的一行。




In [34]:
import pandas as pd
students = ['Lisha', 'Shelly', 'Greay', 'Leo', 'Marry']
pd.DataFrame(students)

Unnamed: 0,0
0,Lisha
1,Shelly
2,Greay
3,Leo
4,Marry


In [35]:
import pandas as pd
data = {'Name':['Lisha', 'Shelly', 'Greay', 'Leo', 'Marry'],
        'Age':[18, 21, 29, 18, 23]}
pd.DataFrame(data, 
             columns=['Age', 'Name'],
             index=['one', 'two', 'three', 'four', 'five'])

Unnamed: 0,Age,Name
one,18,Lisha
two,21,Shelly
three,29,Greay
four,18,Leo
five,23,Marry


- 嵌套列表创建

这里的嵌套列表指的是两层列表，因为它刚好是一个二维数组的结构。所以在创建时，每个外层列表元素都作为单独的一行。

参考：

- [Nested List to Pandas Dataframe with headers](https://stackoverflow.com/questions/32857544/nested-list-to-pandas-dataframe-with-headers)

In [37]:
l = [['a', 'b', 'c', 'd'],
     [1, 2, 3, 4],
     ['i', 'j', 'k', 'l']]
pd.DataFrame(l)

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,1,2,3,4
2,i,j,k,l


## 从元组创建

元组和列表的形式是类似的，只不过元组无法修改元素但是列表可以。

In [38]:
import pandas as pd

data = [('Facebook', 750, True),
        ('Alphabet', 1100, True),
        ('Amazon', 1700, True),
        ('Apple', 2100, False),
        ('Microsoft', 1750, False)]

pd.DataFrame(data, columns=['Name', 'M-cap', 'Internet Companies'])


Unnamed: 0,Name,M-cap,Internet Companies
0,Facebook,750,True
1,Alphabet,1100,True
2,Amazon,1700,True
3,Apple,2100,False
4,Microsoft,1750,False


# 添加新行

两种方法：`df.loc[i] = `和`append`

参考：

- [Create a Pandas Dataframe by appending one row](https://stackoverflow.com/questions/10715965/create-a-pandas-dataframe-by-appending-one-row-at-a-time)

In [10]:
import numpy as np
import pandas as pd

dates = pd.date_range('20231201', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

df.loc[len(df)] = [50, 5.5, 'loc', 'loc']
print(df)


                   A         B         C         D
2023-12-01  1.329725  0.667079 -2.328407  0.230727
2023-12-02  0.668376  0.611502 -0.136222  0.445717
2023-12-03  0.943432 -1.369053  0.291451  2.090460
2023-12-04  0.626290  0.485262 -0.872164 -0.723104
2023-12-05  2.247605  1.130618  0.248310 -0.663663
2023-12-06  0.059392 -0.159276 -0.268303  1.888678
                             A         B         C         D
2023-12-01 00:00:00   1.329725  0.667079 -2.328407  0.230727
2023-12-02 00:00:00   0.668376  0.611502 -0.136222  0.445717
2023-12-03 00:00:00   0.943432 -1.369053  0.291451   2.09046
2023-12-04 00:00:00   0.626290  0.485262 -0.872164 -0.723104
2023-12-05 00:00:00   2.247605  1.130618   0.24831 -0.663663
2023-12-06 00:00:00   0.059392 -0.159276 -0.268303  1.888678
6                    50.000000  5.500000       loc       loc


## 添加新列

In [None]:
df['names2'] = pd.DataFrame(['Yv', 'Gu', 'Fe', 'Fr'],
                            index=['d', 'a', 'b', 'c'])
print(df)


   numbers names2
a       10     Gu
b       20     Fe
c       30     Fr
d       40     Yv
4       89    NaN


# 转换为dict

- `orient="records"`

In [3]:
df.to_dict(orient="records")

[{'class': '101', 'math': 93, 'physics': 32},
 {'class': '303', 'math': 84, 'physics': 81},
 {'class': '202', 'math': 40, 'physics': 84},
 {'class': '202', 'math': 89, 'physics': 51},
 {'class': '303', 'math': 4, 'physics': 42},
 {'class': '303', 'math': 38, 'physics': 33},
 {'class': '101', 'math': 63, 'physics': 62},
 {'class': '303', 'math': 4, 'physics': 70},
 {'class': '101', 'math': 55, 'physics': 50},
 {'class': '303', 'math': 54, 'physics': 94},
 {'class': '303', 'math': 45, 'physics': 7},
 {'class': '303', 'math': 82, 'physics': 83},
 {'class': '303', 'math': 26, 'physics': 51},
 {'class': '202', 'math': 1, 'physics': 81},
 {'class': '101', 'math': 4, 'physics': 94}]

# 查看数据

- `df.dtypes`
- `df.info()`
- `df.describe()`

In [6]:
print("--------df.info()--------")
print(df.info())
print("--------df.describe()--------")
print(df.describe(include='int32'))

--------df.info()--------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   class    15 non-null     object
 1   math     15 non-null     int32 
 2   physics  15 non-null     int32 
dtypes: int32(2), object(1)
memory usage: 372.0+ bytes
None
--------df.describe()--------
            math    physics
count  15.000000  15.000000
mean   43.133333  39.000000
std    31.475312  29.014775
min     5.000000   2.000000
25%    11.500000  22.000000
50%    45.000000  33.000000
75%    72.500000  50.500000
max    98.000000  99.000000


## 鸟瞰数据

- df.sample()
- df.head()
- df.tail()

In [9]:
df.sample(6)

Unnamed: 0,class,math,physics
1,101,98,99
13,202,38,85
3,202,54,3
8,101,45,44
6,202,47,33
9,202,14,45


## 取消科学计数法

In [13]:
import pandas as pd
import numpy as np

df = pd.DataFrame(
    {
        'A' : np.random.standard_normal(5),
        'B' : np.random.standard_normal(5),
        'C' : np.random.randint(0, 100, 5),
    }
)
#pd.set_option('display.float_format', lambda x: '%.2f' % x)
#pd.set_option('display.float_format', "{:.2f}".format)
pd.set_option('display.precision', 2)

print(df)

      A     B   C
0 -0.48 -0.23  14
1  1.13 -1.28  37
2  0.16 -1.59   4
3 -1.62 -0.23  47
4  0.43 -0.60  77
