# Excel报表

In [1]:
# 读取一月的交易记录
import pandas as pd
import numpy as np

# 展示内容设置为10行
pd.options.display.max_rows = 10
df = pd.read_excel('./data/sales_data/new/January.xlsx')
df.head()

Unnamed: 0,transaction_id,store,status,transaction_date,plan,contract_type,amount
0,abfbdd6d,Chicago,ACTIVE,2019-01-01,Silver,NEW,14.25
1,136a9997,San Francisco,ACTIVE,2019-01-01,Gold,NEW,19.35
2,c6688f32,San Francisco,ACTIVE,2019-01-01,Bronze,NEW,12.2
3,6ef349c1,Chicago,ACTIVE,2019-01-01,Gold,NEW,19.35
4,22066f29,San Francisco,ACTIVE,2019-01-01,Silver,NEW,14.25


In [14]:
from pathlib import Path

# 获取当前文件的目录
this_dir = Path(".").resolve()
# 从sales_data的所有子文件夹中读取Excel文件
parts = []
for path in (this_dir / "data/sales_data").rglob("*.xls*"):
    #print(f'Reading {path.name}')
    part = pd.read_excel(path, index_col="transaction_id")
    parts.append(part)
df = pd.concat(parts)
pivot = pd.pivot_table(df,
                       index='transaction_date',
                       columns="store",
                       values="amount",
                       aggfunc=sum)
# 按月采样，并赋予一个索引名称
summary = pivot.resample('M').sum()
summary.index.name = "Month"
summary.to_excel(this_dir / "sales_report_pandas.xlsx")

### read_excel函数和ExcelFile类

In [15]:
# 通过sheet_name、skiprows和usecols这些参数，告诉pandas关于我们想要读取的列的详细信息
df = pd.read_excel('./data/xl/stores.xls', sheet_name='2019', skiprows=1, usecols='B:F')
df.head()

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,MISSING
2,Chicago,4,Katelin,2020-01-31,
3,Boston,5,Georgiana,2017-04-01,True
4,Washington DC,3,Evan,NaT,False


In [16]:
# 可以通过执行info方法了解生成的DataFrame的数据类型
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Store      6 non-null      object        
 1   Employees  6 non-null      int64         
 2   Manager    6 non-null      object        
 3   Since      5 non-null      datetime64[ns]
 4   Flagship   5 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 368.0+ bytes


In [17]:
# Flagship应该是bool而不是object，要修正这一问题，可以编写一个转换函数
def fix_missing(x):
    return False if x in ["", "MISSING"] else x


df = pd.read_excel('./data/xl/stores.xls',
                   sheet_name='2019',
                   skiprows=1,
                   usecols='B:F',
                   converters={'Flagship': fix_missing})
df.head()

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,False
2,Chicago,4,Katelin,2020-01-31,False
3,Boston,5,Georgiana,2017-04-01,True
4,Washington DC,3,Evan,NaT,False


In [18]:
# 要读入所有的工作表，sheet_name=None
sheets = pd.read_excel('./data/xl/stores.xls',
                       sheet_name=['2019', '2020'],
                       usecols=['Store', 'Employees'],
                       skiprows=1)
sheets['2020'].head()

Unnamed: 0,Store,Employees
0,New York,11
1,San Francisco,10
2,Chicago,5
3,Boston,4
4,Washington DC,7


In [20]:
# 如果源文件的列没有标题，则设置参数header=None，并通过names参数设置对应的列名。注意sheet_name也接受工作表切片
df = pd.read_excel('./data/xl/stores.xls',
                   header=None,
                   skiprows=2,
                   skipfooter=3,
                   sheet_name=0,  # 第一个工作表
                   usecols='B:C, F',
                   names=['Branch', 'Employee_Count', 'Is_Flagship'])
df.head()

Unnamed: 0,Branch,Employee_Count,Is_Flagship
0,New York,10,False
1,San Francisco,12,MISSING
2,Chicago,4,


In [21]:
# 为了处理NaN，可以把na_values和keep_default_na结合起来。
# 直将含有MISSING的单元格解释为NaN，除此之外什么也不做：
df = pd.read_excel('./data/xl/stores.xls',
                   sheet_name='2019',
                   skiprows=1,
                   usecols='B,C,F',
                   skipfooter=2,
                   na_values='MISSING',
                   keep_default_na=False)
df.head()

Unnamed: 0,Store,Employees,Flagship
0,New York,10,False
1,San Francisco,12,
2,Chicago,4,
3,Boston,5,True


In [22]:
#  使用ExcelFile类来读取Excel文件
with pd.ExcelFile('./data/xl/stores.xls') as f:
    df1 = pd.read_excel(f, sheet_name='2019', skiprows=1, usecols='B:F', nrows=2)
    df2 = pd.read_excel(f, sheet_name='2020', skiprows=1, usecols='B:F', nrows=2)

In [23]:
df1

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,MISSING


In [24]:
df2

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,11,Sarah,2018-07-20,False
1,San Francisco,10,Neriah,2019-11-02,True


In [25]:
# 也可以通过ExcelFile防伪所有工作表的名称
stores = pd.ExcelFile('./data/xl/stores.xls')
stores.sheet_names

['2019', '2020', '2019-2020']

In [38]:
# 还可以通过URL读取Excel文件
url = "https://raw.githubusercontent.com/fzumstein/python-for-excel/1st-edition/xl/stores.xls"
pd.read_excel(url, skiprows=1, usecols='B:F', nrows=2)

Unnamed: 0,Store,Employees,Manager,Since,Flagship
0,New York,10,Sarah,2018-07-20,False
1,San Francisco,12,Neriah,2019-11-02,MISSING


### to_excel方法和ExcelWriter类

In [39]:
# 将DataFrame导出为Excel
import numpy as np
from datetime import datetime

data = [[datetime(2022, 5, 1, 22, 42), 2.222, 1, True],
        [datetime(2022, 5, 14, 22, 43), np.nan, 2, False],
        [datetime(2022, 5, 14, 22, 44), np.inf, True]]
df = pd.DataFrame(data, columns=['Dates', 'Floats', 'Integers', 'Booleans'])
df.index.name = 'index'
df

Unnamed: 0_level_0,Dates,Floats,Integers,Booleans
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2022-05-01 22:42:00,2.222,1,True
1,2022-05-14 22:43:00,,2,False
2,2022-05-14 22:44:00,inf,True,


In [41]:
df.to_excel('write_with_pandas.xlsx',
            sheet_name='Output',
            startrow=1,
            startcol=1,
            index=True,
            header=True,
            na_rep='<NA>',
            inf_rep='<INF>')

In [42]:
# 使用ExcelWriter将DataFrame导出为Excel
with pd.ExcelWriter('written_with_pandas2.xlsx') as writer:
    df.to_excel(writer, sheet_name='sheet1', startrow=1, startcol=1)
    df.to_excel(writer, sheet_name='sheet1', startrow=10, startcol=1)
    df.to_excel(writer, sheet_name='sheet2')