# pandas批量拆分与合并Excel
实例:
1. 将一个大excel等份拆分成多个excel
2. 将多个小excel合并成一个大excel并标记来源

In [1]:
work_dir = './data/合并目录/'
split_dir = './data/合并目录/splits/'

In [2]:
import os

if not os.path.exists(split_dir):
    os.mkdir(split_dir)

## 0. 读取exce

In [3]:
import pandas as pd

In [4]:
df_source = pd.read_excel('./data/movies/users.xlsx')

In [5]:
df_source.head()

Unnamed: 0,userId,age,gender,occupation,zip-code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
df_source.index

RangeIndex(start=0, stop=943, step=1)

In [7]:
df_source.shape  # 查看有几行几列

(943, 5)

# 一. 将一个大excel等份拆分成多个excel
1. 使用df.iloc方法, 将一个大DataFrame拆分成多个小DataFrame
2. 将使用dataFrame.to_excel()保存小excel


**1.计算拆分后的每个excel的行数**

In [8]:
# 将某一列做统计并且将统计结果转换为列表方式一
list(df_source['occupation'].value_counts().keys()[0:])

['student',
 'other',
 'educator',
 'administrator',
 'engineer',
 'programmer',
 'librarian',
 'writer',
 'executive',
 'scientist',
 'artist',
 'technician',
 'marketing',
 'entertainment',
 'healthcare',
 'retired',
 'lawyer',
 'salesman',
 'none',
 'homemaker',
 'doctor']

In [9]:
# 将某一列做统计并且将统计结果转换为列表方式二
list(df_source['occupation'].value_counts().index[0:])

['student',
 'other',
 'educator',
 'administrator',
 'engineer',
 'programmer',
 'librarian',
 'writer',
 'executive',
 'scientist',
 'artist',
 'technician',
 'marketing',
 'entertainment',
 'healthcare',
 'retired',
 'lawyer',
 'salesman',
 'none',
 'homemaker',
 'doctor']

In [10]:
# 这个大excel, 会拆封给这些人
user_names = ['xiao_shuai', 'xiao_wang', 'xiao_ming', 'xiao_lei', 'xiao_bo', 'xiao_hong']
# 每个人的任务数目
total_row_count = df_source.shape[0]
split_size = total_row_count / len(user_names)
if total_row_count % len(user_names) != 0:
    split_size += 1
int(split_size)

158

**2.拆分成多个DataFrame**

1. 使用循环来确定拆分数据

In [11]:
df_subs = []
for idx, user_name in enumerate(user_names):
    # iloc开始索引
    begin = int(idx * split_size)
    # iloc结束索引
    end = int(begin + split_size)
    # 实现df安装iloc拆分
    df_sub = df_source.iloc[begin: end]
    df_subs.append((idx, user_name, df_sub))

In [12]:
df_subs

[(0,
  'xiao_shuai',
       userId  age gender  occupation zip-code
  0         1   24      M  technician    85711
  1         2   53      F       other    94043
  2         3   23      M      writer    32067
  3         4   24      M  technician    43537
  4         5   33      F       other    15213
  ..      ...  ...    ...         ...      ...
  153     154   25      M     student    53703
  154     155   32      F       other    11217
  155     156   25      M    educator    08360
  156     157   57      M    engineer    70808
  157     158   50      M    educator    27606
  
  [158 rows x 5 columns]),
 (1,
  'xiao_wang',
       userId  age gender     occupation zip-code
  158     159   23      F        student    55346
  159     160   27      M     programmer    66215
  160     161   50      M         lawyer    55104
  161     162   25      M         artist    15610
  162     163   49      M  administrator    97212
  ..      ...  ...    ...            ...      ...
  311     312  

2. 使用pandas来代替循环确定拆分数据

In [13]:
def get_begin_end_ids(user_name):
    ids = user_names.index(user_name)
    begin = int(ids * split_size)
    end = int(begin + split_size)
    return begin, end, ids


m = list(map(get_begin_end_ids, user_names))
query_df = pd.DataFrame({'user_name': user_names,
                         'begin': list(map(lambda i: i[0], m)),
                         'end': list(map(lambda i: i[1], m)),
                         'ids': list(map(lambda i: i[1], m))})

In [14]:
query_df

Unnamed: 0,user_name,begin,end,ids
0,xiao_shuai,0,158,158
1,xiao_wang,158,316,316
2,xiao_ming,316,474,474
3,xiao_lei,474,632,632
4,xiao_bo,632,790,790
5,xiao_hong,790,948,948


In [15]:
% % time

def get_sub_df(user_name):
    condition = query_df['user_name'] == user_name
    condition_index = query_df.loc[condition].index[0]
    begin = query_df.loc[condition]['begin'][condition_index]
    end = query_df.loc[condition]['end'][condition_index]
    return df_source.loc[begin: end]

CPU times: total: 0 ns
Wall time: 0 ns


In [16]:
df_subs1 = list(map(get_sub_df, user_names))

**2.将拆分后的数据保存**

In [17]:
df_subs1

[     userId  age gender  occupation zip-code
 0         1   24      M  technician    85711
 1         2   53      F       other    94043
 2         3   23      M      writer    32067
 3         4   24      M  technician    43537
 4         5   33      F       other    15213
 ..      ...  ...    ...         ...      ...
 154     155   32      F       other    11217
 155     156   25      M    educator    08360
 156     157   57      M    engineer    70808
 157     158   50      M    educator    27606
 158     159   23      F     student    55346
 
 [159 rows x 5 columns],
      userId  age gender     occupation zip-code
 158     159   23      F        student    55346
 159     160   27      M     programmer    66215
 160     161   50      M         lawyer    55104
 161     162   25      M         artist    15610
 162     163   49      M  administrator    97212
 ..      ...  ...    ...            ...      ...
 312     313   41      M      marketing    60035
 313     314   20      F     

In [18]:
df_data = {'name': user_names, 'data': df_subs1}
d = pd.DataFrame(df_data)

In [19]:
d

Unnamed: 0,name,data
0,xiao_shuai,userId age gender occupation zip-code 0...
1,xiao_wang,userId age gender occupation zip-cod...
2,xiao_ming,userId age gender occupation zip-cod...
3,xiao_lei,userId age gender occupation zip-code 4...
4,xiao_bo,userId age gender occupation zip-cod...
5,xiao_hong,userId age gender occupation zip-cod...


In [20]:
d.apply(lambda s: s['data'].to_excel(split_dir + s['name'] + '.xlsx', index=False), axis=1)

0    None
1    None
2    None
3    None
4    None
5    None
dtype: object

# 二. 合并多个小excel到一个excel
1. 遍历文件夹， 的到所有的excel文件
2. 分别读取到DataFrame, 给每个df添加一列用于标记来源
3. 使用pd.concat进行批量合并
4. 将合并文件保存

**1.遍历文件夹， 的到所有的excel文件**

In [21]:
excel_source = os.listdir(split_dir)

def load_excel_to_data_frame(file_name:str):
    split_df = pd.read_excel(split_dir + file_name)
    print(file_name)
    split_df['user_name'] = file_name.split('.')[0]
    return split_df

df_list = list(map(load_excel_to_data_frame, excel_source))
df_list

xiao_bo.xlsx
xiao_hong.xlsx
xiao_lei.xlsx
xiao_ming.xlsx
xiao_shuai.xlsx
xiao_wang.xlsx


[     userId  age gender     occupation zip-code user_name
 0       633   35      M     programmer    55414   xiao_bo
 1       634   39      M       engineer    T8H1N   xiao_bo
 2       635   22      M          other    23237   xiao_bo
 3       636   47      M       educator    48043   xiao_bo
 4       637   30      M          other    74101   xiao_bo
 ..      ...  ...    ...            ...      ...       ...
 154     787   18      F        student    98620   xiao_bo
 155     788   51      M  administrator    05779   xiao_bo
 156     789   29      M          other    55420   xiao_bo
 157     790   27      M     technician    80913   xiao_bo
 158     791   31      M       educator    20064   xiao_bo
 
 [159 rows x 6 columns],
      userId  age gender     occupation zip-code  user_name
 0       791   31      M       educator    20064  xiao_hong
 1       792   40      M     programmer    12205  xiao_hong
 2       793   22      M        student    85281  xiao_hong
 3       794   32      M 

In [22]:
pd.concat(df_list).to_excel(work_dir + 'merge.xlsx', index=False)