# Load the data

In [171]:
import pandas as pd
import re
data=pd.read_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\CCP.xlsx")

# Replace Chinese numbers with Arabic numerals

In [172]:
# 主要针对年份便于正则匹配,十及十以上的不正确也不影响,同时去掉空白
repl_rule = {'零':0,'一':1,'二':2,'三':3,'四':4,'五':5,
             '六':6,'七':7,'八':8,'九':9,'十':0,'\s':''}

def repl_num(s,rule=repl_rule):
    sub = lambda s,kv:re.sub(kv[0],str(kv[1]),s)
    from functools import reduce
    new_s = reduce(sub,[s]+[*rule.items()])
    return new_s

data['Resume'] = pd.Series(map(repl_num,data['Resume']))

# Whether to hold the league position

In [173]:
# 编写正则表达式

#1. xxxx年至今,含有的[现任]信息优先于[任/历任/先后任]
regex1=r'\d{4}年.*至今.*[^曾]任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*' 

#2. '[任]...至今'优先级等价于[现任]
regex2=r'(?<!至|\-|—)\d{4}年.*[^曾]任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*[^年月日][至今|。|\.]{2}$' 

#3. xxxx年起,含有的[现任]信息优先于[任/历任/先后任]
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*(?<!曾|历|后)任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'   

#4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*[^曾]任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}' 

#5. 现任
regex5=r'现任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'
#6. 排除匹配到[现任]'不相关职务',历任'相关职务'的特例
regex5a=r'现任.*(?<!集|兵|\d)团(?!长).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*(?=历任|曾任|先后任)'

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

# 根据正则表达式提取信息
def extract_info(df,regex):
    info = []
    for i in df['Resume']:
        j = re.split('。|；|;|\.',i)
        list_info = []
        for ii in j:
            regx = regex.search(ii)
            if regx:
                if regex==rec2 or regex==rec4:
                    list_info.append(regx.group())
                elif regex==rec1 or regex==rec3:
                    search1 = re.search('现任',regx.group())
                    if search1:
                        search2=rec5.search(regx.group())
                        if search2:
                            search3=re.search('历任|曾任|先后任',search2.group())
                            if search3:
                                search4=rec5a.search(search2.group())
                                if search4:
                                    list_info.append(search4.group())
                            else:
                                list_info.append(search2.group())
                    else:
                        list_info.append(regx.group())     
                elif regex==rec5:
                    search5 = re.search('历任|曾任|先后任',regx.group())
                    if search5:
                        search5a = rec5a.search(regx.group())
                        if search5a:
                            list_info.append(search5a.group())
                    else:
                        list_info.append(regx.group())
        if list_info==[]:
            info.append('')
        else:
            info.append('。'.join(list_info))
    info_series = pd.Series(info,index=df.index)
    return info_series

def league_data(df):    
    s1 = extract_info(df,rec1)
    s2 = extract_info(df,rec2)
    s3 = extract_info(df,rec3)
    s4 = extract_info(df,rec4)
    s5 = extract_info(df,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'league_info'
    df = pd.concat([df,info_s],axis=1)
    df['CCYL'] = 1
    df['CCYL'][df['league_info']==''] = 0
    return df

league_df = league_data(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [174]:
print(league_df['CCYL'].value_counts())
league_df[league_df['CCYL']==1].to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\league_info.xlsx")

0    142190
1       220
Name: CCYL, dtype: int64


# Whether to hold the party positon

In [175]:
# 编写正则表达式

# 1. xxxx年至今,含有的[现任]信息优先于[任/历任/先后任]
regex1=r'\d{4}年.*至今.*[^曾]任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'

# 2. '[任]...至今'优先级等价于[现任]
regex2=r'(?<!至|\-|—)\d{4}年.*[^曾]任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*[^年月日][至今|。|\.]{2}$'  

# 3. xxxx年起,含有的[现任]信息优先于[任/历任/先后任]
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*(?<!曾|历|后)任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'  

# 4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*[^曾]任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}'  

# 5. 现任
regex5=r'现任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'
# 6. 排除匹配到[现任]'不相关职务',历任'相关职务'的特例
regex5a=r'现任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*(?=历任|曾任|先后任)' 

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

#调用用 extract_info()提取信息
def party_data(df):
    s1 = extract_info(df,rec1)
    s2 = extract_info(df,rec2)
    s3 = extract_info(df,rec3)
    s4 = extract_info(df,rec4)
    s5 = extract_info(df,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'party_info'
    df = pd.concat([df,info_s],axis=1)
    df['Party'] = 1
    df['Party'][df['party_info']==''] = 0
    return df

party_df = party_data(league_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [176]:
print(party_df['Party'].value_counts())
party_df[party_df['Party']==1].to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\party_info.xlsx")

0    96094
1    46316
Name: Party, dtype: int64


# Whether to serve as a director

In [177]:
# 编写正则表达式

# 1. xxxx年至今,含有的[现任]信息优先于[任/历任/先后任]
regex1=r'\d{4}年.*至今.*[^曾]任.*董事.*'

# 2. '[任]...至今'优先级等价于[现任]
regex2=r'(?<!至|\-|—)\d{4}年.*[^曾]任.*董事.*[至今|。|\.]{2}$'  

# 3. xxxx年起,含有的[现任]信息优先于[任/历任/先后任]
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*(?<!曾|历|后)任.*董事.*'  

# 4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*[^曾]任.*董事.{,5}'  

# 5. 现任
regex5=r'现任.*董事.*'
# 6. 排除匹配到[现任]'不相关职务',历任'相关职务'的特例
regex5a=r'现任.*董事.*(?=历任|曾任|先后任)' 

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

# 在有党职的前提下提取董事信息
def party_dir(df):
    df_party = df[df['Party']==1]
    s1 = extract_info(df_party,rec1)
    s2 = extract_info(df_party,rec2)
    s3 = extract_info(df_party,rec3)
    s4 = extract_info(df_party,rec4)
    s5 = extract_info(df_party,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'dir_info'
    df_party = pd.concat([df_party,info_s],axis=1)
    df_party['Party_Dir'] = 1
    df_party['Party_Dir'][df_party['dir_info']==''] = 0
    return df_party

dir_df = party_dir(party_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [178]:
print(dir_df['Party_Dir'].value_counts())
dir_df[dir_df['Party_Dir']==1].to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\dir_info.xlsx")

1    27775
0    18541
Name: Party_Dir, dtype: int64


# Whether to serve as supervisor

In [179]:
# 编写正则表达式

# 1. xxxx年至今,含有的[现任]信息优先于[任/历任/先后任]
regex1=r'\d{4}年.*至今.*[^曾]任.*监事.*'

# 2. '[任]...至今'优先级等价于[现任]
regex2=r'(?<!至|\-|—)\d{4}年.*[^曾]任.*监事.*[至今|。|\.]{2}$'  

# 3. xxxx年起,含有的[现任]信息优先于[任/历任/先后任]
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*(?<!曾|历|后)任.*监事.*'  

# 4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*[^曾]任.*监事.{,5}'  

# 5. 现任
regex5=r'现任.*监事.*'
# 6. 排除匹配到[现任]'不相关职务',历任'相关职务'的特例
regex5a=r'现任.*监事.*(?=历任|曾任|先后任)' 

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

# 在有党职的前提下提取监事信息
def party_sup(df):
    df_party = df[df['Party']==1]
    s1 = extract_info(df_party,rec1)
    s2 = extract_info(df_party,rec2)
    s3 = extract_info(df_party,rec3)
    s4 = extract_info(df_party,rec4)
    s5 = extract_info(df_party,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'sup_info'
    df_party = pd.concat([df_party,info_s],axis=1)
    df_party['Party_Sup'] = 1
    df_party['Party_Sup'][df_party['sup_info']==''] = 0
    return df_party

sup_df = party_sup(party_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [180]:
print(sup_df['Party_Sup'].value_counts())
sup_df[sup_df['Party_Sup']==1].to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\sup_info.xlsx")

0    38652
1     7664
Name: Party_Sup, dtype: int64


# Whether to serve as top manager

In [181]:
# 编写正则表达式

# 1. xxxx年至今,含有的[现任]信息优先于[任/历任/先后任]
regex1=r'\d{4}年.*至今.*[^曾]任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).*'

# 2. '[任]...至今'优先级等价于[现任]
regex2=r'(?<!至|\-|—)\d{4}年.*[^曾]任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).*[至今|。|\.]{2}$'  

# 3. xxxx年起,含有的[现任]信息优先于[任/历任/先后任]
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*(?<!曾|历|后)任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).*'  

# 4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*[^曾]任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).{,5}'  

# 5. 现任
regex5=r'现任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).*'
# 6. 排除匹配到[现任]'不相关职务',历任'相关职务'的特例
regex5a=r'现任.*((?:董事会秘书)|(?:总裁)|(?:总经理)|(?:总监)).*(?=历任|曾任|先后任)' 

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

# 在有党职的前提下提取高管信息
def party_mgr(df):
    df_party = df[df['Party']==1]
    s1 = extract_info(df_party,rec1)
    s2 = extract_info(df_party,rec2)
    s3 = extract_info(df_party,rec3)
    s4 = extract_info(df_party,rec4)
    s5 = extract_info(df_party,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'mgr_info'
    df_party = pd.concat([df_party,info_s],axis=1)
    df_party['Party_Mgr'] = 1
    df_party['Party_Mgr'][df_party['mgr_info']==''] = 0
    return df_party

mgr_df = party_mgr(party_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [182]:
print(mgr_df['Party_Mgr'].value_counts())
mgr_df[mgr_df['Party_Mgr']==1].to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\dir_info.xlsx")

1    23290
0    23026
Name: Party_Mgr, dtype: int64


# Whether to hold concurrent posts in the party

In [183]:
# 编写正则表达式

# 1. xxxx年至今
regex1=r'\d{4}年.*至今.*兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}'

# 2. 兼任...至今
regex2=r'(?<!至|\-|—)\d{4}年.*兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*[^年月日][至今|。|\.]{2}$'  

# 3. xxxx年起,兼任...
regex3=r'(?<!至|\-|—)\d{4}年.*[^至\-—].*起.*兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}'  

# 4. '[至/——]报告期末'优先级等价于[现任]
regex4=r'(\d{4}.{1,6})?[至\-—]?报告期末.*兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}' 

# 5. 兼任...
regex5=r'兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*'
# 6. 排除匹配到[兼任]'不相关职务',历任'相关职务'的特例
regex5a=r'兼任.*((?:党)|(?:党群)|(?:政协)|(?:人大)).*(?<!秘)[书记|委员|科长|经理|主任]{1,2}.*(?=历任|曾任|先后任)' 

# 编译正则表达式
rec1 = re.compile(regex1)
rec2 = re.compile(regex2)
rec3 = re.compile(regex3)
rec4 = re.compile(regex4)
rec5 = re.compile(regex5)
rec5a = re.compile(regex5a)

# 根据正则表达式提取兼任信息
def extract_dual_info(df,regex):
    info = []
    for i in df['Resume']:
        j = re.split('。|；|;|\.',i)
        list_info = []
        for ii in j:
            regx = regex.search(ii)
            if regx and regex!=rec5:
                list_info.append(regx.group())
            elif regx and regex==rec5:
                search = re.search('历任|曾任|先后任',regx.group())
                if search:
                    search_=rec5a.search(regx.group())
                    if search_:
                        list_info.append(search_.group())
                else:
                    list_info.append(regx.group())
        if list_info==[]:
            info.append('')
        else:
            info.append('。'.join(list_info))
    info_series = pd.Series(info,index=df.index)
    return info_series

# 根据公司名字字段是否出现在兼任信息中以判断是否属于同一集团——可能不准确
def party_dual(df):
    df_party = df[df['Party']==1]
    s1 = extract_dual_info(df_party,rec1)
    s2 = extract_dual_info(df_party,rec2)
    s3 = extract_dual_info(df_party,rec3)
    s4 = extract_dual_info(df_party,rec4)
    s5 = extract_dual_info(df_party,rec5)
    info_s = s1+s2+s3+s4+s5
    info_s.name = 'party_dual_info'
    df_party = pd.concat([df_party,info_s],axis=1)
    may_dual=[]
    for i in df_party.index:
        #去掉公司名字中的无意义字段
        firm_info = re.sub('集团|股份|公司','',df_party['Coname'][i])
        # 根据公司名构造正则表达式——公司名中至少出现两个字
        firm_re = '['+firm_info+']' + '{2,6}'
        # 编译正则表达式并匹配
        re_c = re.compile(firm_re)
        match = re_c.search(str(df_party['party_dual_info'][i]))
        if match:
            may_dual.append(1)
        else:
            may_dual.append(0)
    dual_s = pd.Series(may_dual,index=df_party.index,name='party_dual_may')
    df_party = pd.concat([df_party,dual_s],axis=1)
    return df_party

party_dual_df = party_dual(party_df)

In [184]:
print(party_dual_df['party_dual_may'].value_counts())
party_dual_df[party_dual_df['party_dual_info']!=''].to_excel(
    "C:\\Users\\win10\\Desktop\\宝哥数据\\party_dual_info.xlsx")

0    45122
1     1194
Name: party_dual_may, dtype: int64


# Merge the data

In [185]:
from functools import reduce
fill_value = lambda x,y: x.combine_first(y)
final_data = reduce(fill_value,[party_df,dir_df,sup_df,mgr_df,party_dual_df])

# Sort and export data

In [186]:
columns = ['stkcd','Coname','Resume','year','league_info','party_info'] + \
          ['dir_info','sup_info','mgr_info','party_dual_info'] + \
         data.columns.tolist()[4:9] + ['party_dual_may'] + data.columns.tolist()[9:]
final_data = final_data.reindex(columns=columns)
final_data.to_excel("C:\\Users\\win10\\Desktop\\宝哥数据\\mayfinal.xlsx",)