# Load data

In [1]:
import json
import pandas as pd

## The file with a large amount of data

Reading the data directly with the pandas library generates an error, so the file is parsed line by line with `json` instead.

In [2]:
# Parse the export one JSON document per line. The context manager closes the
# file handle as soon as parsing finishes, and the explicit encoding keeps the
# non-ASCII text (e.g. accented titles) independent of the platform default.
# NOTE(review): assumes the export is UTF-8 encoded - confirm against the source.
records = []
with open('./data/job7r4A1.json', 'r', encoding='utf-8') as json_lines:
    for line in json_lines:
        if line.strip():  # a blank line is not a JSON document; skip it
            records.append(json.loads(line))

len(records)

183407

In [3]:
records[0]

{'isbn': '',
 'ttlfull': ['Die Feist von Kienberg',
  'eine Wasenmeisterfamilie im Ancien Régime zwischen Ehrbarkeit und Delinquenz'],
 'ttlpart': ['Die Feist von Kienberg',
  'eine Wasenmeisterfamilie im Ancien Régime zwischen Ehrbarkeit und Delinquenz'],
 'person': ['Schluchter', 'André', 'André Schluchter'],
 'corporate': [],
 'pubyear': '',
 'decade': '12',
 'century': '12',
 'exactDate': '',
 'edition': '',
 'part': [],
 'pages': [],
 'volumes': 'S. 102-114',
 'pubinit': '',
 'pubword': [],
 'scale': '',
 'coordinate': [],
 'doi': '',
 'ismn': '',
 'musicid': '',
 'format': 'BK020000'}

In [4]:
# Build the working frame from the parsed records (one row per record).
df = pd.DataFrame(records)

# Let pandas display every column of the frame instead of eliding the middle ones.
pd.set_option('display.max_columns', len(df.columns))

df.head()

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes
0,12,[],[],12,,,,BK020000,,,,[],[],"[Schluchter, André, André Schluchter]",,[],,,"[Die Feist von Kienberg, eine Wasenmeisterfami...","[Die Feist von Kienberg, eine Wasenmeisterfami...",S. 102-114
1,123456,[],[],123456,,,,BK020000,,,,[],[],"[Naegeli, Werner, von Werner Naegeli]",,[],,,[Reimereien],[Reimereien],43 S.
2,123,[],[],123,,,,BK020000,,,,[],[],"[Kelly, M.V., Geniesse, J.B., M.V. Kelly et J....",,[],,,[Efficax antidotum ad matrimonia mixta praecav...,[Efficax antidotum ad matrimonia mixta praecav...,75 p.
3,1234567,[],[],1234567,,,,BK020000,,,,[],[],"[Wegelin, Walter, ]",,[],,,"[Probleme der Inflationsbekämpfung, Referat vo...","[Probleme der Inflationsbekämpfung, Referat vo...",24 S.
4,1,[],[],1,,,,BK020000,,,,[],[],"[Oberlin, Urs, Urs Oberlin ; translated from t...",,[],,,[[Poems]],[[Poems]],p. 14-15


# Data analysis

## General functions

In [5]:
def information_filled (d, column_name, filled, empty):
    """Print a fill-rate summary for one column plus one sample record per group.

    Parameters
    ----------
    d : DataFrame whose rows are looked up with ``d.loc``.
    column_name : str, the name shown in the summary line.
    filled : index labels of the records counted as filled.
    empty : index labels of the records counted as missing.

    Returns
    -------
    None. All results are printed.
    """
    n_filled, n_empty = len(filled), len(empty)
    n_total = n_filled + n_empty
    # With no records at all there is no meaningful share; report 0.0 rather
    # than dividing by zero.
    share_filled = 100*n_filled/n_total if n_total else 0.0

    print('Number of records with filled {:s} {:d}, with missing {:s} {:d} => {:.1f}%'.format(
        column_name, n_filled, column_name, n_empty, share_filled
    ))

    if n_empty > 0:
        print('\nEMPTY - index', empty[0], '\n')
        print(d.loc[empty[0]])
    else:
        print('\nEMPTY - None')

    # Mirror the EMPTY branch: a column without any filled record has no
    # sample row to show, so indexing filled[0] must be guarded.
    if n_filled > 0:
        print('\nFILLED - index', filled[0], '\n')
        print(d.loc[filled[0]])
    else:
        print('\nFILLED - None')

    return

In [6]:
def find_empty_in_column (dataFrame, column_name):
    """Split the row labels of ``dataFrame`` by whether ``column_name`` is filled.

    The two predicates come from ``comparison_logic(column_name)``; each is
    applied element-wise to the column. A summary is printed through
    ``information_filled`` before returning.

    Returns
    -------
    (idx_filled, idx_empty) : the index of the rows selected by the "filled"
    predicate and the index of the rows selected by the "empty" predicate.
    """
    is_filled, is_empty = comparison_logic(column_name)
    column = dataFrame[column_name]

    filled_mask = column.apply(is_filled)
    empty_mask = column.apply(is_empty)

    idx_filled = dataFrame[filled_mask].index
    idx_empty = dataFrame[empty_mask].index

    information_filled(dataFrame, column_name, idx_filled, idx_empty)

    return idx_filled, idx_empty

In [7]:
# Column groups, one per representation of an "empty" value

# Columns holding lists; empty is []
list_columns = ['coordinate', 'corporate', 'pages', 'part', 'pubword',
                'ttlfull', 'ttlpart']
# Columns holding lists of strings; empty is ['']
array_of_strings_columns = ['person']
# Columns holding plain strings; empty is ''
strings_columns = ['century', 'decade', 'doi', 'edition', 'exactDate', 'format',
                   'isbn', 'ismn', 'musicid', 'pubinit', 'pubyear', 'scale',
                   'volumes']

def comparison_logic (column_name):
    """Return the (filled, empty) predicates for the values of ``column_name``.

    The column is looked up in ``list_columns``, ``strings_columns`` and
    ``array_of_strings_columns`` (in that order) to pick the value that
    represents "empty" for it.

    Parameters
    ----------
    column_name : str, name of the column to analyse.

    Returns
    -------
    (compare, anti_compare) : two one-argument callables. ``compare(value)``
    is True when ``value`` differs from the empty value, ``anti_compare(value)``
    is True when it equals the empty value.

    Raises
    ------
    ValueError
        If ``column_name`` is in none of the three column groups.
    """
    if column_name in list_columns:
        # Lists
        empty_value = []
    elif column_name in strings_columns:
        # Strings
        empty_value = ''
    elif column_name in array_of_strings_columns:
        # Array of Strings
        empty_value = ['']
    else:
        # Fail with a clear message rather than falling through with no
        # predicates defined.
        raise ValueError(
            'No comparison logic defined for column {!r}'.format(column_name)
        )

    def compare(col):
        return col != empty_value

    def anti_compare(col):
        return col == empty_value

    return compare, anti_compare

## Columns

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183407 entries, 0 to 183406
Data columns (total 21 columns):
century       183407 non-null object
coordinate    183407 non-null object
corporate     183407 non-null object
decade        183407 non-null object
doi           183407 non-null object
edition       183407 non-null object
exactDate     183407 non-null object
format        183407 non-null object
isbn          183407 non-null object
ismn          183407 non-null object
musicid       183407 non-null object
pages         183407 non-null object
part          183407 non-null object
person        183407 non-null object
pubinit       183407 non-null object
pubword       183407 non-null object
pubyear       183407 non-null object
scale         183407 non-null object
ttlfull       183407 non-null object
ttlpart       183407 non-null object
volumes       183407 non-null object
dtypes: object(21)
memory usage: 29.4+ MB


In [9]:
cols = df.columns
cols

Index(['century', 'coordinate', 'corporate', 'decade', 'doi', 'edition',
       'exactDate', 'format', 'isbn', 'ismn', 'musicid', 'pages', 'part',
       'person', 'pubinit', 'pubword', 'pubyear', 'scale', 'ttlfull',
       'ttlpart', 'volumes'],
      dtype='object')

### Century

In [10]:
df.century.unique()

array(['12', '123456', '123', '1234567', '1', '', '12345', 'u', '1234',
       '123456789', '12345678',
       '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu',
       '2345678', '23', '23456789', '23456', '234', '2345', '234567', '2',
       '23456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'],
      dtype=object)

In [11]:
idx_century_filled, idx_century_empty = find_empty_in_column(df, 'century')

Number of records with filled century 166478, with missing century 16929 => 90.8%

EMPTY - index 6 

century                                                        
coordinate                                                   []
corporate              [Christkatholische Kirchgemeinde (Bern)]
decade                                                         
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                                                       []
pub

=> Open question (recorded in OneNote).

### Coordinate

In [12]:
idx_coordinate_filled, idx_coordinate_empty = find_empty_in_column(df, 'coordinate')

Number of records with filled coordinate 730, with missing coordinate 182677 => 0.4%

EMPTY - index 0 

century                                                      12
coordinate                                                   []
corporate                                                    []
decade                                                       12
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                    [Schluchter, André, André Schluchter]


For $\texttt{df.volumes}$, see [Volumes](#volumes).

In [13]:
df.coordinate.loc[idx_coordinate_filled].str[0].str[0].unique()

array(['E', 'W'], dtype=object)

In [14]:
df.coordinate.loc[idx_coordinate_filled].str[1].str[0].unique()

array(['N', 'E', 'S'], dtype=object)

In [15]:
df.coordinate.loc[idx_coordinate_filled].str[2].str[0].unique()

array([nan, 'N', 'W', 'E'], dtype=object)

In [16]:
df.coordinate.loc[idx_coordinate_filled].str[3].str[0].unique()

array([nan, 'N'], dtype=object)

In [17]:
df.coordinate.loc[idx_coordinate_filled].str[4].str[0].unique()

array([nan, 'N'], dtype=object)

In [18]:
df.coordinate.loc[idx_coordinate_filled].str[5].str[0].unique()

array([nan, 'N'], dtype=object)

In [19]:
# Records with coordinates whose sixth coordinate entry starts with 'N'.
coordinate_rows = df.loc[idx_coordinate_filled]
coordinate_rows[coordinate_rows.coordinate.str[5].str[0] == 'N']

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes
104201,234567,"[W1244400, E1722600, W1603200, N0492400, N0712...",[Reise Know-How Verlag Peter Rump],234567,,,23456789.0,MP010300,,,,[],[],[],,[],23456789.0,50000012500002000000200000,[USA],[USA],Karten
136954,1,"[E0083620, E0083624, E0083624, N0462105, N0462...","[Schweiz, Bundesamt für Landestopografie]",1,,Aggiornamento completo 1989,,MP010300,3-302-01292-6,,,"[f. 1292, ed. 1989, 1292,1991, 1292,1991, 1292...","[f. 1292, ed. 1989, 1292,1991, 1292,1991, 1292...",[],,[],,250002500025000,[Maggia],[Maggia],1 Karte


### Corporate<a id='corporate'></a>

In [20]:
df.corporate.head()

0    []
1    []
2    []
3    []
4    []
Name: corporate, dtype: object

In [21]:
idx_corporate_filled, idx_corporate_empty = find_empty_in_column(df, 'corporate')

Number of records with filled corporate 33166, with missing corporate 150241 => 18.1%

EMPTY - index 0 

century                                                      12
coordinate                                                   []
corporate                                                    []
decade                                                       12
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                    [Schluchter, André, André Schluchter]

For $\texttt{df.person}$, see [Person](#person).

### Decade

In [22]:
df.decade.unique()

array(['12', '123456', '123', '1234567', '1', '', '12345', 'u', '1234',
       '123456789', '12345678',
       '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu',
       '2345678', '23', '23456789', '23456', '234', '2345', '234567', '2',
       '23456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'],
      dtype=object)

In [23]:
df[df.century != df.decade]

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,musicid,pages,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes


### Doi

In [24]:
df.doi[0]

''

In [25]:
idx_doi_filled, idx_doi_empty = find_empty_in_column (df, 'doi')

Number of records with filled doi 19490, with missing doi 163917 => 10.6%

EMPTY - index 0 

century                                                      12
coordinate                                                   []
corporate                                                    []
decade                                                       12
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                    [Schluchter, André, André Schluchter]
pubinit    

In [26]:
df.doi.unique()

array(['', '9790201326702', 'M500173793', ...,
       '10.5167/uzh-16927810.1080/03003930.2019.1584557',
       '10.5167/uzh-169338', '10.1093/eurheartj/ehz068'], dtype=object)

### Edition

In [27]:
df.edition[0]

''

In [28]:
idx_edition_filled, idx_edition_empty = find_empty_in_column (df, 'edition')

Number of records with filled edition 25352, with missing edition 158055 => 13.8%

EMPTY - index 0 

century                                                      12
coordinate                                                   []
corporate                                                    []
decade                                                       12
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                    [Schluchter, André, André Schluchter]
pub

In [29]:
df.edition.unique()

array(['', '2. Aufl', '2.umgearb.Aufl', ..., '1. - 5. Tausend',
       '3rd ed. 2018', 'First Savas Beatie edition'], dtype=object)

### Exact Date

In [30]:
df.exactDate[0]

''

In [31]:
idx_exactDate_filled, idx_exactDate_empty = find_empty_in_column (df, 'exactDate')

Number of records with filled exactDate 31010, with missing exactDate 152397 => 16.9%

EMPTY - index 0 

century                                                      12
coordinate                                                   []
corporate                                                    []
decade                                                       12
doi                                                            
edition                                                        
exactDate                                                      
format                                                 BK020000
isbn                                                           
ismn                                                           
musicid                                                        
pages                                                        []
part                                                         []
person                    [Schluchter, André, André Schluchter]

In [32]:
df.exactDate.unique()

array(['', 'u', '12345', '123456789', '12345678', '1234', '1234567', '1',
       '123456', '123', '2345678', '23456789',
       '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu',
       '23456', '12', '234567', '23', '2345', '2', '234',
       '23456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'],
      dtype=object)

### Format

In [33]:
df.format.unique()

array(['BK020000', 'BK020800', 'BK020300', 'BK020400', 'MU040100',
       'MU010100', 'VM010000', 'MP010300', 'BK020053', 'BK010000',
       'CR030653', 'CR030300', 'BK020200', 'MU030100', 'BK020700',
       'BK020353', 'MU010000', 'CL010000', 'VM010300', 'MU010100MU010000',
       'MU010000MU010100', 'MU010200MU010000', 'VM010400',
       'BK020000BK020400', 'MU040500', 'MU010200', 'MU010200MU010100',
       'BK020000BK020200', 'BK020800BK020000', 'VM030100VM030000',
       'BK020200BK020000', 'BK020400BK020000', 'BK020347',
       'BK020300BK020000', 'VM010200', 'BK020000BK020800',
       'MP010700MP010300', 'BK020000BK020300', 'VM030100',
       'BK020500BK020000', 'BK020047', 'CF010000', 'VM030000', 'BK020100',
       'BK020100BK020000', 'MP010300MP010300', 'BK020000BK020700',
       'BK020000BK020100', 'CR030600CR030300', 'CR030600',
       'MP010100MP010300', 'CF010100', 'MU010100MU010200', 'BK010053',
       'MU030053', 'BK030000', 'VM020400', 'CL010053', 'VM020000',
       'CR0

In [34]:
len(df.format.unique())

185

In [35]:
idx_format_filled, idx_format_empty = find_empty_in_column (df, 'format')

Number of records with filled format 179688, with missing format 3719 => 98.0%

EMPTY - index 25506 

century                                                23456789
coordinate                                                   []
corporate                                                    []
decade                                                 23456789
doi                                    10.3929/ethz-b-000283465
edition                                                        
exactDate                                                      
format                                                         
isbn                                                           
ismn                                   10.3929/ethz-b-000283465
musicid                                                        
pages                                                        []
part                                                         []
person        [Hager, Pascal Alexander, [Pascal Alexander Ha...
pu

In [36]:
df.format.str[:2].unique()

array(['BK', 'MU', 'VM', 'MP', 'CR', 'CL', 'CF', ''], dtype=object)

In [37]:
# Keep the first two characters of the format code as a separate column.
df['format_frst2'] = df['format'].str.slice(stop=2)
df.loc[idx_coordinate_filled, ['volumes', 'format_frst2']].head()

Unnamed: 0,volumes,format_frst2
1801,1 Karte,MP
1804,1 Karte,MP
1829,1 Karte,MP
1846,1 Karte,MP
1849,1 Karte,MP


In [38]:
df['format_frst2'].loc[idx_coordinate_filled].unique()

array(['MP', 'BK'], dtype=object)

In [39]:
# Records that have coordinates although their format prefix is 'BK'.
volumes_by_prefix = df.loc[idx_coordinate_filled, ['volumes', 'format_frst2']]
volumes_by_prefix[volumes_by_prefix['format_frst2'] == 'BK']

Unnamed: 0,volumes,format_frst2
102432,9 Blätter,BK


In [40]:
# Volumes of the records whose format prefix is 'CL'.
df.loc[df['format_frst2'] == 'CL', 'volumes'].head()

1955                1 Mappe
15982    1 article en ligne
16276                      
16668                 35 p.
16669                 37 p.
Name: volumes, dtype: object

### ISBN

In [41]:
df.isbn[0]

''

In [42]:
idx_isbn_filled, idx_isbn_empty = find_empty_in_column (df, 'isbn')

Number of records with filled isbn 80688, with missing isbn 102719 => 44.0%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, André,

### ISMN

In [43]:
df.ismn[0]

''

In [44]:
idx_ismn_filled, idx_ismn_empty = find_empty_in_column (df, 'ismn')

Number of records with filled ismn 19490, with missing ismn 163917 => 10.6%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, André,

### MusicID

In [45]:
df.musicid[0]

''

In [46]:
idx_musicid_filled, idx_musicid_empty = find_empty_in_column (df, 'musicid')

Number of records with filled musicid 13502, with missing musicid 169905 => 7.4%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, A

### Pages

In [47]:
df.pages[0]

[]

In [48]:
idx_pages_filled, idx_pages_empty = find_empty_in_column (df, 'pages')

Number of records with filled pages 43116, with missing pages 140291 => 23.5%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, Andr

In [49]:
df['pages'].loc[idx_pages_filled]

964                                                 [1]
967                                                 [1]
968                                                 [3]
969                                                 [5]
997                                                 [2]
1004                                                [2]
1005                                                [4]
1780                                              [4,1]
1786                                              [4,2]
1790                                         [[Band] 1]
1791                                 [Streichorchester]
1795                                             [2004]
1800                                          [1. Band]
1803                                             [[54]]
1808                                [Volume 2, Reprint]
1809                                        [volume 15]
1812       [37. Jahrgang, Heft 1+2 (2019), Seiten 8-17]
1814                                        [vol

### Part

In [50]:
df.part[0]

[]

In [51]:
idx_part_filled, idx_part_empty = find_empty_in_column (df, 'part')

Number of records with filled part 43116, with missing part 140291 => 23.5%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, André,

In [52]:
df[df.part != df.pages]

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,...,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes,format_frst2


### Person<a id='person'></a>

In [53]:
df.person.head()

0                [Schluchter, André, André Schluchter]
1                [Naegeli, Werner, von Werner Naegeli]
2    [Kelly, M.V., Geniesse, J.B., M.V. Kelly et J....
3                                  [Wegelin, Walter, ]
4    [Oberlin, Urs, Urs Oberlin ; translated from t...
Name: person, dtype: object

In [54]:
idx_person_filled, idx_person_empty = find_empty_in_column(df, 'person')

Number of records with filled person 164691, with missing person 18716 => 89.8%

EMPTY - index 6 

century                                                          
coordinate                                                     []
corporate                [Christkatholische Kirchgemeinde (Bern)]
decade                                                           
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                                     

For $\texttt{df.corporate}$, see [Corporate](#corporate).

In [55]:
# Cross-tabulate the fill status of corporate against person: each list holds
# the row labels that fall into one of the four filled/empty combinations.
corporate_filled = set(idx_corporate_filled)
corporate_empty = set(idx_corporate_empty)
person_filled = set(idx_person_filled)
person_empty = set(idx_person_empty)

cor_f_per_f = list(corporate_filled & person_filled)
cor_f_per_e = list(corporate_filled & person_empty)
cor_e_per_f = list(corporate_empty & person_filled)
cor_e_per_e = list(corporate_empty & person_empty)

# Report each combination as an absolute count and as a share of all records.
n_records = len(df)
for template, members in [
    ('corporate filled and person filled {:d} => {:.1f}%', cor_f_per_f),
    ('corporate filled and person empty {:d} => {:.1f}%', cor_f_per_e),
    ('corporate empty and person filled {:d} => {:.1f}%', cor_e_per_f),
    ('corporate empty and person empty {:d} => {:.1f}%', cor_e_per_e),
]:
    print(template.format(len(members), 100*len(members)/n_records))

corporate filled and person filled 28895 => 15.8%
corporate filled and person empty 4271 => 2.3%
corporate empty and person filled 135796 => 74.0%
corporate empty and person empty 14445 => 7.9%


In [56]:
df.loc[cor_e_per_e[0]]

century                                                    234567
coordinate                                                     []
corporate                                                      []
decade                                                     234567
doi                                                              
edition                                                          
exactDate       23456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_...
format                                                   BK020053
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                                                         []
pubinit                                              AML Editions
pubword   

### Pubinit

In [57]:
df.pubinit[0]

''

In [58]:
idx_pubinit_filled, idx_pubinit_empty = find_empty_in_column(df, 'pubinit')

Number of records with filled pubinit 61462, with missing pubinit 121945 => 33.5%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, 

In [59]:
df.pubinit.unique()

array(['', 'Hippokrates-Verlag', 'Herder', ...,
       'Consultants Bureau.Nauka/Interperiodica', 'en vente chez Paludan',
       'AA-Verlag für Pädagogik'], dtype=object)

In [60]:
df.pubinit.loc[idx_pubinit_filled].head(10)

1976       Hippokrates-Verlag
1977                   Herder
1981          Müller & Schade
1984                    Heyne
1986    Verbandsdruckerei AG.
1991                   Bitter
1994      Deutsche Grammophon
1998           Harmonia Mundi
1999              Schultheiss
2001       Breitkopf & Härtel
Name: pubinit, dtype: object

### Pubword

In [61]:
df.pubword[0]

[]

In [62]:
idx_pubword_filled, idx_pubword_empty = find_empty_in_column(df, 'pubword')

Number of records with filled pubword 61462, with missing pubword 121945 => 33.5%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, 

In [63]:
# Cross-tabulate the fill status of pubinit against pubword: each list holds
# the row labels that fall into one of the four filled/empty combinations.
init_f_word_f = list(set(idx_pubinit_filled) & set(idx_pubword_filled))
init_f_word_e = list(set(idx_pubinit_filled) & set(idx_pubword_empty))
init_e_word_f = list(set(idx_pubinit_empty) & set(idx_pubword_filled))
init_e_word_e = list(set(idx_pubinit_empty) & set(idx_pubword_empty))

# The labels name the columns actually compared here (pubinit / pubword).
print('pubinit filled and pubword filled {:d} => {:.1f}%'.format(
      len(init_f_word_f), 100*len(init_f_word_f)/len(df))
     )
print('pubinit filled and pubword empty {:d} => {:.1f}%'.format(
      len(init_f_word_e), 100*len(init_f_word_e)/len(df))
     )
print('pubinit empty and pubword filled {:d} => {:.1f}%'.format(
      len(init_e_word_f), 100*len(init_e_word_f)/len(df))
     )
print('pubinit empty and pubword empty {:d} => {:.1f}%'.format(
      len(init_e_word_e), 100*len(init_e_word_e)/len(df))
     )

corporate filled and person filled 61462 => 33.5%
corporate filled and person empty 0 => 0.0%
corporate empty and person filled 0 => 0.0%
corporate empty and person empty 121945 => 66.5%


In [64]:
# First `pubword` element of the records whose `pubinit` differs from it;
# records without a first element are dropped before taking the head.
(
    df.loc[df['pubinit'] != df['pubword'].str[0], 'pubword']
    .str[0]
    .dropna()
    .head()
)

2131                                            Universal
2194    Bibliothèque cantonale et universitaire Lausan...
2281                                         Radio France
2502                                         Silva Screen
2524                                           Caduff, A.
Name: pubword, dtype: object

In [65]:
# Inspect `pubinit` and `pubword` side by side for the record labelled 2281.
df.loc[2281, ['pubinit', 'pubword']]

pubinit        Radio FranceErato Disques
pubword    [Radio France, Erato Disques]
Name: 2281, dtype: object

### Pubyear

In [66]:
# Peek at the `pubyear` value of the record labelled 0.
df.loc[0, 'pubyear']

''

In [67]:
# Split the row labels by whether `pubyear` is filled or empty, using the
# notebook's find_empty_in_column helper (defined outside this excerpt; per
# the cell output it also prints the counts and a sample record).
idx_pubyear_filled, idx_pubyear_empty = find_empty_in_column(df, 'pubyear')

Number of records with filled pubyear 31010, with missing pubyear 152397 => 16.9%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, 

In [68]:
# Distinct `pubyear` values among the records where the field is filled.
df.loc[idx_pubyear_filled, 'pubyear'].unique()

array(['u', '12345', '123456789', '12345678', '1234', '1234567', '1',
       '123456', '123', '2345678', '23456789',
       '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu',
       '23456', '12', '234567', '23', '2345', '2', '234',
       '23456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'],
      dtype=object)

In [69]:
# Rows in which `pubyear` is not equal to `exactDate`.
df.loc[df['pubyear'] != df['exactDate']]

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,...,part,person,pubinit,pubword,pubyear,scale,ttlfull,ttlpart,volumes,format_frst2


### Scale

In [70]:
# Peek at the `scale` value of the record labelled 0.
df.loc[0, 'scale']

''

In [71]:
# Split the row labels by whether `scale` is filled or empty, using the
# notebook's find_empty_in_column helper (defined outside this excerpt; per
# the cell output it also prints the counts and a sample record).
idx_scale_filled, idx_scale_empty = find_empty_in_column(df, 'scale')

Number of records with filled scale 793, with missing scale 182614 => 0.4%

EMPTY - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [Schluchter, André, 

In [72]:
# All distinct `scale` values, in order of first appearance.
df['scale'].unique()

array(['', '400000', '3000000', '50000', '75000', '300000', '800000',
       '10000', '5650', '1:400,000', '700000', '100000', '25000',
       '275000', '500000500000', '301000', '15000', '16667',
       'Echelle : 1:50 000', '1000000', '250000', '150000',
       'Verschiedene Massstäbe', 'Echelle [ca. 1:440 000]', '5000050000',
       '[Echelle non déterminée]', '200000', '118000', '1000010000',
       '12500', 'Echelle 1:50.000', '5000', '500', 'Echelles diverses',
       'Echelle 1:50 000', '1400000', '1600000', '550000', '1:50 000',
       '1:60 000', '9117', '2000000', '20000100000', '1156015625', '8100',
       '15700', '17400', '810011600', '16666', '500000', '70000',
       'Echelle: 1:25.000', '1:25 000', '125000', '46000',
       'Echelle 1:25.000 ; projection conforme cylindrique à axe oblique',
       'Echelle 1:25.000', '300000700000', '850000', '2750', '1250000',
       '1700000', '750000', '11000000', '17500', '250000250000',
       '24000000', '3500000', '2500000', '240

### Ttlfull<a id='ttlfull'></a>

In [73]:
# Peek at the `ttlfull` value of the record labelled 0.
df.loc[0, 'ttlfull']

['Die Feist von Kienberg',
 'eine Wasenmeisterfamilie im Ancien Régime zwischen Ehrbarkeit und Delinquenz']

In [74]:
# Split the row labels by whether `ttlfull` is filled or empty, using the
# notebook's find_empty_in_column helper (defined outside this excerpt; per
# the cell output it also prints the counts and a sample record).
idx_ttlfull_filled, idx_ttlfull_empty = find_empty_in_column(df, 'ttlfull')

Number of records with filled ttlfull 183407, with missing ttlfull 0 => 100.0%

EMPTY - None

FILLED - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [

### Ttlpart

In [75]:
# Peek at the `ttlpart` value of the record labelled 0.
df.loc[0, 'ttlpart']

['Die Feist von Kienberg',
 'eine Wasenmeisterfamilie im Ancien Régime zwischen Ehrbarkeit und Delinquenz']

In [76]:
# Split the row labels by whether `ttlpart` is filled or empty, using the
# notebook's find_empty_in_column helper (defined outside this excerpt; per
# the cell output it also prints the counts and a sample record).
idx_ttlpart_filled, idx_ttlpart_empty = find_empty_in_column(df, 'ttlpart')

Number of records with filled ttlpart 183407, with missing ttlpart 0 => 100.0%

EMPTY - None

FILLED - index 0 

century                                                        12
coordinate                                                     []
corporate                                                      []
decade                                                         12
doi                                                              
edition                                                          
exactDate                                                        
format                                                   BK020000
isbn                                                             
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person                      [

In [77]:
# Show both title columns for the first records in which they differ.
df.loc[df['ttlfull'] != df['ttlpart'], ['ttlfull', 'ttlpart']].head()

Unnamed: 0,ttlfull,ttlpart
38,"[25 Jahre Micafil, 1918-1943 : Jubiläumsschrif...","[25 Jahre Micafil, 1918-1943 : Jubiläumsschrift]"
73,"[Katechismus der katholischen Religion, Kathol...",[Katechismus der katholischen Religion]
113,[Locorum ex iure Romano anteiustiniano ab ince...,[Locorum ex iure Romano anteiustiniano ab ince...
138,"[Die Stationen des heiligen Kreuzwegs, Fasten-...","[Die Stationen des heiligen Kreuzwegs, Fasten-..."
152,"[Cinquante-deux prones sur le décalogue, une t...","[Cinquante-deux prones sur le décalogue, une t..."


In [78]:
# Compare the element counts of the two title lists in a vectorized way
# instead of looking up each Series value row by row in a Python loop, then
# print the label of every record whose `ttlfull` has fewer elements than its
# `ttlpart`. Iterating over df.index also removes the implicit assumption that
# the labels are exactly 0..len(df)-1.
ttlfull_len = df['ttlfull'].str.len()
ttlpart_len = df['ttlpart'].str.len()
for i in df.index[ttlfull_len < ttlpart_len]:
    print(i)

[$\texttt{Ttlfull}$](#ttlfull) always has at least as many elements as $\texttt{Ttlpart}$ (the check above prints nothing).

### Volumes<a id='volumes'></a>

In [79]:
# Distinct `volumes` values among the records selected by idx_coordinate_filled.
df.loc[idx_coordinate_filled, 'volumes'].unique()

array(['1 Karte', '1 Panorama', '1 Karte in 3 Teilen', '1 Stadtplan',
       '1 Atlas (XII, 136 S.)', '2 Karten auf 1 Bl.', '1 carte',
       '1 Karte in 2 Teilen', '1 Karte in zwei Teilen',
       '1 Karte in 2 Teilen auf 1 Blatt', '1 carta geografica',
       '1 Handzeichnung', '1 Karte auf 4 Blättern',
       '1 Atlas (XII, 128, 110 S.)', '1 Atlas', '1 atlas (IX, 400 p.)',
       '2 Karten auf 1 Blatt', '1 Plan', '1 Atlas (303 Seiten)',
       '2 Karten', '3 Karten auf 1 Blatt', '1 Atlas (230 Seiten)',
       '1 Karte auf 2 Blättern', '1 Heft (48 Seiten, 1 Karte)',
       '2 Karten in 3 Teilen auf 1 Blatt', '1 Atlas (186 Bl., 183 S.)',
       '4 Karten auf 1 Blatt', '1 Atlas (262 S.)', '1 gefalt. Karte',
       '1 Karte in 4 Teilen', '5 Karten auf 1 Blatt',
       '6 Karten auf 1 Blatt', '1 map on 2 sheets',
       '1 carte r°-v° plastifiée', '1 Vogelschaukarte',
       '1 Karte auf 6 Blättern', '9 Blätter',
       '1 Atlas (447 Seiten in verschiedenen Seitenzählungen)',
       '1 A

In [80]:
# Among the records NOT in idx_coordinate_filled, collect the distinct
# `volumes` strings that mention 'Karte' or 'Atlas'.
res = [
    volume
    for volume in df.drop(index=idx_coordinate_filled)['volumes'].unique()
    if any(keyword in volume for keyword in ('Karte', 'Atlas'))
]
len(res)

118

In [81]:
# List every collected `volumes` string together with its position in `res`.
for j, volume_text in enumerate(res):
    print(j, volume_text)

0 X, 458 S., 16 Taf., 1 Karte
1 LX, 188 S., 10 Taf. : 1 Karte
2 134 Taf. 6 Karten
3 119 S. 1 Karte
4 110, XV S. 657 Bl., S. 658-675, 2 Karten
5 IV, 104 S., 1 Karte
6 327 S. 1 Taf. 2 Karten
7 88 S. 3 Karten
8 20, 168 S. ill. 1 Portr.-Taf. 1 Karte
9 291 S. Kartenskizzen. 16 Taf.
10 62 S., 1 Karte
11 172 S. Kartenskizzen. 3 Karten.
12 1 Spiel (1 Regelheft, 1 Spielplan, 1 Schichtleitertableau, 39 Patientenkarten, 35 Karten "Vage Erinnerung", 35 Karten "Klare Erinnerung", 72 Szenariokarten, 10 Figuren, 24 Versorgungsmarker, 12 Stressmarker, 10 Fortschrittsmarker, 2 Verwarnungsmarker, 1 Telefonplättchen)
13 936, 4403-444 Seiten + 3 Karten
14 1 Karte
15 1 Spiel (42 Fragen-Karten, 27 ja!- und 27 nein!-Karten)
16 1 Mappe (5 gefaltete Plakate, 6 A3-Karten)
17 305 Seiten, 4 Seiten Karten
18 1 Karte auf 2 Blättern
19 1 Atlas (XVI, 352 Seiten)
20 32 Karten
21 55 Karten
22 1 Spiel (1 Anleitung, 1 Spielplan, 1 Würfel, 28 Piratenlager (je 7 pro Farbe), 28 Schiffe (je 7 pro Farbe), 1 Figur, 16 Coco-Kar

In [82]:
# Peek at the `volumes` value of the record labelled 0.
df.loc[0, 'volumes']

'S. 102-114'

In [83]:
# Split the row labels by whether `volumes` is filled or empty, using the
# notebook's find_empty_in_column helper (defined outside this excerpt; per
# the cell output it also prints the counts and a sample record).
idx_volumes_filled, idx_volumes_empty = find_empty_in_column(df, 'volumes')

Number of records with filled volumes 161471, with missing volumes 21936 => 88.0%

EMPTY - index 1810 

century                                                          
coordinate                                                     []
corporate                                                      []
decade                                                           
doi                                                              
edition                                               2nd edition
exactDate                                                        
format                                                   BK020000
isbn                                            978-0-85702-829-7
ismn                                                             
musicid                                                          
pages                                                          []
part                                                           []
person          [Field, Andy, Graham, 

## Duplicates

In [84]:
# For every column named in list_columns and then in array_of_strings_columns,
# add a "<column>_string" column holding the elements of each cell joined with
# ', '. The column name is printed as it is processed.
for column_group in (list_columns, array_of_strings_columns):
    for c in column_group:
        target_string = c + '_string'
        print(c)
        df[target_string] = df[c].map(
            lambda items: ', '.join(map(str, items))
        )

coordinate
corporate
pages
part
pubword
ttlfull
ttlpart
person


In [85]:
# Joined `corporate` strings for the first records selected by idx_corporate_filled.
df.loc[idx_corporate_filled, 'corporate_string'].head()

6                Christkatholische Kirchgemeinde (Bern)
7                               Milwaukee Public Museum
11    Jüdisches Museum der Schweiz, Museum für Völke...
14                                       Brugg (Aargau)
19                Sammlung für Völkerkunde Sankt Gallen
Name: corporate_string, dtype: object

In [99]:
# Discard list columns
df_n = df.drop(columns=list_columns)
df_n = df_n.drop(columns=array_of_strings_columns)

In [106]:
# Rows of `df` whose counterpart in `df_n` repeats an earlier row; the first
# occurrence of each group is not flagged.
df.loc[df_n.duplicated(keep='first')]

Unnamed: 0,century,coordinate,corporate,decade,doi,edition,exactDate,format,isbn,ismn,...,volumes,format_frst2,coordinate_string,corporate_string,pages_string,part_string,pubword_string,ttlfull_string,ttlpart_string,person_string
18352,12,[],[],12,,,1234567,MP010300,,,...,,MP,,,,,,[Mappemonde d'Honoré d'Autun],[Mappemonde d'Honoré d'Autun],"Honorius, Augustodunensis, (RERO)A012323103,"
18388,12,[],[],12,,,1234567,MP010300,,,...,,MP,,,,,,[Mappemonde de Gautier de Metz],[Mappemonde de Gautier de Metz],"Gossouin, de Metz, (RERO)A000073443,"
18930,1,[],[],1,,,,BK020300,,,...,10 ungezählte Seiten,BK,,,,,,Dissertatio academica de naturæ humanæ inclina...,Dissertatio academica de naturæ humanæ inclina...,"Zwinger, Theodor, 1658-1724, (DE-588)118773321..."
18937,1,[],[],1,,,,BK020000,,,...,18 ungezählte Seiten,BK,,,,,,Vnser/ Der Dieneren vnd Lehreren des göttliche...,Vnser/ Der Dieneren vnd Lehreren des göttliche...,
19462,u,[],[],u,,,u,BK030000,,,...,,BK,,,,,,Vivaldi : L'inganno trionfante in amore Vivald...,Vivaldi : L'inganno trionfante in amore Vivald...,
19903,u,[],[],u,,,u,MU010100,,,...,1 partition,MU,,,,,[s.n.],"Danses hongroises, [pour piano]","Danses hongroises, [pour piano]","Brahms, Johannes, 1833-1897, J. Brahms"
19912,u,[],[],u,,,u,MU010100,,,...,1 partition,MU,,,,,[s.n.],"Danses hongroises, [pour piano]","Danses hongroises, [pour piano]","Brahms, Johannes, 1833-1897, J. Brahms"
19916,u,[],[],u,,,u,MU010100,,,...,1 partition,MU,,,,,[s.n.],"La nativité du Seigneur, [pour orgue]","La nativité du Seigneur, [pour orgue]","Messiaen, Olivier, Olivier Messiaen"
19919,u,[],[],u,,,u,MU010100,,,...,1 partition,MU,,,,,[s.n.],"Danses Hongroises, [pour piano]","Danses Hongroises, [pour piano]","Brahms, Johannes, 1833-1897, Johannes Brahms"
19920,u,[],[],u,,,u,MU010100,,,...,1 partition,MU,,,,,[s.n.],"La nativité du Seigneur, [pour orgue]","La nativité du Seigneur, [pour orgue]","Messiaen, Olivier, Olivier Messiaen"


There are 248 duplicated rows.