In [1]:
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
#Dropping unnecessary columns
df=pd.read_csv("BL-Flickr-Images-Book.csv")
df.head(2)

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.


In [3]:
#drop unnecessary columns
to_drop= ['Edition Statement',
           'Corporate Author',
           'Corporate Contributors',
           'Former owner',
           'Engraver',
           'Contributors',
           'Issuance type',
           'Shelfmarks']

In [4]:
df.drop(to_drop,inplace=True, axis=1)

In [5]:
df.head(3)

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...


In [6]:
#Setting the index of the dataset
df.set_index('Identifier', inplace=True)


In [7]:
#check date of publication and try to standaridzed
df['Date of Publication'].head(25)

Identifier
206            1879 [1878]
216                   1868
218                   1869
472                   1851
480                   1857
481                   1875
519                   1872
667                    NaN
874                   1676
1143                  1679
1280                  1802
1808                  1859
1905                  1888
1929           1839, 38-54
2836                  1897
2854                  1865
2956               1860-63
2957                  1873
3017                  1866
3131                  1899
4598                  1814
4884                  1820
4976                  1800
5382    1847, 48 [1846-48]
5385               [1897?]
Name: Date of Publication, dtype: object

In [8]:
#Cleaning columns using the .apply function
unwanted_characters = ['[', ',', '-']

def clean_dates(item):
    dop= str(item.loc['Date of Publication'])
    
    if dop == 'nan' or dop[0] == '[':
        return np.NaN
    
    for character in unwanted_characters:
        if character in dop:
            character_index = dop.find(character)
            dop = dop[:character_index]
    
    return dop

df['Date of Publication'] = df.apply(clean_dates, axis = 1)

In [9]:
df['Date of Publication'].head(25)

Identifier
206     1879 
216      1868
218      1869
472      1851
480      1857
481      1875
519      1872
667       NaN
874      1676
1143     1679
1280     1802
1808     1859
1905     1888
1929     1839
2836     1897
2854     1865
2956     1860
2957     1873
3017     1866
3131     1899
4598     1814
4884     1820
4976     1800
5382     1847
5385      NaN
Name: Date of Publication, dtype: object

In [10]:
#clean Author Name
df['Author'].head(23)

Identifier
206                                                 A. A.
216                                             A., A. A.
218                                             A., A. A.
472                                             A., E. S.
480                                             A., E. S.
481                                             A., E. S.
519                                             A., F. E.
667                                         A., J.|A., J.
874                                                Remaʿ.
1143                                               A., T.
1280                                                  NaN
1808                                         AALL, Jacob.
1905    AAR, Ermanno - pseud. [i.e. Luigi Giuseppe Oro...
1929                                                  NaN
2836                            ABATE, Giovanni Agostino.
2854                                    ABATI, Francesco.
2956                        ABBADIE, Antoine Thompson d'.
295

In [11]:
def clean_author_names(author):
    author= str(author)
    
    if author== 'Nan':
        return 'Nan'
    author = author.split(',')
    
    if len(author) == 1:
        name = filter(lambda x: x.isalpha(), author[0]) #The isalpha() method returns True if all the characters are alphabet letters (a-z)
        return reduce(lambda x, y: x + y, name)
    
    last_name, first_name = author[0], author[1]

    first_name = first_name[:first_name.find('-')] if '-' in first_name else first_name
    
    if first_name.endswith(('.', '.|')):
        parts = first_name.split('.')
        
        if len(parts) > 1:
            first_occurence = first_name.find('.')
            final_occurence = first_name.find('.', first_occurence + 1)
            first_name = first_name[:final_occurence]
        else:
            first_name = first_name[:first_name.find('.')]
    
    last_name = last_name.capitalize()
    
    return f'{first_name} {last_name}'

In [12]:
df['Author'] = df['Author'].apply(clean_author_names)

In [13]:
df.head(5)

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879,S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,AA,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,A. A A.,http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,A. A A.,http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...",E. S A.,http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...",E. S A.,http://www.flickr.com/photos/britishlibrary/ta...


In [14]:
#cleaning title:
def clean_title(title):
    
    if title == 'nan':
        return 'NaN'
    
    if title[0] == '[':
        title = title[1: title.find(']')]
        
    if 'by' in title:
        title = title[:title.find('by')]
    elif 'By' in title:
        title = title[:title.find('By')]
        
    if '[' in title:
        title = title[:title.find('[')]

    title = title[:-2]
        
    title = list(map(str.capitalize, title.split()))
    return ' '.join(title)
    
df['Title'] = df['Title'].apply(clean_title)
df.head()

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879,S. Tinsley & Co.,Walter Forbes,AA,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All For Greed,A. A A.,http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love The Avenger,A. A A.,http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, Chiefly Ecclesiastical, To The...",E. S A.,http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"The World In Which I Live, And My Place In It",E. S A.,http://www.flickr.com/photos/britishlibrary/ta...


In [16]:
#place ofpublication
# if London then London, if oxford then Oxford remove other names alonge ewith
pub = df['Place of Publication']
df['Place of Publication'] = np.where(pub.str.contains('London'), 'London',
    np.where(pub.str.contains('Oxford'), 'Oxford',
        np.where(pub.eq('Newcastle upon Tyne'), #eq(a, b) is equivalent to a == b
            'Newcastle-upon-Tyne', df['Place of Publication'])))

In [18]:
df.tail()

Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4158088,London,1838.0,,"The Parochial History Of Cornwall, Founded On",afterwards GILBERT Giddy,http://www.flickr.com/photos/britishlibrary/ta...
4158128,Derby,1831.0,M. Mozley & Son,The History And Gazetteer Of The County Of D,Stephen Glover,http://www.flickr.com/photos/britishlibrary/ta...
4159563,London,,T. Cadell and W. Davies,Magna Britannia; Being A Concise Topographical...,Daniel Lysons,http://www.flickr.com/photos/britishlibrary/ta...
4159587,Newcastle-upon-Tyne,1834.0,Mackenzie & Dent,"An Historical, Topographical And Descriptive V...",E. (Eneas) Mackenzie,http://www.flickr.com/photos/britishlibrary/ta...
4160339,London,1834.0,,Collectanea Topographica Et Genealogica,,http://www.flickr.com/photos/britishlibrary/ta...
