In [1]:
# import helpful tools for data analysis and visualization
import re

import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Load the Modernist Libraries export into a DataFrame; the first line of the
# file (header=0) supplies the column names.
mlDataPath = 'ModernistLibraries20171025_UTF-8'
ml = pd.read_csv(mlDataPath, header=0)

# Preview the first few rows (the cell's last expression is rendered as its output).
ml.head()

Unnamed: 0,BookOwnerVIAFID,BookOwnerName,WorkID,BookID,BookTitle,BookAuthorName,BookPubDetails,BookPubDate,BookNotes,BookCollection,InformationSource
0,97006051,Ernest Hemingway,4650949,25197870,Sinistro and celebration: 19 poems,A.E.T.,1956,1956,,Finca Vigia.,manual entry
1,97006051,Ernest Hemingway,4650967,25198046,Tauromaquia y taurogogía: Algunos concertos bá...,R. Abarquero Durango,"Madrid: Torroba, 1959.",1959,,Finca Vigia.,manual entry
2,97006051,Ernest Hemingway,429839,25198087,We pointed them North; recollections of a cowp...,Edward Charles Abbott,"Norman: University of Oklahoma Press [1955, c1...",1955,,Finca Vigia.,Library of Congress
3,97006051,Ernest Hemingway,429839,25198220,We pointed them north; recollections of a cowp...,Edward Charles Abbott,"New York: Farrar & Rinehart, inc. [c1939]",1939,,Finca Vigia.,Library of Congress
4,97006051,Ernest Hemingway,7785885,25198292,Japan unmasked,Hallett Edward Abend,"New York: I. Washburn, Inc., 1941.",1941,,Finca Vigia.,Library of Congress


In [5]:
# Show the column names exactly as they were read from the file.
print("Modern Libraries Columns:", list(ml.columns))

mlSize = ml.shape

# Remove extra space that appeared before certain column names ([ ... ' BookPubDate', ' BookNotes', ' BookCollection' ... ])
# NOTE: this assignment is positional, so it relies on the file keeping this exact column order.
ml.columns = ['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle', 'BookAuthorName', 'BookPubDetails', 'BookPubDate', 'BookNotes', 'BookCollection', 'InformationSource']
print(ml.columns)

# Create dictionaries, keyed by row position, that hold subsets of the dataset.
totalRows = mlSize[0]

indeces = list(range(totalRows))
assert len(indeces) == ml.shape[0]


def column_as_dict(columnName, convert):
    """Map each row position in `indeces` to `convert(value)` for one column of `ml`.

    Reading the column directly avoids building a full row object per record
    (as `iterrows` does) just to pick out a single field. Keys are row
    positions, which coincide with the index labels for the default
    0..n-1 index.
    """
    return dict(zip(indeces, (convert(value) for value in ml[columnName])))


time = column_as_dict('BookPubDate', str)       # BookPubDetails, BookPubDate
pub = column_as_dict('BookPubDetails', str)     # BookPubDetails
owners = column_as_dict('BookOwnerName', str)   # BookOwnerName
authors = column_as_dict('BookAuthorName', str) # BookAuthorName
books = column_as_dict('BookID', int)           # BookID

# Every dictionary must have exactly one entry per row; each message names the
# dictionary that actually failed.
assert len(time) == totalRows, "The length of the time dictionary should equal the total number of rows in the dataset"
assert len(pub) == totalRows, "The length of the pub dictionary should equal the total number of rows in the dataset"
assert len(owners) == totalRows, "The length of the owners dictionary should equal the total number of rows in the dataset"
assert len(authors) == totalRows, "The length of the authors dictionary should equal the total number of rows in the dataset"
assert len(books) == totalRows, "The length of the books dictionary should equal the total number of rows in the dataset"

# Spot-check a few rows. The raw cell gets the same conversion that was applied
# when the dictionary was built, so a missing value (NaN, stored as 'nan')
# cannot produce a spurious failure.
assert time[0] == str(ml['BookPubDate'].iloc[0]), "The time dictionary has the right values from the BookPubDate column"
assert pub[200] == str(ml['BookPubDetails'].iloc[200]), "The pub dictionary has the right values from the BookPubDetails column"
assert owners[20020] == str(ml['BookOwnerName'].iloc[20020]), "The owners dict has the right values from BookOwnerName"
assert authors[1001] == str(ml['BookAuthorName'].iloc[1001]), "The authors dict has the right values from BookAuthorName"
assert books[3] == int(ml['BookID'].iloc[3]), "The books dict has the right values from BookID"

Modern Libraries Columns: ['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle', 'BookAuthorName', 'BookPubDetails', 'BookPubDate', 'BookNotes', 'BookCollection', 'InformationSource']
Index(['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle',
       'BookAuthorName', 'BookPubDetails', 'BookPubDate', 'BookNotes',
       'BookCollection', 'InformationSource'],
      dtype='object')


In [7]:
# Clean up dates

# Figure out the range of publication dates
pubDates = list(ml.loc[:, 'BookPubDate'])
totalPubDates = len(pubDates)
assert totalPubDates == ml.shape[0], "There should be one BookPubDate value per row in the dataframe"

# Work on string versions so the checks behave the same whether a cell was
# read as text, as a number, or as a missing value.
pubDateStrings = [str(date) for date in pubDates]

# Split the values into all-digit strings and everything else.
numeric = [date for date in pubDateStrings if date.isdigit()]
nonNumeric = [date for date in pubDateStrings if not date.isdigit()]
assert (len(numeric) + len(nonNumeric)) == len(pubDates), "All dates in the dataset should appear in either numeric or nonNumeric (not both)"

print("nonNumeric Dates:", len(nonNumeric))
print("numeric Dates:", len(numeric))

# Find the books whose all-digit date is not a four-digit year.
# The row position is recorded while scanning, so repeated bad values each
# report their own row (a value lookup would only ever find the first one).
invalidNumericDate = []
invalidDateIndeces = []
for position, date in enumerate(pubDateStrings):
    if date.isdigit() and len(date) != 4:
        invalidNumericDate.append(date)
        invalidDateIndeces.append(position)
print("Invalid Numeric Dates:", invalidNumericDate)
print(invalidDateIndeces)

# List each distinct non-numeric BookPubDate once (np.unique returns them
# sorted) so the formats that need parsing can be read off the output.
uniqueNonNumeric = np.unique(nonNumeric)
for distinctDate in uniqueNonNumeric:
    print(distinctDate)

# Build the StartYear / EndYear lists (one entry per row) from the free-text
# BookPubDate values.

UNKNOWN_YEAR = 9999  # sentinel: date could not be parsed and needs manual investigation

# Characters peeled off both ends of a value before matching: letters
# (e.g. the "c" in "c1939"), brackets, question marks, commas, full stops and
# spaces, so that "1925 [?]", "[1939?]" and "1862-1868." all end in a digit or hyphen.
_DATE_EDGE_NOISE = 'abcdefghijklmnopqrstuvwxyz,[]?. '

# A four-digit year that is not the tail of a longer digit run.
_YEAR = r'(?<!\d)([1-9][0-9]{3})'


def parse_pub_years(raw_date):
    """Return ``(start_year, end_year)`` for one raw BookPubDate value.

    Recognised forms (after trimming noise characters from both ends):

    * ``1930-1940`` / ``1887/1890`` -> (1930, 1940)
    * ``1940-45``                   -> (1940, 1945)
    * ``1930-1``                    -> (1930, 1931)
    * ``1926``, ``18 January 1815`` -> (1926, 1926): a year at the end of the text
    * ``19--``                      -> (1900, 1999)
    * ``193-``                      -> (1930, 1939)

    Anything else, including open-ended values such as ``1927-``, yields
    ``(UNKNOWN_YEAR, UNKNOWN_YEAR)``. Both returned values are always
    four-digit ints.
    """
    d = str(raw_date).lower().strip(_DATE_EDGE_NOISE)

    # Two all-digit values in the dataset are mistyped forms of 1888.
    if d in ('188', '18888'):
        return 1888, 1888

    # Full ranges are tested before single years: a range also ends in four
    # digits, so testing the single-year pattern first would reduce
    # "1784-1788" to its end year.
    m = re.search(_YEAR + r'[-/]([1-9][0-9]{3})$', d)  # example: 1930-1940
    if m:
        return int(m.group(1)), int(m.group(2))

    m = re.search(_YEAR + r'-([0-9]{2})$', d)  # example: 1940-45
    if m:
        start = m.group(1)
        return int(start), int(start[:2] + m.group(2))

    m = re.search(_YEAR + r'-([0-9])$', d)  # example: 1930-1
    if m:
        start = m.group(1)
        return int(start), int(start[:3] + m.group(2))

    m = re.search(_YEAR + r'$', d)  # example: 1926
    if m:
        year = int(m.group(1))
        return year, year

    # Fallback: a full range that is followed by other text.
    m = re.search(_YEAR + r'[-/]([1-9][0-9]{3})', d)
    if m:
        return int(m.group(1)), int(m.group(2))

    # Incomplete years become the range they could cover. The leading digit
    # must be non-zero and must not follow another digit, so "1927-" is not
    # read as the decade "927-" and every result stays four digits long.
    m = re.search(r'(?<!\d)([1-9][0-9])--$', d)  # example: 19--
    if m:
        return int(m.group(1) + '00'), int(m.group(1) + '99')

    m = re.search(r'(?<!\d)([1-9][0-9]{2})-$', d)  # example: 193-
    if m:
        return int(m.group(1) + '0'), int(m.group(1) + '9')

    # Everything else needs to be compared to the pub details or investigated online.
    return UNKNOWN_YEAR, UNKNOWN_YEAR


# define lists to create for new StartYear and EndYear columns
StartYear = []
EndYear = []
for d in pubDates:
    parsedStart, parsedEnd = parse_pub_years(d)
    StartYear.append(parsedStart)
    EndYear.append(parsedEnd)

# Sanity checks on the parsed year lists: every entry must be a plain int
# that prints as exactly four digits, and each list must have one entry per
# dataset row.
yearListChecks = (
    (StartYear, "All start dates should be ints", "All start dates should be four digits long"),
    (EndYear, "All end dates should be ints", "All end dates should be four digits long"),
)
for yearList, typeMessage, widthMessage in yearListChecks:
    for pubDate in yearList:
        assert type(pubDate) is int, typeMessage
        assert len(str(pubDate)) == 4, widthMessage

expectedRowCount = mlSize[0]
assert expectedRowCount == len(StartYear), "There should be one StartYear value for every row in the dataset"
assert expectedRowCount == len(EndYear), "There should be one EndYear value for every row in the dataset"

# Summarise how many parsed years are usable and how many still need a look.

UNKNOWN_YEAR = 9999           # sentinel stored in StartYear/EndYear for dates that could not be parsed
LATEST_PLAUSIBLE_YEAR = 2017  # later years are treated as errors (presumably the dataset's export year -- confirm)


def needs_investigation(year):
    """True when `year` is the unknown-date sentinel or later than LATEST_PLAUSIBLE_YEAR."""
    return year == UNKNOWN_YEAR or year > LATEST_PLAUSIBLE_YEAR


# Usable years, and the count of entries that were set aside, per list.
validStartYears = [year for year in StartYear if not needs_investigation(year)]
validEndYears = [year for year in EndYear if not needs_investigation(year)]
startToInvestigate = len(StartYear) - len(validStartYears)
endToInvestigate = len(EndYear) - len(validEndYears)

# Classify each row by comparing its start and end year:
#   yearRanges    - length in years of every genuine range (start < end)
#   singleYear    - the year of every single-year date (start == end)
#   toInvestigate - rows with an unknown year or a reversed range (start > end)
yearRanges = []
singleYear = []
toInvestigate = 0
for rowStart, rowEnd in zip(StartYear, EndYear):
    if UNKNOWN_YEAR in (rowStart, rowEnd) or rowStart > rowEnd:
        toInvestigate += 1
    elif rowStart < rowEnd:
        yearRanges.append(rowEnd - rowStart)
    else:
        singleYear.append(rowStart)

nonNumeric Dates: 2567
numeric Dates: 20784
Invalid Numeric Dates: ['188', '18888']
[18497, 18812]
17 Nov 192
1784-1788
1784-1789
1795-1830
18 January 1815.
18--
1809-1812
1820?
1835-1843
1836-37
1845-1971
1849-1861
1850-1856
1851-1852
1851-1860
1853-1855
1853-1870
1854-1855
1856-1875
1862-1868.
1863-1887
1864-1870
1871-1884
1875-1880
1877-1878
1877-1883
1877-83
1878-1885
1878-1890
1880-1918
1882-1884
1882-1904
1883-1902
1883-1905
1885-1886
1885-1901
1885-86
1886-1889
1887-1898
1887-1902
1887/1890
189-
1892-1900
1893-1898
1893-1899
1896-1942
1897-1900
1897-1905
1897-1924
1898-1899
1898-1905
1899-1900
1899-1904
19--
190-
1900-1904
1901-1918
1902-05
1902?
1903-1905
1903-1916
1903?
1904-1907
1906-10.
1906-1907
191-
1913?
1914-1919
1915-17
1915?
1916-1917
1916-21
1919-21
192-
1922-1924
1922-23
1923-1931
1924-1934
1924-35
1924-38
1925 [?]
1926-1929
1927-
1928-29
1928-30
193-
1930-1
1930-1931
1931-34
1933-1938
1933-1950
1934?
1935-1951
1935-1961
1935-1984
1935?
1936-1937
1937-38
1938-41
1938

In [8]:
import networkx as nx

**Using NetworkX**

*from https://networkx.github.io/documentation/networkx-1.10/reference/introduction.html*

* Edges should be directed to indicate whether someone owns a book or whether someone's book was owned by one of the libraries in the dataset
* Multiple edges should be allowed between pairs of nodes, because one owner can have books by many authors and one author can appear in many owners' libraries
* Edge data could include time and location; however, it's not necessary to understand the relationships between the nodes, because each edge represents the same relationship: book ownership
* Node data should be hashable, so each book needs to be given a unique ID and each book owner can be uniquely identified by the BookOwnerVIAFID

The appropriate graph class to use for the Modernist Libraries dataset is thus MultiDiGraph.  Graphs can be created in one of three ways:

* Graph generators – standard algorithms to create network topologies
* Importing data from pre-existing (usually file) sources
* Adding edges and nodes explicitly

As the pre-existing file sources covered in the tutorial for importing data do not include CSV files, I will add edges and nodes explicitly from the Modernist Libraries dataset to create a MultiDiGraph network.