In [2]:
# import helpful tools for data analysis and visualization
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

# pull in the Modernist Libraries dataset
ml = pd.read_csv('ModernistLibraries20171025_UTF-8', header=0)
#View the first few rows of the dataset
ml.head()

Unnamed: 0,BookOwnerVIAFID,BookOwnerName,WorkID,BookID,BookTitle,BookAuthorName,BookPubDetails,BookPubDate,BookNotes,BookCollection,InformationSource
0,97006051,Ernest Hemingway,4650949,25197870,Sinistro and celebration: 19 poems,A.E.T.,1956,1956,,Finca Vigia.,manual entry
1,97006051,Ernest Hemingway,4650967,25198046,Tauromaquia y taurogogía: Algunos concertos bá...,R. Abarquero Durango,"Madrid: Torroba, 1959.",1959,,Finca Vigia.,manual entry
2,97006051,Ernest Hemingway,429839,25198087,We pointed them North; recollections of a cowp...,Edward Charles Abbott,"Norman: University of Oklahoma Press [1955, c1...",1955,,Finca Vigia.,Library of Congress
3,97006051,Ernest Hemingway,429839,25198220,We pointed them north; recollections of a cowp...,Edward Charles Abbott,"New York: Farrar & Rinehart, inc. [c1939]",1939,,Finca Vigia.,Library of Congress
4,97006051,Ernest Hemingway,7785885,25198292,Japan unmasked,Hallett Edward Abend,"New York: I. Washburn, Inc., 1941.",1941,,Finca Vigia.,Library of Congress


In [3]:
print("Modern Libraries Columns:", list(ml.columns))

mlSize = ml.shape

# Remove extra space that appeared before certain column names ([ ... ' BookPubDate', ' BookNotes', ' BookCollection' ... ])
ml.columns = ['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle', 'BookAuthorName', 'BookPubDetails', 'BookPubDate', 'BookNotes', 'BookCollection', 'InformationSource']
print(ml.columns)

# create dictionaries of indeces associated with arrays that contain subsets of the dataset
totalRows = mlSize[0]

indeces = []
for n in range(0,totalRows):
    indeces += [n]
assert len(indeces) == ml.shape[0]

time = dict.fromkeys(indeces, '') # BookPubDate
pub = dict.fromkeys(indeces, '') # BookPubDetails
owners = dict.fromkeys(indeces, '') # BookOwnerName
authors = dict.fromkeys(indeces, '') # BookAuthorName
books = dict.fromkeys(indeces, 0) # BookID
titles = dict.fromkeys(indeces, 0) # BookTitle
for index,row in ml.iterrows():
    timeValue = str(row['BookPubDate'])
    time[index] = timeValue
    
    pubValue = str(row['BookPubDetails'])
    pub[index] = pubValue
    
    ownersValue = str(row['BookOwnerName'])
    owners[index] = ownersValue
    
    authorsValue = str(row['BookAuthorName'])
    authors[index] = authorsValue
    
    booksValue = int(row['BookID'])
    books[index] = booksValue
    
    titlesValue = str(row['BookTitle'])
    titles[index] = titlesValue


assert len(time) == totalRows, "The length of the publishing date dictionary equal the total number of rows in the dataset"
assert len(pub) == totalRows,  "The length of the publishing details dictionary equal the total number of rows in the dataset"
assert len(owners) == totalRows,  "The length of the owners dictionary equal the total number of rows in the dataset"
assert len(authors) == totalRows,  "The length of the authors dictionary equal the total number of rows in the dataset"
assert len(books) == totalRows,  "The length of the book ID dictionary equal the total number of rows in the dataset"
assert len(titles) == totalRows, "The length of the titles dictionary equal the total number of rows in the dataset"

assert time[0] == ml.iloc[0, 7], "The time dictionary has the right values from the BookPubDate column"
assert pub[200] == ml.iloc[200, 6], "The pub dictionary has the right values from the BookPubDetails column"
assert owners[20020] == ml.iloc[20020, 1], "The owners dict has the right values from BookOwnerName"
assert authors[1001] == ml.iloc[1001, 5], "The authors dict has the right values from BookAuthorName"
assert books[3] == ml.iloc[3, 3], "The books dict has the right values from BookID"

Modern Libraries Columns: ['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle', 'BookAuthorName', ' BookPubDetails', ' BookPubDate', ' BookNotes', ' BookCollection', 'InformationSource']
Index(['BookOwnerVIAFID', 'BookOwnerName', 'WorkID', 'BookID', 'BookTitle',
       'BookAuthorName', 'BookPubDetails', 'BookPubDate', 'BookNotes',
       'BookCollection', 'InformationSource'],
      dtype='object')


In [8]:
# Capitalize owner names consistently
keysToFix = []
for key in owners:
    if owners[key] is "e.e. cummings":
        keysToFix += [key]
        owners[key] = "E.E. Cummings"
    if owners[key] is "Ezra POUND":
        keysToFix += [key]
        owners[key] = "Ezra Pound"
for k in keysToFix:
    assert owners[k] != "e.e. cummings", "E.E. Cummings' name should be properly capitalized in the dataset"
    assert owners[k] != "Ezra POUND", "Ezra Pound's name should be properly capitalized in the dataset"

In [9]:
# Clean up dates

# Figure out the range of publication dates
pubDates = list(ml.loc[:, 'BookPubDate'])
#print(pubDates)
totalPubDates = len(pubDates)
assert totalPubDates == ml.shape[0], "There should be one BookPubDate value per row in the dataframe"

numeric = []
nonNumeric = []
for date in pubDates:
    date = str(date)
    if str.isdigit(date):
        numeric += [date]
    else:
        nonNumeric += [date]

#for x in numeric:
    #assert len(x) == 4, "All numeric dates should have 4 characters"
    #assert str.isdigit(date), "All numeric dates' characters should be digits"
#assert (len(numeric) + len(nonNumeric)) == len(pubDates), "All dates in the dataset should appear in either numeric or nonNumeric (not both"

print("nonNumeric Dates:", len(nonNumeric))
print("numeric Dates:", len(numeric))

# find book with invalid numeric date
invalidNumericDate = []
for d in numeric:
    if len(d) != 4:
        invalidNumericDate += [d]
print("Invalid Numeric Dates:", invalidNumericDate)
invalidDateIndeces = []
for d in invalidNumericDate:
    invalidDateIndeces += [pubDates.index(str(d))]
#print(invalidDateIndeces)

import re

# investigate non-numeric BookPubDate values
#uniqueNonNumeric = np.unique(nonNumeric)
#for value in uniqueNonNumeric:
    #print(i, ":", value)
    #i += 1
    #print(value)

# define lists to create for new StartYear and EndYear columns
StartYear = []
EndYear = []
for d in pubDates:
    d = str(d)
    d = d.lower()
    d = d.strip('abcdefghijklmnopqrstuv')
    d = d.strip(',')
    d = d.strip('[')
    d = d.strip('?]')
    d = d.strip(']')

    # fix invalid values that are all numeric
    if d == '188' or d == '18888':
        StartYear += [int(1888)]
        EndYear += [int(1888)]
    
    else:
    
        # get all four-digit years into StartYear and EndYear
        y0 = re.search('[1-9]{1}[0-9]{3}$', d)  # example: 1926
        if y0 != None:
            add0 = y0.group(0)
            StartYear += [int(add0)]
            EndYear += [int(add0)]
            if len(add0) != 4:
                print(add0)
    
        # divide ranges of years into StartYear and EndYear
        elif len(d) > 3:
            y1 = re.search('[1-9]{1}[0-9]{3}-[0-9]{1}$', d)  # example: 1990-1
            if y1 != None:
                add1 = y1.group(0)
                s1 = (add1[0]+add1[1]+add1[2]+add1[3])
                e1 = (add1[0]+add1[1]+add1[2]+add1[5])
                StartYear += [int( s1 )]
                EndYear += [int( e1 )]
                if len(s1) != 4:
                    print(s1)
                    print("Add1:", add1)
                if len(e1) != 4:
                    print(e1)
                    print("Add1:", add1)
            else:
                y2 = re.search('[1-9]{1}[0-9]{3}-[0-9]{2}$', d)  # example: 1940-45
                if y2 != None:
                    add2 = y2.group(0)
                    s2 = (add2[0]+add2[1]+add2[2]+add2[3])
                    e2 = (add2[0]+add2[1]+add2[5]+add2[6])
                    StartYear += [int( s2 )]
                    EndYear += [int( e2 )]
                    if len(s2) != 4:
                        print(s2)
                        print("Add2:", add2)
                    if len(e2) != 4:
                        print(e2)
                        print("Add2:", add2)
                else:
                    y3 = re.search('([1-9]{1}[0-9]{3})(-|\/)([1-9]{1}[0-9]{3})', d)  # example: 1930-1940
                    if y3 != None:
                        add3 = y3.group(0)
                        s3 = (add3[0]+add3[1]+add3[2]+add3[3])
                        e3 = (add3[5]+add3[6]+add3[7]+add3[8])
                        StartYear += [int( s3 )]
                        EndYear += [int( e3 )]
                        if len(s3) != 4:
                            print(s3)
                            print("Add3:", add3)
                        if len(e3) != 4:
                            print(e3)
                            print("Add3:", add3)
                    else:

        # create ranges for incomplete years
                        y4 = re.search('\d{2}-{2}$', d)  # example: 19--
                        if y4 != None:
                            add4 = y4.group(0)
                            s4 = (add4[0]+add4[1]+'0'+'0')
                            e4 = (add4[0]+add4[1]+'9'+'9')
                            StartYear += [int( s4 )]
                            EndYear += [int( e4 )]
                            if len(s4) != 4:
                                print(s4)
                                print("Add4:", add4)
                            if len(e4) != 4:
                                print(e4)
                                print("Add4:", add4)
                        else:
                            y5 = re.search('\d{3}-{1}$', d)  # example: 193-
                            if y5 != None:
                                add5 = y5.group(0)
                                s5 = (add5[0]+add5[1]+add5[2]+'0')
                                e5 = (add5[0]+add5[1]+add5[2]+'9')
                                StartYear += [int( s5 )]
                                EndYear += [int( e5 )]
                                if len(s5) != 4:
                                    print(s5)
                                    print("Add5:", add5)
                                if len(e5) != 4:
                                    print(e5)
                                    print("Add5:", add5)

        # make the rest 9999 (will need to compare to pub details or investigate manually)
                            else:
                                StartYear += [int(9999)]
                                EndYear += [int(9999)]
        else:
            StartYear += [int(9999)]
            EndYear += [int(9999)]

for pubDate in StartYear:
    assert type(pubDate) == int, "All start dates should be ints"
    assert len(str(pubDate)) == 4, "All start dates should be four digits long"
for pubDate in EndYear:
    assert type(pubDate) == int, "All end dates should be ints"
    assert len(str(pubDate)) == 4, "All end dates should be four digits long"
assert len(StartYear) == mlSize[0], "There should be one StartYear value for every row in the dataset"
assert len(EndYear) == mlSize[0], "There should be one EndYear value for every row in the dataset"

startToInvestigate = 0
endToInvestigate = 0
validStartYears = []
validEndYears = []
for pubDate in StartYear:
    if pubDate == 0 or pubDate > 2017:
        startToInvestigate += 1
    else:
        validStartYears += [pubDate]

for pubDate in EndYear:
    if pubDate == 0 or pubDate > 2017:
        endToInvestigate += 1
    else:
        validEndYears += [pubDate]

yearRanges = []
singleYear = []
toInvestigate = 0
for i in range(0, (mlSize[0])):
    if (StartYear[i] == 9999) or (EndYear[i] == 9999):
        toInvestigate += 1
    elif StartYear[i] < EndYear[i]:
        yearRanges += [EndYear[i] - StartYear[i]]
    elif StartYear[i] == EndYear[i]:
        singleYear += [StartYear[i]]
    else:
        toInvestigate += 1
print("To Investigate:", toInvestigate)  #I'll drop the rows with these 2353 invalid/unknown dates
print(StartYear[4])

nonNumeric Dates: 2567
numeric Dates: 20784
Invalid Numeric Dates: ['188', '18888']
To Investigate: 2353
1941


In [7]:
# Ernest Hemingway's Library
EHBooks = 0
EHDates = 0
for key in owners:
    x = (owners[key]).lower()
    if "hemingway" in x:
        EHBooks += 1
        pubdate = StartYear[key]
        if (pubdate > 1939) and (pubdate < 1961):
            EHDates += 1
print(EHBooks)
print(EHDates)
print(EHDates/EHBooks)

7408
4487
0.6056965442764579


In [6]:
import networkx as nx

**Using NetworkX**

*from https://networkx.github.io/documentation/networkx-1.10/reference/introduction.html*

* Edges should be directed to indicate whether someone owns a book or whether someone's book was owned by one of the libraries in the dataset
* Multiple edges should be allowed between pairs of nodes, because one owner can have books by many authors and one author can appear in many owners' libraries
* Edge data could include time (publishing dates), location (publishing locations), and count (number of books) however this data is not necessary to understand the relationships between the nodes because each node represents the same relationship: book ownership
* Node data should be hashable, so each person should have a unique name and an attribute that categorizes them as a book author, book owner or both in the Modernist Libraries dataset

The appropriate graph class to use for the Modernist Libraries dataset is thus MultiDiGraph.  Graphs can be created in one of three ways:

* Graph generators – standard algorithms to create network topologies
* Importing data from pre-existing (usually file) sources
* Adding edges and nodes explicitly

As the pre-existing file sources in the tutorial for importing data does not include CSV files, I will add edges and nodes explicitly from the Modernist Libraries dataset to create a MultiDiGraph network.

In [26]:
# Make network nodes for book authors and owners
bookOwnersList = list(ml['BookOwnerName'].unique())
bookAuthorsList = list(ml['BookAuthorName'].unique())
#print("Book Owners:", bookOwnersList)
#print("Book Authors:", bookAuthorsList)
print("Total Book Authors Count:", len(bookAuthorsList))

# Capitalize owner names consistently
bookOwnersList[5] = "E.E. Cummings"
for i in owners:
    if owners[i] is "e.e. cummings":
        owners[i] == "E.E. Cummings"

print("Book Owners:", bookOwnersList)
ppl = bookAuthorsList
for owner in bookOwnersList:
    if owner not in bookAuthorsList:
        ppl += [owner]
print(len(ppl))
indeces = []
for n in range(0,len(ppl)):
    indeces += [n]
assert len(indeces) == len(ppl), "There should be one index per person in the Modernist Libraries dataset"

pplDict = dict.fromkeys(indeces, '')
for i in indeces:
    pplDict[i] = str(ppl[i])
assert len(pplDict) == len(ppl), "There should be one key-value pair per person in the Modernist Libraries dataset"
print(pplDict[1])
print(pplDict[12000])

Total Book Authors Count: 12729
Book Owners: ['Ernest Hemingway', 'James Joyce', 'W.H. Auden', 'Ezra Pound', 'F. Scott Fitzgerald', 'E.E. Cummings', 'W.B. Yeats', "Flannery O'Connor", 'Virginia Woolf', 'Gertrude Stein', 'Edith Sitwell', 'Edna St. Vincent Millay', 'Langston Hughes']
12733
R. Abarquero Durango
St_phane Mallarm_


There will be some node name cleanup to do! (see last print line above)


To create the Modernist Libraries network of relationships between book owners and book authors, I'll:
1. Initialize a graph that allows for directionality to distinguish owner from owned, and allows for multiple relationships (edges) between two nodes (because, for example, owners may own multiple books by a single author)
2. Create nodes for the owners and authors, with attributes indicating whether the person was an owner or author
3. Create edges from owners to authors, with attributes for the title, ID, publishing date and publishing location of the owned book

In [27]:
# Initialize another empty Modernist Libraries network
libs = nx.MultiDiGraph()

# Each node should return {i : {name : '', owner : T/F, author : T/F}}
ownStatus = dict.fromkeys(indeces, 0)
authStatus = dict.fromkeys(indeces, 0)
for i in indeces:
    person = pplDict[i]
    if person in bookOwnersList:
        ownStatus[i] = 1            # if person is book owner, set to 1 (true)
    if person in bookAuthorsList:
        authStatus[i] = 1           # if person is book author, set to 1 (true)
assert len(ownStatus) == len(pplDict), "Ownership dictionary should have one entry per person"
assert len(authStatus) == len(pplDict), "Authorship dictionary should have one entry per person"

for i in indeces:
    person = pplDict[i]
    libs.add_node(person, {'owner' : ownStatus[i], 'author' : authStatus[i]})

print(libs.node["Edward Charles Abbott"])
print("Length:", len(libs))
assert(len(libs) == len(ppl)), "There should be one node per person in the Modernist Libraries network"

{'owner': 0, 'author': 1}
Length: 12733


In [33]:
# Add edges: MultiDiGraph.add_edge(u, v[, key, attr_dict])
# Each edge should have attributes for book title, ID, time (publication date) and location (publishing city)
for i in range(0,totalRows):
    ownerNode = owners[i]
    authorNode = authors[i]
    edgeAttrs = { 'titles' : titles[i], 'book ID' : books[i], 'publication date' : time[i], 'publication city' : pub[i] }
    libs.add_edge(ownerNode, authorNode, object=edgeAttrs)

print("Edges:", libs.number_of_edges())
print("Nodes:", libs.number_of_nodes())

Edges: 23351
Nodes: 12734


Now that the network is built, I can calculate the shortest path from one person to another:

In [34]:
sp = dict(nx.all_pairs_shortest_path(libs))
sp["Ernest Hemingway"]

{'Ernest Hemingway': ['Ernest Hemingway'],
 'A.E.T.': ['Ernest Hemingway', 'A.E.T.'],
 'R. Abarquero Durango': ['Ernest Hemingway', 'R. Abarquero Durango'],
 'Edward Charles Abbott': ['Ernest Hemingway', 'Edward Charles Abbott'],
 'Hallett Edward Abend': ['Ernest Hemingway', 'Hallett Edward Abend'],
 'Edmundo González Acebal': ['Ernest Hemingway', 'Edmundo González Acebal'],
 'Dean Acheson': ['Ernest Hemingway', 'Dean Acheson'],
 'Julián Acuña': ['Ernest Hemingway', 'Julián Acuña'],
 'George Adam': ['Ernest Hemingway', 'George Adam'],
 ' Louis Adamic': ['Ernest Hemingway', ' Louis Adamic'],
 'Charlotte Adams': ['Ernest Hemingway', 'Charlotte Adams'],
 'Cleve F. Adams': ['Ernest Hemingway', 'Cleve F. Adams'],
 'Franklin P. Adams': ['Ernest Hemingway', 'Franklin P. Adams'],
 'Henry Adams': ['Ernest Hemingway', 'Henry Adams'],
 'James Donald Adams': ['Ernest Hemingway', 'James Donald Adams'],
 'James Truslow Adams': ['Ernest Hemingway', 'James Truslow Adams'],
 'Nicholson B. Adams': ['Ern

In [37]:
from networkx.algorithms import approximation
nx.max_clique(libs)

In [48]:
print(len(libs.nodes('Ernest Hemingway')))
print(len(libs.nodes('Langston Hughes')))
print(len(libs.nodes('Louis Henry Cohn')))

12734
12734
12734


**Using Vistorian.net**

I'll create dataframes organized into decades between 1870 and 1969, inclusive (this is the period in which most of the Modernist Libraries' books were published).  If a book's publication date is unknown or invalid, I'll exclude it from the dataframe (roughly 10% of the books in the Modernist Libraries dataset).  If a book has a range of publication dates, I'll simply use the "StartDate" value for the "Year" column for ease of graphing in a network visualization.

In [11]:
# Create a dataframe of owners, and the authors and publication dates of their books
years1870 = []      # years are edge attributes
years1880 = []      # (if an owner has multiple books by one author, they'll have multiple edges between them)
years1890 = []
years1900 = []
years1910 = []
years1920 = []
years1930 = []
years1940 = []
years1950 = []
years1950B = []
years1960 = []
targets1870 = []     # target nodes are authors
targets1880 = []
targets1890 = []
targets1900 = []
targets1910 = []
targets1920 = []
targets1930 = []
targets1940 = []
targets1950 = []
targets1950B = []
targets1960 = []
sources1870 = []     # sources nodes are owners
sources1880 = []
sources1890 = []
sources1900 = []
sources1910 = []
sources1920 = []
sources1930 = []
sources1940 = []
sources1950 = []
sources1950B = []
sources1960 = []
for key in owners:
    yr = StartYear[key]
    if (yr != 9999) and (len(str(yr)) == 4):
        if (yr > 1869) and (yr < 1880):
            sources1870 += [owners[key]]
            years1870 += [int(yr)]
            targets1870 += [authors[key]]
        if (yr > 1879) and (yr < 1890):
            sources1880 += [owners[key]]
            years1880 += [int(yr)]
            targets1880 += [authors[key]]
        if (yr > 1889) and (yr < 1900):
            sources1890 += [owners[key]]
            years1890 += [int(yr)]
            targets1890 += [authors[key]]
        if (yr > 1899) and (yr < 1910):
            sources1900 += [owners[key]]
            years1900 += [int(yr)]
            targets1900 += [authors[key]]
        if (yr > 1909) and (yr < 1920):
            sources1910 += [owners[key]]
            years1910 += [int(yr)]
            targets1910 += [authors[key]]
        if (yr > 1919) and (yr < 1930):
            sources1920 += [owners[key]]
            years1920 += [int(yr)]
            targets1920 += [authors[key]]
        if (yr > 1929) and (yr < 1940):
            sources1930 += [owners[key]]
            years1930 += [int(yr)]
            targets1930 += [authors[key]]
        if (yr > 1939) and (yr < 1950):
            sources1940 += [owners[key]]
            years1940 += [int(yr)]
            targets1940 += [authors[key]]
        if (yr > 1949) and (yr < 1955):
            sources1950 += [owners[key]]
            years1950 += [int(yr)]
            targets1950 += [authors[key]]
        if (yr > 1954) and (yr < 1960):
            sources1950B += [owners[key]]
            years1950B += [int(yr)]
            targets1950B += [authors[key]]
        if (yr > 1959) and (yr < 1970):
            sources1960 += [owners[key]]
            years1960 += [int(yr)]
            targets1960 += [authors[key]]

print("1870s:", len(years1870))
print("1880s:", len(years1880))
print("1890s:", len(years1890))
print("1900s:", len(years1900))
print("1910s:", len(years1910))
print("1920s:", len(years1920))
print("1930s:", len(years1930))
print("1940s:", len(years1940))
print("1950-54:", len(years1950))
print("1955-59:", len(years1950B))
print("1960s:", len(years1960))

df1870s = pd.DataFrame( { 'Book Owner' : sources1870, 'Book Author' : targets1870, 'Publication Year' : years1870 })
df1880s = pd.DataFrame( { 'Book Owner' : sources1880, 'Book Author' : targets1880, 'Publication Year' : years1880 })
df1890s = pd.DataFrame( { 'Book Owner' : sources1890, 'Book Author' : targets1890, 'Publication Year' : years1890 })
df1900s = pd.DataFrame( { 'Book Owner' : sources1900, 'Book Author' : targets1900, 'Publication Year' : years1900 })
df1910s = pd.DataFrame( { 'Book Owner' : sources1910, 'Book Author' : targets1910, 'Publication Year' : years1910 })
df1920s = pd.DataFrame( { 'Book Owner' : sources1920, 'Book Author' : targets1920, 'Publication Year' : years1920 })
df1930s = pd.DataFrame( { 'Book Owner' : sources1930, 'Book Author' : targets1930, 'Publication Year' : years1930 })
df1940s = pd.DataFrame( { 'Book Owner' : sources1940, 'Book Author' : targets1940, 'Publication Year' : years1940 })
df1950sA = pd.DataFrame( { 'Book Owner' : sources1950, 'Book Author' : targets1950, 'Publication Year' : years1950 })
df1950sB = pd.DataFrame( { 'Book Owner' : sources1950B, 'Book Author' : targets1950B, 'Publication Year' : years1950B })
df1960s = pd.DataFrame( { 'Book Owner' : sources1960, 'Book Author' : targets1960, 'Publication Year' : years1960 })

# Export the dataframes as CSV files to create link tables to visualize in vistorian.net
#df1870s.to_csv('owners_authors_1870.csv')
#df1880s.to_csv('owners_authors_1880.csv')
#df1890s.to_csv('owners_authors_1890.csv')
#df1900s.to_csv('owners_authors_1900.csv')
#df1910s.to_csv('owners_authors_1910.csv')
#df1920s.to_csv('owners_authors_1920.csv')
#df1930s.to_csv('owners_authors_1930.csv')
#df1940s.to_csv('owners_authors_1940.csv')
df1950sA.to_csv('owners_authors_1950A.csv')
df1950sB.to_csv('owners_authors_1950B.csv')
#df1960s.to_csv('owners_authors_1960.csv')

1870s: 302
1880s: 491
1890s: 1017
1900s: 1402
1910s: 1633
1920s: 3160
1930s: 3350
1940s: 3235
1950-54: 1778
1955-59: 2128
1960s: 1436


For explanatory purposes in DS4D Assignment 3, we'll be focusing on four people in the Modernist Libraries dataset to tell a story about what literary researchers can get out of the dataset:

* Ernest Hemingway
* Gertrude Stein
* Ezra Pound
* Georges Simenon

One Vistorian network visualization will contain every book these four people owned as recorded in the dataset, and another will contain owners of books written by one of these four people.

In [22]:
# Make network nodes for the four people above, accounting for their presence in the dataset as author or owner
vis1AuthList = []
vis1OwnList = []
vis1DateList = []
vis1TitleList = []

vis1AuthList1800 = []
vis1AuthList1850 = []
vis1AuthList1900 = []
vis1AuthList1925 = []
vis1AuthList1950 = []

vis1OwnList1800 = []
vis1OwnList1850 = []
vis1OwnList1900 = []
vis1OwnList1925 = []
vis1OwnList1950 = []

vis1DateList1800 = []
vis1DateList1850 = []
vis1DateList1900 = []
vis1DateList1925 = []
vis1DateList1950 = []

vis1TitleList1800 = []
vis1TitleList1850 = []
vis1TitleList1900 = []
vis1TitleList1925 = []
vis1TitleList1950 = []

vis2AuthList = []
vis2OwnList = []
vis2DateList = []
vis2TitleList = []

total = len(owners)

for i in range(0, total):
    # ensure owners names are all lowercase to account for any capitalization inconsistencies that may be in the dataset
    o = (str(owners[i])).lower()
    a = (str(authors[i])).lower()
    y = StartYear[i]
    if (y != 9999) and (y > 1800):
        # Books of one another's the four authors owned
        if ("ernest hemingway" in o) or ("gertrude stein" in o) or ("ezra pound" in o) or ("georges simenon" in o):
            if ("ernest hemingway" in a) or ("gertrude stein" in a) or ("ezra pound" in a) or ("georges simenon" in a):
                vis1OwnList += [owners[i]]
                vis1AuthList += [authors[i]]
                vis1DateList += [StartYear[i]]  # rely on the earliest publication date provided if range given in dataset
                vis1TitleList += [titles[i]]
                if (y > 1799) and (y < 1850):
                    vis1OwnList1800 += [owners[i]]
                    vis1AuthList1800 += [authors[i]]
                    vis1DateList1800 += [StartYear[i]]  # rely on the earliest publication date provided if range given in dataset
                    vis1TitleList1800 += [titles[i]]
                if (y > 1849) and (y < 1900):
                    vis1OwnList1850 += [owners[i]]
                    vis1AuthList1850 += [authors[i]]
                    vis1DateList1850 += [StartYear[i]]  
                    vis1TitleList1850 += [titles[i]]
                if (y > 1899) and (y < 1925):
                    vis1OwnList1900 += [owners[i]]
                    vis1AuthList1900 += [authors[i]]
                    vis1DateList1900 += [StartYear[i]]  
                    vis1TitleList1900 += [titles[i]]
                if (y > 1924) and (y < 1950):
                    vis1OwnList1925 += [owners[i]]
                    vis1AuthList1925 += [authors[i]]
                    vis1DateList1925 += [StartYear[i]]  
                    vis1TitleList1925 += [titles[i]]
                if (y > 1949) and (y < 1970):
                    vis1OwnList1950 += [owners[i]]
                    vis1AuthList1950 += [authors[i]]
                    vis1DateList1950 += [StartYear[i]]  
                    vis1TitleList1950 += [titles[i]]

        # All books that one of the four authors wrote
        if ("ernest hemingway" in a) or ("gertrude stein" in a) or ("ezra pound" in a) or ("georges simenon" in a):
            vis2OwnList += [owners[i]]
            vis2AuthList += [authors[i]]
            vis2DateList += [StartYear[i]]  # rely on the earliest publication date provided if range given in dataset
            vis2TitleList += [titles[i]]


assert len(vis1OwnList1800) == len(vis1AuthList1800), "Owner and author lists should be the same length"
assert len(vis1DateList1900) == len(vis1AuthList1900), "Date and author lists should be the same length"
assert len(vis1TitleList1950) == len(vis1OwnList1950), "Title and owner lists should be the same length"
assert len(vis2OwnList) == len(vis2AuthList), "Owner and author lists should be the same length"
assert len(vis2DateList) == len(vis2AuthList), "Date and author lists should be the same length"
assert len(vis2TitleList) == len(vis2OwnList), "Title and owner lists should be the same length"

print(len(vis1OwnList1900))
print(len(vis1DateList1925))
print(len(vis1AuthList1950))
print(len(vis1TitleList))

22
103
50
177


In [23]:
# Create a dataframe for the four owners listed in the previous cell
A3_owners0 = pd.DataFrame( { 'Book Owner' : vis1OwnList, 'Book Author' : vis1AuthList, 'Publication Year' : vis1DateList, 'Book Title' : vis1TitleList})
#A3_owners1 = pd.DataFrame( { 'Book Owner' : vis1OwnList1800, 'Book Author' : vis1AuthList1800, 'Publication Year' : vis1DateList1800, 'Book Title' : vis1TitleList1800})
#A3_owners2 = pd.DataFrame( { 'Book Owner' : vis1OwnList1850, 'Book Author' : vis1AuthList1850, 'Publication Year' : vis1DateList1850, 'Book Title' : vis1TitleList1850})
#A3_owners3 = pd.DataFrame( { 'Book Owner' : vis1OwnList1900, 'Book Author' : vis1AuthList1900, 'Publication Year' : vis1DateList1900, 'Book Title' : vis1TitleList1900})
#A3_owners4 = pd.DataFrame( { 'Book Owner' : vis1OwnList1925, 'Book Author' : vis1AuthList1925, 'Publication Year' : vis1DateList1925, 'Book Title' : vis1TitleList1925})
#A3_owners5 = pd.DataFrame( { 'Book Owner' : vis1OwnList1950, 'Book Author' : vis1AuthList1950, 'Publication Year' : vis1DateList1950, 'Book Title' : vis1TitleList1950})
A3_owners0.head()

Unnamed: 0,Book Author,Book Owner,Book Title,Publication Year
0,Ernest Hemingway,Ernest Hemingway,Hemingway par lui-même,1959
1,Ernest Hemingway,Ernest Hemingway,Men at war; the best war stories of all time,1955
2,Ernest Hemingway,Ernest Hemingway,Men at war; the best war stories of all time,1942
3,Ernest Hemingway,Ernest Hemingway,Men at war : the best war stories of all time,1952
4,Ernest Hemingway,Ernest Hemingway,Le vieil homme et la mer: Transcription dramat...,1955


In [16]:
A3_authors = pd.DataFrame( { 'Book Owner' : vis2OwnList, 'Book Author' : vis2AuthList, 'Publication Year' : vis2DateList, 'Book Title' : vis2TitleList})
A3_authors.tail()

Unnamed: 0,Book Author,Book Owner,Book Title,Publication Year
288,Gertrude Stein,Gertrude Stein,The Gertrude Stein first reader & three plays,1946
289,Gertrude Stein,Edith Sitwell,"A book concluding with As a wife has a cow, a ...",1926
290,Gertrude Stein,Gertrude Stein,Before the Flowers of Friendship Faded Friends...,1931
291,Gertrude Stein,Gertrude Stein,Autobiografia di Alice Toklas,1938
292,Georges Simenon,Edna St. Vincent Millay,The Man Who Watched the Trains Go By,1928


In [25]:
# Export the dataframes as CSV files to create link tables to visualize in vistorian.net
A3_owners0.to_csv('A3_owners.csv')
#A3_owners1.to_csv('A3_owners_1800-1849.csv')
#A3_owners2.to_csv('A3_owners_1850-1899.csv')
#A3_owners3.to_csv('A3_owners_1900-1924.csv')
#A3_owners4.to_csv('A3_owners_1925-1949.csv')
#A3_owners5.to_csv('A3_owners_1950-1970.csv')
#A3_authors.to_csv('A3_authors.csv')

In [36]:
# Books both EH and G Stein owned
Hem_Stein_o = []
Hem_Stein_a = []
Hem_Stein_y = []
Hem_Stein_t = []
for i in range(0, total):
    # ensure owners names are all lowercase to account for any capitalization inconsistencies
    o = (str(owners[i])).lower()
    a = (str(authors[i])).lower()
    y = StartYear[i]                 # rely on the earliest publication date provided if range given in dataset
    if (y != 9999) and (y > 1920) and (y < 1940):         #and (y > 1800) and (y < 1970):
        if ("ernest hemingway" in o) or ("gertrude stein" in o):
            Hem_Stein_o += [o]
            Hem_Stein_a += [a]
            Hem_Stein_y += [y]  
            Hem_Stein_t += [titles[i]]

# Books both EH and G Simenon owned
Hem_Sim_o = []
Hem_Sim_a = []
Hem_Sim_y = []
Hem_Sim_t = []
for i in range(0, total):
    # ensure owners names are all lowercase to account for any capitalization inconsistencies
    o = (str(owners[i])).lower()
    a = (str(authors[i])).lower()
    y = StartYear[i]
    if (y != 9999) and (y > 1920) and (y < 1940):         #and (y > 1800) and (y < 1970):
        if ("ernest hemingway" in o) or ("georges simenon" in o):
            Hem_Sim_o += [o]
            Hem_Sim_a += [a]
            Hem_Sim_y += [y] 
            Hem_Sim_t += [titles[i]]

# Books both EH and EP owned
Hem_P_o = []
Hem_P_a = []
Hem_P_y = []
Hem_P_t = []
for i in range(0, total):
    # ensure owners names are all lowercase to account for any capitalization inconsistencies
    o = (str(owners[i])).lower()
    a = (str(authors[i])).lower()
    y = StartYear[i]
    if (y != 9999) and (y > 1920) and (y < 1940):         #and (y > 1800) and (y < 1970):
        if ("ernest hemingway" in o) or ("ezra pound" in o):
            Hem_P_o += [o]
            Hem_P_a += [a]
            Hem_P_y += [y] 
            Hem_P_t += [titles[i]]
print(len(Hem_Stein_o))
print(len(Hem_Sim_a))
print(len(Hem_P_y))

1758
1535
1774


In [37]:
# Create shared ownership dataframes
EH_GStein_1920_40 = pd.DataFrame( { 'Book Owner' : Hem_Stein_o, 'Book Author' : Hem_Stein_a, 'Publication Year' : Hem_Stein_y, 'Book Title' : Hem_Stein_t})
EH_GSim_1920_40 = pd.DataFrame( { 'Book Owner' : Hem_Sim_o, 'Book Author' : Hem_Sim_a, 'Publication Year' : Hem_Sim_y, 'Book Title' : Hem_Sim_t})
EH_EP_1920_40 = pd.DataFrame( { 'Book Owner' : Hem_P_o, 'Book Author' : Hem_P_a, 'Publication Year' : Hem_P_y, 'Book Title' : Hem_P_t})

# Export shared ownership dataframes as CSV files to create link tables to visualize in vistorian.net
#EH_GStein.to_csv('shared_EH_GStein.csv')
#EH_GSim.to_csv('shared_EH_GSim.csv')
#EH_EP.to_csv('shared_EH_EP.csv')

EH_GStein_1920_40.to_csv('shared_EH_GStein_1920-40.csv')
EH_GSim_1920_40.to_csv('shared_EH_GSim_1920-40.csv')
EH_EP_1920_40.to_csv('shared_EH_EP_1920-40.csv')

In [40]:
print(A3_owners1.shape)
print(A3_owners2.shape)
print(A3_owners3.shape)
print(A3_authors.shape)

(63, 4)
(459, 4)
(2891, 4)
(293, 4)


In [42]:
GS = []
for key in authors:
    o = (authors[key]).lower()
    if ("georges" in o) or ("simenon" in o):
        GS += [key]
print(len(GS))

113


In [44]:
GStein = []
for key in authors:
    a = (authors[key]).lower()
    if ("gertrude stein" in a):
        GStein += [key]
for key in owners:
    o = (owners[key]).lower()
    if ("gertrude stein" in o):
        GStein += [key]
print(len(GStein))

990


In [48]:
EH = []
for key in authors:
    a = (authors[key]).lower()
    if ("hemingway" in a):
        EH += [key]
for key in owners:
    o = (owners[key]).lower()
    if ("hemingway" in o):
        EH += [key]
print(len(EH))

7445


In [47]:
EP = []
for key in authors:
    a = (authors[key]).lower()
    if ("e" in a) and ("pound" in a):
        EP += [key]
for key in owners:
    o = (owners[key]).lower()
    if ("e" in o) and ("pound" in o):
        EP += [key]
print(len(EP))

891


In [50]:
pub1940s = []
for d in StartYear:
    if (d < 1950) and (d > 1939):
        pub1940s += [d]
print(len(pub1940s))

3235


In [51]:
pub = []
for d in vis1DateList1900:
    if (d < 1950) and (d > 1939):
        pub += []

In [52]:
print(len(pub))

0


In [53]:
pub2 = []
for d in vis2DateList:
    if (d < 1950) and (d > 1939):
        pub2 += []
print(len(pub2))

0


Determine how many books were common across the following pairs of libraries:
* Ernest Hemingway and Ezra Pound
* Ernest Hemingway and Gertrude Stein

Determine how many books written by one another that the following people owned in their libraries:
* Ernest Hemingway and Ezra Pound
* Ernest Hemingway and Gertrude Stein
* Ernest Hemingway and Georges Simenon

In [None]:
# Make network nodes for the four people above, accounting for their presence in the dataset as author or owner
vis1AuthList1800 = []
vis1AuthList1850 = []
vis1AuthList1900 = []
vis1OwnList1800 = []
vis1OwnList1850 = []
vis1OwnList1900 = []
vis1DateList1800 = []
vis1DateList1850 = []
vis1DateList1900 = []
vis1TitleList1800 = []
vis1TitleList1850 = []
vis1TitleList1900 = []
vis2AuthList = []
vis2OwnList = []
vis2DateList = []
vis2TitleList = []
total = len(owners)
for i in range(0, total):
    # ensure owners names are all lowercase to account for any capitalization inconsistencies that may be in the dataset
    o = (str(owners[i])).lower()
    y = StartYear[i]
    if (y != 9999) and (y > 1800):
        if ("ernest hemingway" in o) or ("gertrude stein" in o) or ("ezra pound" in o) or ("georges simenon" in o):
            if (y > 1799) and (y < 1850):
                vis1OwnList1800 += [owners[i]]
                vis1AuthList1800 += [authors[i]]
                vis1DateList1800 += [StartYear[i]]  # rely on the earliest publication date provided if range given in dataset
                vis1TitleList1800 += [titles[i]]
            if (y > 1849) and (y < 1900):
                vis1OwnList1850 += [owners[i]]
                vis1AuthList1850 += [authors[i]]
                vis1DateList1850 += [StartYear[i]]  
                vis1TitleList1850 += [titles[i]]
            if (y > 1899) and (y < 1942):           
                vis1OwnList1900 += [owners[i]]
                vis1AuthList1900 += [authors[i]]
                vis1DateList1900 += [StartYear[i]]  
                vis1TitleList1900 += [titles[i]]



        a = (str(authors[i])).lower()
        if ("ernest hemingway" in a) or ("gertrude stein" in a) or ("ezra pound" in a) or ("georges simenon" in a):
            vis2OwnList += [owners[i]]
            vis2AuthList += [authors[i]]
            vis2DateList += [StartYear[i]]  # rely on the earliest publication date provided if range given in dataset
            vis2TitleList += [titles[i]]


assert len(vis1OwnList) == len(vis1AuthList), "Owner and author lists should be the same length"
#assert len(vis1DateList) == len(vis1AuthList), "Date and author lists should be the same length"
assert len(vis1TitleList) == len(vis1OwnList), "Title and owner lists should be the same length"
assert len(vis2OwnList) == len(vis2AuthList), "Owner and author lists should be the same length"
assert len(vis2DateList) == len(vis2AuthList), "Date and author lists should be the same length"
assert len(vis2TitleList) == len(vis2OwnList), "Title and owner lists should be the same length"