In [None]:
import pandas as pd

In [None]:
#Note that for this to work, the NoMoreSilence_ProjectData.tsv file needs to be
#in the same directory (folder) as this notebook file, which should also be the
#directory you started the jupyter notebook from.
#sep='\t' tells pandas that the columns in the file are separated by tabs.

df = pd.read_csv('NoMoreSilence_ProjectData.tsv', sep='\t')

In [None]:
#show the names of the columns in the data we're working with

df.columns

In [None]:
#taking a look at the first ten values of the 'Source' column, to see whether it contains
#something we could use to categorize by collection, like the call number.
#This looks pretty good, let's use it.

df['Source'][0:10]

In [None]:
#grab every value in the 'Source' column, which we'll use to experiment with pulling out
#just the call number for each collection.

#the loop below visits each source and breaks it apart with .split(', ') -- note the
#comma followed by a space -- then prints the resulting list of pieces.

#a plain .split threw an error here, because one of the entries was a float, not a string.
#We have to choose to either make it a string or to ignore it. This cell makes it a string
#with str(...), but it may actually be better to ignore it (with an if/else statement).

sources = df['Source']
for entry in sources:
    print(str(entry).split(', '))

In [None]:
#A fuller version of the above that actually picks out the call number. Note that this time
#we are opting to skip the row if it has no data for the source; that is what the
#"if type(entry) != str: continue" check does.

collection_list = []
for entry in sources:
    if type(entry) != str:
        continue
    pieces = entry.split(', ')
    try:
        #use the second piece, unless it is empty, in which case use the third
        picked = pieces[2] if pieces[1] == '' else pieces[1]
    except IndexError:
        #there were not enough pieces to find a call number
        picked = 'no data'
    collection_list.append(picked)

#turning the list into a set keeps only the unique values, so we can see the full extent
#of the values we're getting.
#the output shows some duplicates caused by trailing spaces, so we'll need to fix that.

collection_set = set(collection_list)
print(sorted(collection_set))

In [None]:
#We're almost there, but we want the code to remove stray spaces around the call number
#and to replace the spaces inside it with dashes for cleaner data. The below does that.

#.strip() removes *all* leading and trailing whitespace, not just a single trailing space,
#so a value like 'MSS 98-47  ' cannot turn into 'MSS-98-47-' and show up as a duplicate.
#Rows whose source is not a string are recorded as 'blank'; rows where no call number
#can be found (too few pieces, or an empty piece) are recorded as 'no-data'.

collection_list = []
for source in sources:
    call_no = 'blank'
    if isinstance(source, str):
        source_list = source.split(', ')
        try:
            #use the second piece, unless it is empty, in which case use the third
            field = source_list[1] if source_list[1] != '' else source_list[2]
        except IndexError:
            call_no = 'no-data'
        else:
            #an empty result after stripping means there was nothing usable in the piece
            call_no = field.strip().replace(' ', '-') or 'no-data'
    collection_list.append(call_no)

collection_set = set(collection_list)
print(sorted(collection_set))
        

In [None]:
#Now we need to take the code above and turn it into a function that will run on the "Source" 
#field for every line in the dataframe. We need to define its inputs a little differently,
#and do the function definition.

def get_call_no(row):
    """Return a cleaned call number parsed from ``row['Source']``.

    The source text is split on ', '. The second piece is used as the call
    number unless it is empty, in which case the third piece is used. The
    chosen piece has all leading/trailing whitespace removed and its inner
    spaces replaced with dashes (e.g. 'MSS 98-47 ' -> 'MSS-98-47').

    Parameters
    ----------
    row : mapping with a 'Source' key
        Anything that supports ``row['Source']``, such as the row that
        ``df.apply(..., axis=1)`` passes in, or a plain dict.

    Returns
    -------
    str
        'blank' if the source is not a string, 'no-data' if no call number
        can be found (too few pieces, or the chosen piece is empty), and
        otherwise the cleaned call number.
    """
    source = row['Source']
    #rows with no source hold a non-string value, so there is nothing to split
    if not isinstance(source, str):
        return 'blank'

    source_list = source.split(', ')
    try:
        field = source_list[1] if source_list[1] != '' else source_list[2]
    except IndexError:
        return 'no-data'

    #strip() removes every surrounding space (not just one), so stray spaces can
    #never become leading/trailing dashes and create duplicate categories
    return field.strip().replace(' ', '-') or 'no-data'

In [None]:
#Run the function we just defined above on every row of the dataframe (axis=1 hands it
#one row at a time) and put each result into a new column called 'call_no'. The column
#is created simply by naming it on the left-hand side: df['call_no'] = ...

df['call_no'] = df.apply(get_call_no, axis=1)

In [None]:
#if we simply call the dataframe now, we can see that our new 'call_no' column is there:

df

In [None]:
#using the .unique() method on the 'call_no' column, we can check the same thing we did
#above using set() -- each distinct value is listed once, so we can see that there are
#no near-duplicate values left.

call_nums = df['call_no'].unique()
print(call_nums)

In [None]:
#and now we can filter by collection, simply by creating a variable that marks
#all the rows whose 'call_no' matches a certain collection value (True/False per row),
#and passing this as a selection of the df variable:

act_up = df['call_no'] == 'MSS-98-47'
df[act_up]

In [None]:
#or, we can simply define the same selection right inside the brackets of df,
#without creating a separate variable first:

df[df['call_no'] == 'MSS-98-47']