# ETD Batch Create

In [2]:
import pandas as pd
import glob
import os
from zipfile import ZipFile

## Creates batch load data for Dspace
### Input files  
--------------------- 
DISCUSS: Ingest process -- should we:  
- process the Graduate School spreadsheet in OpenRefine, and add additional columns/metadata that we need FIRST, or
- set the script up to process the GS spreadsheet as-is when we receive it, with the goal of getting the items ingested *quickly*, and then correct/augment metadata later?  
First option may be cleanest in terms of what goes into the script, and puts the items online with the best info.  
Second option, though, lets us get the DOIs and the ProQuest process underway *immediately*; the ProQuest delivery is the most time-sensitive part of the process and is often **late**...
---------------------
Input files should be a spreadsheet for theses, and a spreadsheet for dissertations (NOTE: is there a reason we really need two separate scripts? I believe the fields are/should be the same, we just process them from different directories...discuss). These files should be provided to the script with the following column names:  

"Advisor" (advisor name in the form Last, First MI)  
"Name" (author name in the form Last, First MI)  
"Abstract"  
"bibref" (?)  
"extent" (?)  
"Language" (format?)  
"Subject" (?)  
"Title" (sentence-case, ideally, though I can convert it in the script, but if it's already been taken care of, don't want to do it again -- proper nouns and the like may be lowercased and need to be fixed AGAIN, so clarify)  
"Department" (degree discipline; the code reads this column as 'Department', not "Discipline")  
"Degree"  
"DegreeLevel"  
"Issued" (Year)
"Submitted" (year and semester, i.e. '2021 Summer' -- I think this is flipped around from how the spreadsheet comes from the Graduate School)


Complete the variables below to set options for input files and other variables:

In [118]:
# --- Run settings for the ETD (theses / dissertations) batch ------------------
# Working folder for this term; the two input spreadsheets live directly in it.
directory = 'T://Projects//ETD//2021Summer'
diss_file = directory + '//Summer21-ETD-DISS-openrefine-xlsx.csv'  # dissertations input file
thes_file = directory + '//Summer21-ETD-THES-openrefine-xlsx.csv'  # theses input file
jmp_file = ''                                                      # set in the JMP section

# Term is written as 'YYYY <semester>', e.g. '2021 Summer' or '2022 Spring'.
term = '2021 Summer'
issued_year = '2021'

# Identifiers for the Crossref DOI deposit.
doi_batch_name = 'MU-DOI-2021-11-18'
doi_timestamp = '20211118010101'

In [97]:
# Load the two Graduate School spreadsheets (exported from OpenRefine as CSV).
# If the original Excel workbook is used instead, read each sheet with
# pd.read_excel(GS_file, sheet_name='Dissertations' / 'Theses') and adjust the
# sheet names as necessary.
diss_data, thes_data = (
    pd.read_csv(csv_path, encoding='utf-8') for csv_path in (diss_file, thes_file)
)

# Show where the kernel is currently working.
os.getcwd()

'T:\\Projects\\ETD\\2021Summer'

In [98]:
# Make the term folder the working directory, then echo it as a sanity check.
os.chdir(directory)
os.getcwd()

'T:\\Projects\\ETD\\2021Summer'

In [99]:
# Display the theses frame to check the column names the loops below rely on.
# NOTE(review): the frame includes 'Mizzou ID' and 'Date of Birth' columns --
# clear this cell's output before sharing or committing the notebook.
thes_data

Unnamed: 0,Column,Mizzou ID,Advisor,Last Name,Name,First Name,Middle Name,MI,Date of Birth,Semester of Completion,Department,Degree,DegreeLevel,Hold 1 Year,Immediate Release,Title,Hold 1 year,Abstract
0,x,14300000.0,"Bailey, Eric",Allen,"Allen, Hannah L.",Hannah,Louise,L.,1998-07-06T05:00:00Z,Summer 2021,Animal Sciences,M.S.,Masters,,X,"EFFECTS OF ESSENTIAL OILS ON THE PERFORMANCE, ...",,Concerns and changes in policy related to use ...
1,x,14300000.0,"Thomas, Jordan",Andersen,"Andersen, Carson M.",Carson,McKenna,M.,1996-05-01T05:00:00Z,Summer 2021,Animal Sciences,M.S.,Masters,,X,EVALUATION OF THE 7 & 7 SYNCH PROTOCOL FOR CON...,,Experiment 1 was conducted to compare 7 &amp; ...
2,x,14300000.0,,Baert,"Baert, Nicholas W.",Nicholas,Wallace,W.,1995-08-18T05:00:00Z,Summer 2021,Plant Insect and Microbial Sciences,M.S.,Masters,,,GENOTYPIC VARIATION IN MAIZE NODAL ROOT GROWTH...,X,
3,x,10300000.0,,Bernardin,"Bernardin, Courtney J.",Courtney,Jorgenson,J.,1990-05-10T05:00:00Z,Summer 2021,Psychology,M.A.,Masters,,,SOCIAL CAMOUFLAGING AND MENTAL HEALTH IN ADOLE...,X,
4,x,14300000.0,,Brown,"Brown, Sarah A.",Sarah,Anne,A.,1997-04-09T05:00:00Z,Summer 2021,Human Dimensions of Natural Resources/Natural ...,M.S.,Masters,,,PERCEPTIONS OF MISSOURI LANDOWNERS WITH CONSER...,X,
5,x,14300000.0,"Lupo, Anthony",Cain,"Cain, Thomas",Thomas,,,2021-06-30T05:00:00Z,Summer 2021,"Soil, Environmental and Atmospheric Sciences/N...",M.S.,Masters,,X,THE VARIABILITY OF THE EL NIÑO SOUTHERN OSCILL...,,The El Niño Southern Oscillation (ENSO) has lo...
6,x,14400000.0,,Cheng,"Cheng, Jiayue",Jiayue,,,1998-04-18T05:00:00Z,Summer 2021,Biological Engineering,M.S.,Masters,,,Fermentative production of Xylitol from Hemice...,X,
7,x,10300000.0,"Donnelly, Lindsay",Couto,"Couto, Jason I.",Jason,Ian,I.,1987-08-21T05:00:00Z,Summer 2021,Biomedical Sciences/Veterinary Medicine and Su...,M.S.,Masters,,X,PROSPECTIVE EVALUATION OF THE FECAL MICROBIOME...,,The fecal microbiome composition has been asso...
8,x,14300000.0,,Fei,"Fei, Qihui",Qihui,,,1996-10-04T05:00:00Z,Summer 2021,Chemical Engineering,M.S.,Masters,,,Development of multiscale porous elastomer sub...,X,
9,x,18100000.0,,Flowers,"Flowers, Kali N.",Kali,Nicole,N.,1995-09-26T05:00:00Z,Summer 2021,Applied Behavior Analysis,M.S.,Masters,,,EVALUATION OF DIFFERENT ARRANGEMENTS OF MULTIP...,X,


In [41]:
# Some characters cause errors in the batch XML import, so item text is passed
# through str.translate() with the table built here as each field is inserted
# into the XML. The mapping comes in two groups: the XML-reserved characters,
# which become entity references, and assorted symbols, which are swapped for
# plain-text stand-ins or numeric character references.
xml_reserved = {
    '&': '&amp;',
    '\'': '&apos;',
    '\"': '&quot;',
    '<': '&lt;',
    '>': '&gt;',
}

symbol_substitutes = {
    '%': ' percent',
    '°': ' degrees',
    '≥': '[greater than or equal to]',
    '≤': '[less than or equal to]',
    '©': '[copyright]',
    '™': '[trademark]',
    '—': '--',
    'α': '[alpha]',
    'β': '[beta]',
    'μ': '&#181;',
    '×': 'x',
    '±': '[plus or minus]',
    '~': '&#126;',
    '♭': '[flat]',
    '’': "'",
}

# Single lookup used everywhere else in the notebook.
trans_dict = {**xml_reserved, **symbol_substitutes}
trans_table = str.maketrans(trans_dict)

# DISSERTATIONS

In [None]:
# Build one DSpace batch-import item folder per dissertation row.
# For each row that is released immediately and has all required metadata this
# writes three files into
#   <directory>//Dissertations//mospace-batchfiles//<Last><First>-batchfiles//
#     dublin_core.xml      -- title, author, advisor, dates, abstract, ...
#     metadata_thesis.xml  -- degree discipline / name / level ("thesis" schema)
#     contents             -- name of the PDF that belongs to the item
# Every spreadsheet value goes through .translate(trans_table) so characters
# that break the XML import are substituted; .capitalize() converts the title
# to sentence case.

# Columns that must be filled in before an item can be generated.
diss_required_columns = ['Advisor', 'Title', 'Name', 'Abstract', 'Department',
                         'Degree', 'DegreeLevel', 'Last Name', 'First Name']

for index, row in diss_data.iterrows():
    if pd.isna(row['Immediate Release']):          # for now, skip holds
        continue

    # Blank cells come back from pandas as NaN (a float), which has no
    # .translate(); report and skip the row instead of stopping the whole batch.
    missing_columns = [col for col in diss_required_columns if pd.isna(row[col])]
    if missing_columns:
        print('SKIPPED (missing ' + ', '.join(missing_columns) + '):', row['Name'])
        continue

    print(row['Title'])

    # A title broken over two lines is joined with a space (same treatment as
    # the theses loop) before substitution and sentence-casing.
    title = row['Title'].replace('\n', ' ').translate(trans_table).capitalize()
    author = row['Name'].translate(trans_table)
    advisor = row['Advisor'].translate(trans_table)
    abstract = row['Abstract'].translate(trans_table)
    discipline = row['Department'].translate(trans_table)
    degree_name = str(row['Degree']).translate(trans_table)
    degree_level = str(row['DegreeLevel']).translate(trans_table)

    # NOTE(review): dissertations are emitted with type "Thesis" -- confirm this is intended.
    dc_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core>
 <dcvalue element="title" qualifier="none" language="eng">{title}</dcvalue>
 <dcvalue element="contributor" qualifier="author" language="eng">{author}</dcvalue>
 <dcvalue element="contributor" qualifier="advisor" language="eng">{advisor}</dcvalue>
 <dcvalue element="date" qualifier="submitted" language="eng">{term}</dcvalue>
 <dcvalue element="description" qualifier="abstract" language="eng">{abstract}</dcvalue>

 <dcvalue element="date" qualifier="issued" language="eng">{issued_year}</dcvalue> 
 <dcvalue element="language" language="eng">English</dcvalue>
 <dcvalue element="language" qualifier="iso" language="eng">eng</dcvalue>
 <dcvalue element="publisher" qualifier="none" language="eng">University of Missouri--Columbia</dcvalue>
 <dcvalue element="type" qualifier="none" language="eng">Thesis</dcvalue>

</dublin_core>'''
    print(dc_xml)

    thesis_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core schema="thesis">

 <dcvalue element="degree" qualifier="discipline" language="eng">{discipline}</dcvalue>
 <dcvalue element="degree" qualifier="name" language="eng">{degree_name}</dcvalue>
 <dcvalue element="degree" qualifier="level" language="eng">{degree_level}</dcvalue>

</dublin_core>'''
    print(thesis_xml)

    file_stem = row['Last Name']+row['First Name']
    directory_name = directory+'//Dissertations//mospace-batchfiles//'+file_stem+'-batchfiles//'

    print(directory_name)
    # exist_ok lets the cell be re-run without failing on folders from an earlier run.
    os.makedirs(directory_name, exist_ok=True)
    # The XML declares UTF-8, so write UTF-8 explicitly; the platform default
    # encoding (cp1252 on Windows) would corrupt non-ASCII characters.
    with open(directory_name+'dublin_core.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(dc_xml)
    with open(directory_name+'metadata_thesis.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(thesis_xml)
    with open(directory_name+'contents', 'w', encoding='utf-8') as contents_file:
        contents_file.write(file_stem+'.pdf')

In [107]:
# Show the current working directory.
os.getcwd()

'T:\\Projects\\ETD\\2021Summer'

## THESES

In [None]:
# Build one DSpace batch-import item folder per thesis row.
# For each row that is released immediately and has all required metadata this
# writes dublin_core.xml, metadata_thesis.xml and a contents file into
#   <directory>//Theses//mospace-batchfiles//<Last><First>-batchfiles//
# and copies <Last><First>Research.pdf from <directory>//Theses// next to them.
# .translate(trans_table) substitutes characters that break the XML import,
# .capitalize() converts the title to sentence case, and .replace() joins a
# title that was broken over two lines.

newline_char = '\n' # A multiline title was found; .replace(newline_char, ' ') turns the line break into a space
from shutil import copy # Used to place each PDF in its item folder

# Columns that must be filled in before an item can be generated.
thes_required_columns = ['Advisor', 'Title', 'Name', 'Abstract', 'Department',
                         'Degree', 'DegreeLevel', 'Last Name', 'First Name']

for index, row in thes_data.iterrows():
    # For now, holds are skipped.
    # TODO: consider staging held files separately so they are ready for upload
    # when the hold expires, or uploading them with an embargo policy (is there
    # a Dspace XML statement for embargoes?).
    if pd.isna(row['Immediate Release']):
        continue

    # Blank cells come back from pandas as NaN (a float), which has no
    # .translate(); report and skip the row instead of stopping the whole batch.
    missing_columns = [col for col in thes_required_columns if pd.isna(row[col])]
    if missing_columns:
        print('SKIPPED (missing ' + ', '.join(missing_columns) + '):', row['Name'])
        continue

    print(row['Name'], row['Title'])

    title = row['Title'].translate(trans_table).capitalize().replace(newline_char, ' ')
    author = row['Name'].translate(trans_table)
    advisor = row['Advisor'].translate(trans_table)
    abstract = row['Abstract'].translate(trans_table)
    discipline = row['Department'].translate(trans_table)
    degree_name = str(row['Degree']).translate(trans_table)
    degree_level = str(row['DegreeLevel']).translate(trans_table)

    dc_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core>
 <dcvalue element="title" qualifier="none" language="eng">{title}</dcvalue>
 <dcvalue element="contributor" qualifier="author" language="eng">{author}</dcvalue>
 <dcvalue element="contributor" qualifier="advisor" language="eng">{advisor}</dcvalue>
 <dcvalue element="date" qualifier="submitted" language="eng">{term}</dcvalue>
 <dcvalue element="description" qualifier="abstract" language="eng">{abstract}</dcvalue>

 <dcvalue element="date" qualifier="issued" language="eng">{issued_year}</dcvalue> 
 <dcvalue element="language" language="eng">English</dcvalue>
 <dcvalue element="language" qualifier="iso" language="eng">eng</dcvalue>
 <dcvalue element="publisher" qualifier="none" language="eng">University of Missouri--Columbia</dcvalue>
 <dcvalue element="type" qualifier="none" language="eng">Thesis</dcvalue>

</dublin_core>'''

    thesis_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core schema="thesis">

 <dcvalue element="degree" qualifier="discipline" language="eng">{discipline}</dcvalue>
 <dcvalue element="degree" qualifier="name" language="eng">{degree_name}</dcvalue>
 <dcvalue element="degree" qualifier="level" language="eng">{degree_level}</dcvalue>

</dublin_core>'''

    pdf_name = row['Last Name']+row['First Name']+'Research.pdf'
    directory_name = directory+'//Theses//mospace-batchfiles//'+row['Last Name']+row['First Name']+'-batchfiles//'

    print(directory_name)

    # exist_ok lets the cell be re-run without failing on folders from an earlier run.
    os.makedirs(directory_name, exist_ok=True)
    # The XML declares UTF-8, so write UTF-8 explicitly; the platform default
    # encoding (cp1252 on Windows) would corrupt non-ASCII characters.
    with open(directory_name+'dublin_core.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(dc_xml)
    with open(directory_name+'metadata_thesis.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(thesis_xml)
    with open(directory_name+'contents', 'w', encoding='utf-8') as contents_file:
        contents_file.write(pdf_name)

    # Best effort: a missing or unreadable PDF is reported and the loop moves on.
    try:
        copy(directory+'//Theses//'+pdf_name, directory_name)
    except OSError as e:
        print('FILE COPY ERROR: '+directory+'//Theses//'+pdf_name+' -- ')
        print(e)

# Journalism Masters' Projects

In [43]:
# Settings and input data for the Journalism Masters' Projects (JMP) batch.
# NOTE: this cell re-points `directory`, `term` and `issued_year`, which the
# ETD cells above also use -- re-run the ETD settings cell before going back
# to the theses/dissertations loops.
# Raw strings keep the Windows backslashes literal; without the r prefix
# sequences such as '\P' and '\J' are invalid escape sequences.
directory = r'T:\Projects\JMP\Journalism Master ProjectsFall2021\Immediate Release Worldwide Access'
jmp_file = r'T:\Projects\JMP\Journalism Master ProjectsFall2021\2021-Fall-JMPs.csv'
jmp_data = pd.read_csv(jmp_file, encoding='utf-8')
term = '2021 Fall' #format YYYY <semester>, ie '2021 Summer' or '2022 Spring'
issued_year = '2021'

In [44]:
# Display the JMP spreadsheet as loaded, before any derived columns are added.
jmp_data

Unnamed: 0,Name,Department,Term,Title,Abstract,Unnamed: 5,Unnamed: 6,Advisor,Advisor 2,www,Unnamed: 10,Unnamed: 11,hold,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,"Zhao, Yiheng",Journalism,2021 Fall,Public-powered journalism in local news report...,This project aims to help the local news lands...,,,Jim Flink,,X,,,,,X,X,X,X,none
1,"Vaca, Marilyn",Journalism,2021 Fall,Speed and accuracy: how a medium market televi...,News of emergency use COVID-19 vaccines domina...,,,Lynda Kraxberger,,X,,,,,X,X,X,X,none
2,"Carpenter, Sarah",Journalism,2021 Fall,What training and resources would help journal...,There is no question that covering traumatic e...,,,Stacey Woelfel,,,,,X (Then Worldwide Access),,X,X,X,X,
3,"McManus, Camille",Journalism,2021 Fall,Promoting public health: vaccine communication...,After nearly two years of the COVID-19 pandemi...,,,John Stemmle,,X,,,,,X,X,X,X,1 file
4,"Lucas, Emmy",Journalism,2021 Fall,The moneymakers: business publications and alt...,The media industry has seen a growing consolid...,,,Heather Isherwood,,X,,,,,x,x,x,x,none
5,"Kurpius, John",Journalism,2021 Fall,"Exploring culture: structure, agents, and dive...",The US advertising and public relations indust...,,,John Kurpius,Holly Higginbotham,X,,,,,X,X,X,X,


In [45]:
# Derive separate name parts from the combined columns.
# 'Name' is "Last, First": split on the first ", " only, so a later comma
# cannot produce a third column.
jmp_data[['last_name', 'first_name']] = jmp_data['Name'].str.split(', ', n=1, expand=True)
# 'Advisor' is "First Last": split on the LAST space only, so a middle name or
# initial stays with the first name and exactly two columns always come back.
# Multi-word surnames (e.g. "Van Dyke") still need a manual fix.
jmp_data[['adv_first_name', 'adv_last_name']] = (
    jmp_data['Advisor'].str.strip().str.rsplit(' ', n=1, expand=True)
)

In [37]:
# Re-display the frame to check the derived name columns.
jmp_data

Unnamed: 0,Name,Department,Term,Title,Abstract,Unnamed: 5,Unnamed: 6,Advisor,Advisor 2,www,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,last_name,first_name,adv_first_name,adv_last_name
0,"Zhao, Yiheng",Journalism,2021 Fall,Public-Powered Journalism in Local News Report...,,,,Jim Flink,,X,...,,X,X,X,X,none,Zhao,Yiheng,Jim,Flink
1,"Vaca, Marilyn",Journalism,2021 Fall,Speed and Accurracy: How a Medium Market Telev...,,,,Lynda Kraxberger,,X,...,,X,X,X,X,none,Vaca,Marilyn,Lynda,Kraxberger
2,"Carpenter, Sarah",Journalism,2021 Fall,What training and resources would help journal...,,,,Stacey Woelfel,,,...,,X,X,X,X,,Carpenter,Sarah,Stacey,Woelfel
3,"McManus, Camille",Journalism,2021 Fall,Promoting Public Health: Vaccine Communication...,,,,John Stemmle,,X,...,,X,X,X,X,1 file,McManus,Camille,John,Stemmle
4,"Lucas, Emmy",Journalism,2021 Fall,The Moneymakers: Business Publications and Alt...,,,,Heather Isherwood,,X,...,,x,x,x,x,none,Lucas,Emmy,Heather,Isherwood
5,"Kurpius, John",Journalism,2021 Fall,"Exploring Culture: Structure, Agents, and Dive...",,,,John Kurpius,Holly Higginbotham,X,...,,X,X,X,X,,Kurpius,John,John,Kurpius


In [38]:
# Sanity check: list every student folder under `directory`, the files inside
# it, and the spreadsheet row (if any) whose last name matches the folder name.
os.chdir(directory)
for dir_name in os.listdir():
    # Stray files in the folder would make os.listdir() below raise.
    if not os.path.isdir(dir_name):
        continue
    print(dir_name)
    print(os.listdir('./'+dir_name))
    # Folders are named "Last, First"; an empty Series here means no matching row.
    print(jmp_data[jmp_data['last_name'] == dir_name.split(',')[0]]['last_name'])

Kurpius, John
['abstract.pdf', 'analysis.pdf', 'Electronic Release Form.pdf', 'keywords.pdf', 'projectreport.pdf']
5    Kurpius
Name: last_name, dtype: object
Lucas, Emmy
['abstract.pdf', 'analysis.pdf', 'Keywords.pdf', 'Lucas_Electronic Release Form.pdf', 'projectreport.pdf']
4    Lucas
Name: last_name, dtype: object
McManus, Camille
['McManus_abstract.pdf', 'McManus_analysis.pdf', 'McManus_keywords.pdf', 'McManus_ProjectReleaseForm.pdf', 'McManus_projectreport.pdf', 'Media Folder']
3    McManus
Name: last_name, dtype: object
Vaca, Marilyn
['Abstract.pdf', 'Analysis.pdf', 'ElectronicReleaseForm.pdf', 'keywords.pdf', 'Projectreport.pdf']
1    Vaca
Name: last_name, dtype: object
Zhao, Yiheng
['abstract.pdf', 'analysis.pdf', 'keywords.pdf', 'projectreport.pdf', 'proposal.pdf', 'ReleaseForm Y.Zhao.pdf']
0    Zhao
Name: last_name, dtype: object


In [29]:
# Spot-check a single student's row by last name.
jmp_data[jmp_data['last_name'] == 'Kurpius']

Unnamed: 0,Name,Degree,Term,Title,Unnamed: 4,Unnamed: 5,Unnamed: 6,Advisor,Advisor 2,www,...,Unnamed: 11,hold,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,last_name,first_name
5,"Kurpius, John",Journalism,2021 Fall,"Exploring Culture: Structure, Agents, and Dive...",,,,John Kurpius,Holly Higginbotham,X,...,,,,X,X,X,X,,Kurpius,John


In [58]:
# Build one DSpace batch-import item folder per Journalism Masters' Project.
# For each row this writes dublin_core.xml, metadata_thesis.xml and a contents
# file into <directory>//mospace-batchfiles//<last><first>-batchfiles// and
# copies every file from the student's "<Name>" folder into it, prefixed with
# <last><first>.

newline_char = '\n' # A multiline title was found; .replace(newline_char, ' ') turns the line break into a space
from shutil import copy # Used to place each project file in its item folder

# Columns that must be filled in before an item can be generated.
jmp_required_columns = ['Name', 'Title', 'Abstract', 'Department',
                        'adv_last_name', 'adv_first_name', 'last_name', 'first_name']

for index, row in jmp_data.iterrows():
    # Holds are not filtered on a spreadsheet column here; a row is skipped below
    # when its folder is absent from `directory`.
    # TODO: consider staging held files separately so they are ready for upload
    # when the hold expires, or uploading them with an embargo policy (is there
    # a Dspace XML statement for embargoes?).

    # Blank cells come back from pandas as NaN (a float), which has no
    # .translate(); report and skip the row instead of stopping the whole batch.
    missing_columns = [col for col in jmp_required_columns if pd.isna(row[col])]
    if missing_columns:
        print('SKIPPED (missing ' + ', '.join(missing_columns) + '):', row['Name'])
        continue

    print(row['Name'], row['Title'])

    source_dir = directory+'\\'+row['Name']
    if not os.path.isdir(source_dir):
        # No files to ingest; skip before anything is written for this row.
        print('SKIPPED (no source folder): '+source_dir)
        continue

    title = row['Title'].translate(trans_table).capitalize().replace(newline_char, ' ')
    author = row['Name'].translate(trans_table)
    advisor = row['adv_last_name'].translate(trans_table)+', '+row['adv_first_name'].translate(trans_table)
    abstract = row['Abstract'].translate(trans_table)
    discipline = row['Department'].translate(trans_table)

    dc_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core>
 <dcvalue element="title" qualifier="none" language="eng">{title}</dcvalue>
 <dcvalue element="contributor" qualifier="author" language="eng">{author}</dcvalue>
 <dcvalue element="contributor" qualifier="advisor" language="eng">{advisor}</dcvalue>
 <dcvalue element="date" qualifier="submitted" language="eng">{term}</dcvalue>
 <dcvalue element="description" qualifier="abstract" language="eng">{abstract}</dcvalue>

 <dcvalue element="date" qualifier="issued" language="eng">{issued_year}</dcvalue> 
 <dcvalue element="language" language="eng">English</dcvalue>
 <dcvalue element="language" qualifier="iso" language="eng">eng</dcvalue>
 <dcvalue element="publisher" qualifier="none" language="eng">University of Missouri--Columbia</dcvalue>
 <dcvalue element="type" qualifier="none" language="eng">Project</dcvalue>

</dublin_core>'''

    # Degree name and level are fixed for this batch rather than read from the sheet.
    thesis_xml = f'''<?xml version="1.0" encoding="UTF-8"?>
<dublin_core schema="thesis">

 <dcvalue element="degree" qualifier="discipline" language="eng">{discipline}</dcvalue>
 <dcvalue element="degree" qualifier="name" language="eng">M.A.</dcvalue>
 <dcvalue element="degree" qualifier="level" language="eng">Masters</dcvalue>

</dublin_core>'''

    file_prefix = row['last_name']+row['first_name']
    directory_name = directory+'//mospace-batchfiles//'+file_prefix+'-batchfiles//'

    print(directory_name)

    # exist_ok lets the cell be re-run without failing on folders from an earlier run.
    os.makedirs(directory_name, exist_ok=True)
    # The XML declares UTF-8, so write UTF-8 explicitly; the platform default
    # encoding (cp1252 on Windows) would corrupt non-ASCII characters.
    with open(directory_name+'dublin_core.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(dc_xml)
    with open(directory_name+'metadata_thesis.xml', 'w', encoding='utf-8') as xml_file:
        xml_file.write(thesis_xml)
    with open(directory_name+'contents', 'w', encoding='utf-8') as contents_file:
        for file in os.listdir(source_dir):
            print(file)
            target_name = file_prefix+file.capitalize()
            try:
                copy(source_dir+'\\'+file, directory_name+'\\'+target_name)
            except OSError as e:
                # Sub-folders and unreadable files land here; report the real path.
                print('FILE COPY ERROR: '+source_dir+'\\'+file+' -- ')
                print(e)
                continue
            # Only list files that were actually copied, so `contents` never
            # references something missing from the item folder.
            contents_file.write(target_name+'\n')

Zhao, Yiheng Public-powered journalism in local news reporting: public opinion research for local newsroom
T:\Projects\JMP\Journalism Master ProjectsFall2021\Immediate Release Worldwide Access//mospace-batchfiles//ZhaoYiheng-batchfiles//
abstract.pdf
analysis.pdf
keywords.pdf
projectreport.pdf
proposal.pdf
Vaca, Marilyn Speed and accuracy: how a medium market television news operation fact-checked information on covid-19 vaccine. 
T:\Projects\JMP\Journalism Master ProjectsFall2021\Immediate Release Worldwide Access//mospace-batchfiles//VacaMarilyn-batchfiles//
Abstract.pdf
Analysis.pdf
keywords.pdf
Projectreport.pdf
Carpenter, Sarah What training and resources would help journalists covering traumatic events?
T:\Projects\JMP\Journalism Master ProjectsFall2021\Immediate Release Worldwide Access//mospace-batchfiles//CarpenterSarah-batchfiles//
abstract.pdf
analysis.pdf
keywords.pdf
projectreport.pdf
McManus, Camille Promoting public health: vaccine communication efforts across rural Miss

## Crossref DOIs
## Creates DOI batch metadata for Crossref

In [33]:
# CROSSREF DOI XML
# THIS NEEDS TO BE DONE AFTER THE ITEMS ARE LOADED IN MOSPACE (the handle must already be assigned).
# The input data needs these fields:
# First name, Last name, Advisor First name, Advisor Last Name, Title, Abstract, Month, Year, Degree, Department, Handle URL, DOI, ORCID?
#            The Handle URL is generated by the MOspace deposit and included in the MOspace spreadsheet. The DOI can be derived from the handle:
#            it uses the last digit group of the handle (10.32469/10355/<handleID>).
# Steps:
#     Export the MOspace spreadsheet(s) and filter rows to the applicable ETDs. Theses and dissertations may be combined...
#     In OpenRefine:
#         Split names
#         Generate DOI string
#         Add month column?
# XML is based on the template at: https://gitlab.com/crossref/schema/-/blob/master/best-practice-examples/dissertation.5.3.0.xml

# NOTE(review): hard-coded path into one user's Downloads folder -- update before running elsewhere.
data_file = 'C:\\Users\\pryors\\Downloads\\10355-85734-csv.csv'
etd_data = pd.read_csv(data_file, encoding='utf-8')

# These two assignments replace the doi_batch_name / doi_timestamp values set in the settings cell near the top.
doi_batch_name = 'MU-DOI-2021-12-06'
doi_timestamp = '20211206010101'

In [36]:
# Opening part of the Crossref deposit file: XML declaration, <doi_batch> root
# with its namespace declarations, the <head> block, and the opening <body> tag.
# This is an f-string, so doi_batch_name and doi_timestamp are filled in when
# this cell runs -- re-run it after changing either value.
crossref_head = f'''<?xml version="1.0" encoding="UTF-8"?>
<doi_batch xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.crossref.org/schema/5.3.0 https://www.crossref.org/schemas/crossref5.3.0.xsd"
  xmlns="http://www.crossref.org/schema/5.3.0" xmlns:jats="http://www.ncbi.nlm.nih.gov/JATS1"
  xmlns:fr="http://www.crossref.org/fundref.xsd" xmlns:mml="http://www.w3.org/1998/Math/MathML"
  version="5.3.0">
  <head>
    <doi_batch_id>{doi_batch_name}</doi_batch_id>
    <timestamp>{doi_timestamp}</timestamp>
    <depositor>
      <depositor_name>University of Missouri</depositor_name>
      <email_address>mospace@missouri.edu</email_address>
    </depositor>
    <registrant>University of Missouri</registrant>
  </head>
  
  <body>
'''

# Closing part of the deposit file: ends <body> and the <doi_batch> root.
crossref_end = '''
</body>
</doi_batch>'''

def _crossref_text(value, table=None):
    """Return *value* as text that is safe to place inside an XML element.

    Missing values (None / NaN) become an empty string. Floats holding a whole
    number, which pandas produces for numeric columns that contain blanks, are
    rendered without the trailing ".0". When *table* is given the text is run
    through ``str.translate`` with it; otherwise only the XML-reserved
    characters &, < and > are escaped.
    """
    from xml.sax.saxutils import escape  # local import keeps this cell self-contained

    if pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    text = str(value)
    return text.translate(table) if table is not None else escape(text)


def build_diss_xml(row_data):
    """Build the Crossref ``<dissertation>`` element for one ETD.

    Parameters
    ----------
    row_data : mapping (e.g. one row of ``etd_data``)
        Must provide 'First Name', 'Last Name', 'Advisor First Name',
        'Advisor Last Name', 'dc.title[eng]', 'dc.description.abstract[eng]',
        'Month', 'dc.date.issued[eng]', 'thesis.degree.discipline[eng]',
        'thesis.degree.name[eng]', 'DOI' and 'dc.identifier.uri'.

    Returns
    -------
    str
        The XML fragment, ready to be placed between ``crossref_head`` and
        ``crossref_end``.

    Names, title, abstract, department and degree are passed through
    ``trans_table``. Month, year, DOI and resource URL only get plain XML
    escaping, so a '%' in a URL is not rewritten to ' percent'. The
    ``<jats:abstract>`` element is left out when the abstract is blank.
    """
    abstract = _crossref_text(row_data['dc.description.abstract[eng]'], trans_table)
    abstract_xml = ''
    if abstract:
        abstract_xml = f'''   <jats:abstract>
    <jats:p xml:lang="en">{abstract}</jats:p>
   </jats:abstract>
'''

    dissertation_xml = f'''<dissertation language="en" publication_type="full_text">
    <contributors>
     <person_name contributor_role="author" sequence="first">
      <given_name>{_crossref_text(row_data['First Name'], trans_table)}</given_name>
      <surname>{_crossref_text(row_data['Last Name'], trans_table)}</surname>
     </person_name>
     <person_name contributor_role="chair" sequence="additional">
      <given_name>{_crossref_text(row_data['Advisor First Name'], trans_table)}</given_name>
      <surname>{_crossref_text(row_data['Advisor Last Name'], trans_table)}</surname>
     </person_name>
   </contributors>
   <titles>
     <title>{_crossref_text(row_data['dc.title[eng]'], trans_table)}</title>
   </titles>
{abstract_xml}   <approval_date>
    <month>{_crossref_text(row_data['Month'])}</month>
    <year>{_crossref_text(row_data['dc.date.issued[eng]'])}</year>
   </approval_date>
   <institution>
    <institution_name>University of Missouri--Columbia</institution_name>
    <institution_id type="ror">https://ror.org/02ymw8z06</institution_id>
    <institution_id type="isni">https://isni.org/isni/0000000121623504</institution_id>
    <institution_id type="wikidata">https://www.wikidata.org/wiki/Q579968</institution_id>
    <institution_department>{_crossref_text(row_data['thesis.degree.discipline[eng]'], trans_table)}</institution_department>
   </institution>
   <degree>{_crossref_text(row_data['thesis.degree.name[eng]'], trans_table)}</degree>
   <doi_data>
    <doi>{_crossref_text(row_data['DOI'])}</doi>
    <resource>{_crossref_text(row_data['dc.identifier.uri'])}</resource>
   </doi_data>
   </dissertation>
   '''

    return dissertation_xml

# Assemble the deposit: header, one <dissertation> element per ETD row, footer.
xml = [
    crossref_head,
    *(build_diss_xml(row) for index, row in etd_data.iterrows()),
    crossref_end,
]

xml_string = ''.join(xml)

# The file is written to the current working directory as <doi_batch_name>.xml.
# The XML declares UTF-8, so write UTF-8 explicitly; the platform default
# encoding (cp1252 on Windows) would corrupt non-ASCII characters.
with open(doi_batch_name+'.xml', 'w', encoding='utf-8') as xml_file:
    xml_file.write(xml_string)

In [None]:
# Display the assembled deposit XML for a visual check.
xml_string