# Project Psyched: A Closer Look Into Reproducibility In Psychological Research

## Full Text Mining Script: Part 1 - Metadata + P-values
This script is set up for ProQuest TDM Studio's corpus of Psychology texts.

Author: Yuyang Zhong (2020). This work is licensed under a [Creative Commons BY-NC-SA 4.0 International
License][cc-by].

![CC BY-NC-SA 4.0][cc-by-shield]

[cc-by]: http://creativecommons.org/licenses/by-nc-sa/4.0/
[cc-by-shield]: https://img.shields.io/badge/license-CC--BY--NC--SA%204.0-blue

#### Setup & Imports

In [1]:
import os
import json
import xml.etree.ElementTree as ET
import re

In [2]:
# Directory holding the input article files (passed to capture() as in_path)
in_path = "../articles/samples/"
# Directory passed to capture() as out_path
out_path = "../data/samples/"

In [3]:
# Show the entries of the input directory
os.listdir(in_path)

['1554207703.txt', '1011297999.txt', '1509629602.txt']

In [10]:
# Matches "p", optional whitespace, a comparison sign (<, > or =) and a decimal
# with at most one leading digit; the group keeps the sign and the number.
_P_VALUE_RE = re.compile(r'p\s*([<>=]\s*\d?\.\d+)')


def _descend(node, *tags):
    """
    Follow ``find`` through the given tags, one nesting level per tag.

    @parameter node: element to start from (may be None)
    @parameter tags: child tag names, outermost first
    @return: the element reached, or None as soon as one step is missing
    """
    for tag in tags:
        if node is None:
            return None
        node = node.find(tag)
    return node


def _optional_text(node, *tags):
    """
    Text of the element at the nested tag path, or '' when the path is absent.
    """
    found = _descend(node, *tags)
    return found.text if found is not None else ''


def _present_children(container, item_tag, child_tag):
    """
    For every ``item_tag`` element under ``container``, take its first
    ``child_tag`` child; items that have no such child are skipped.
    """
    children = (item.find(child_tag) for item in container.iter(item_tag))
    return [child for child in children if child is not None]


def _extract_p_values(raw_text):
    """
    Pull reported p-values (e.g. '< .05', '= 0.20') out of an article body.

    @parameter raw_text: article body, possibly containing HTML markup
    @return: list of strings, each a comparison sign followed by a decimal
    """
    import html

    # Strip HTML tags first so that escaped comparison signs (&lt; / &gt;)
    # are not mistaken for tag delimiters once they are decoded.
    text = re.sub(r'<[^>]*>', '', raw_text)

    # Collapse each line break (and the indentation after it) into spaces
    text = re.sub(r'\n\s*', '   ', text)

    # Decode every HTML entity in a single pass; decoding '&amp;' before the
    # other entities would wrongly turn '&amp;lt;' into '<'.
    text = html.unescape(text)

    return _P_VALUE_RE.findall(text)


def capture(in_path, out_path, out_name):
    """
    An encapsulating function to scrape metadata and reported p-values from
    every article file in a directory and save them as one JSON file.

    Each regular file in ``in_path`` is parsed as XML. The result maps the
    file name to a dict with the keys 'Title', 'Date Published',
    'Peer Review' ('Yes'/'No'), 'DOI', 'Author', 'Keywords', 'Methodology',
    'References', 'Journal', 'Volume', 'Issue', 'Pages' and 'P-Values'.
    Optional fields that are missing are stored as ''.

    @parameter in_path: the path of the input files
    @parameter out_path: the path to output file (created if it does not exist)
    @parameter out_name: name for output file, without the '.json' extension

    @raises TypeError: if any parameter is not a string
    @raises AttributeError: if a file lacks a required element (title, date,
        peer-review flag, object IDs, contributors, terms or journal title)
    """

    # Make sure our parameters are strings
    for label, value in (('in_path', in_path),
                         ('out_path', out_path),
                         ('out_name', out_name)):
        if not isinstance(value, str):
            raise TypeError('{} must be a str, got {}'.format(
                label, type(value).__name__))

    # Loop through the in_path directory for files (sorted for a stable output)
    data = {}
    for f in sorted(os.listdir(in_path)):
        source = os.path.join(in_path, f)

        # Skip sub-directories such as .ipynb_checkpoints
        if not os.path.isfile(source):
            continue

        ##### Set up XML parser #####
        root = ET.parse(source).getroot()
        obj = root.find('Obj')

        ##### Capture Metadata #####
        ## Peer Reviewed: the source stores 'true'/'false'
        peer_reviewed = 'Yes' if obj.find('PeerReviewed').text == 'true' else 'No'

        ## DOI: when several ObjectIDs carry a DOI, the last one wins
        dois = _present_children(obj.find('ObjectIDs'), 'ObjectID', 'DOI')
        doi = dois[-1].text if dois else ''

        ## Author List
        authors = [
            author.find('NormalizedDisplayForm').text
            for author in _present_children(obj.find('Contributors'),
                                            'Contributor', 'Author')
        ]

        ## Keywords & Identifiers / Methodology share the same FlexTerm list
        terms = obj.find('Terms')
        identifiers = [node.text for node in
                       _present_children(terms, 'FlexTerm', 'Identifiers')]
        methodology = [node.text for node in
                       _present_children(terms, 'FlexTerm', 'Methodology')]

        ##### Capture P-Values #####
        # An absent or empty article body is treated as empty text
        raw_text = _optional_text(root, 'TextInfo', 'PreformattedData',
                                  'PsycArticles') or ''

        data[f] = {
            'Title': _descend(obj, 'TitleAtt', 'Title').text,
            'Date Published': obj.find('NumericDate').text,
            'Peer Review': peer_reviewed,
            'DOI': doi,
            'Author': authors,
            'Keywords': identifiers,
            'Methodology': methodology,
            'References': _optional_text(obj, 'DocFeatures', 'NumRefsAtt',
                                         'NumRefs'),
            'Journal': _descend(root, 'PublicationInfo', 'PublicationTitleAtt',
                                'PublicationTitle').text,
            'Volume': _optional_text(root, 'PublicationInfo', 'Volume'),
            'Issue': _optional_text(root, 'PublicationInfo', 'Issue'),
            'Pages': _optional_text(obj, 'PrintLocation', 'Pagination'),
            'P-Values': _extract_p_values(raw_text),
        }

    ##### Write Output #####
    # Honour out_path instead of silently writing to the working directory
    if out_path:
        os.makedirs(out_path, exist_ok=True)
    with open(os.path.join(out_path, out_name + '.json'), 'w',
              encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=2)

In [11]:
# Run the extraction on the sample articles, using 'samples' as the output file name
capture(in_path, out_path, 'samples')