# Project Psyched: A Closer Look Into Reproducibility In Psychological Research

## Full Text Mining Script: Part 2 - Test Statistics
This script is set up for ProQuest TDM Studio's corpus of Psychology texts.

Author: Yuyang Zhong (2020). This work is licensed under a [Creative Commons BY-NC-SA 4.0 International
License][cc-by].

![CC BY-NC-SA 4.0][cc-by-shield]

[cc-by]: http://creativecommons.org/licenses/by-nc-sa/4.0/
[cc-by-shield]: https://img.shields.io/badge/license-CC--BY--NC--SA%204.0-blue

#### Setup & Imports

In [1]:
import os
import json
import xml.etree.ElementTree as ET
import re

In [2]:
# Directory containing the sample article files to be mined
in_path = '../articles/samples/'

# Output directory handed to capture() as its out_path argument
out_path = '../data/samples/'

In [3]:
# Show the entries of the input directory that capture() will iterate over
os.listdir(in_path)

['1554207703.txt', '1011297999.txt', '1509629602.txt']

In [4]:
# Patterns used to normalise the article text before matching.
_TAG_RE = re.compile(r'<[^>]*>')
_NEWLINE_RE = re.compile(r'\n\s*')

# HTML named entities to decode. '&amp;' MUST come last: decoding it first
# would turn an escaped entity such as '&amp;lt;' into '&lt;' and then into
# '<', i.e. decode the text twice.
_HTML_ENTITIES = (
    ('&quot;', '"'),
    ('&apos;', "'"),
    ('&gt;', '>'),
    ('&lt;', '<'),
    ('&amp;', '&'),
)

# Shared regex fragments. The comparison operator class is exactly '<', '>'
# or '='; the sign class accepts the Unicode minus (U+2212) or ASCII hyphen.
_F_HEAD = r'Fs?\s*\(\s*\d+\s*,\s*\d+\s*\)\s*[<>=]\s*\d*\.?\d*\s*,\s*'
_T_HEAD = (r't\s*\(\s*\d*\s*,?\s*\d+\s*\)\s*[<>=]\s*[\u2212\-]?'
           r'\s*\d*\.?\d*\s*,\s*')
_NS_TAIL = r'n\.?s\.?'

# Output key -> compiled pattern, in the order the keys appear in the JSON.
_STAT_PATTERNS = {
    'F_stats': re.compile(_F_HEAD + r'p\s*[<>=]\s*\d*\.\d+'),
    'F_stats_ns': re.compile(_F_HEAD + _NS_TAIL),
    't_scores': re.compile(_T_HEAD + r'p\s*[<>=]\s*\d?\.\d+'),
    't_scores_ns': re.compile(_T_HEAD + _NS_TAIL),
}


def _read_article_text(path):
    """Return the TextInfo/PreformattedData/PsycArticles text of one XML file, or ''."""
    root = ET.parse(path).getroot()
    try:
        raw_text = (root.find('TextInfo')
                        .find('PreformattedData')
                        .find('PsycArticles')
                        .text)
    except AttributeError:
        # One of the nested elements is missing (find() returned None)
        return ''
    # An element that exists but is empty has text None
    return raw_text or ''


def _clean_text(raw_text):
    """Strip HTML tags, collapse line breaks and decode HTML named entities."""
    text = _TAG_RE.sub('', raw_text)
    # Each newline plus any following whitespace becomes three spaces
    text = _NEWLINE_RE.sub('   ', text)
    for entity, char in _HTML_ENTITIES:
        text = text.replace(entity, char)
    return text


def _extract_statistics(text):
    """Return a dict mapping each statistic key to the list of matched strings."""
    return {key: pattern.findall(text)
            for key, pattern in _STAT_PATTERNS.items()}


def capture(in_path, out_path, out_name):
    """
    Extract reported F and t test statistics from every article file in a
    directory and write them to a single JSON file.

    Each file in in_path is parsed as XML. The text found under
    TextInfo/PreformattedData/PsycArticles is cleaned (HTML tags removed,
    line breaks collapsed, HTML named entities decoded) and searched for:
      - 'F_stats':     F(df1, df2) <op> value, p <op> value
      - 'F_stats_ns':  F(df1, df2) <op> value, n.s.
      - 't_scores':    t(df) <op> value, p <op> value
      - 't_scores_ns': t(df) <op> value, n.s.
    Files without that element (or with an empty one) get empty lists.

    The result, a dict keyed by file name, is written to
    <out_path>/<out_name>.json; out_path is created if it does not exist.

    @parameter in_path: the path of the input files
    @parameter out_path: the path to output file
    @parameter out_name: name for output file (without the .json extension)

    """

    # Make sure our parameters are strings
    assert isinstance(in_path, str)
    assert isinstance(out_path, str)
    assert isinstance(out_name, str)

    # Loop through the in_path directory for files
    data = {}
    for f in os.listdir(in_path):
        # os.path.join works whether or not in_path ends with a separator
        raw_text = _read_article_text(os.path.join(in_path, f))
        data[f] = _extract_statistics(_clean_text(raw_text))

    ##### Write Output #####
    # An empty out_path means the current directory; nothing to create then
    if out_path:
        os.makedirs(out_path, exist_ok=True)
    with open(os.path.join(out_path, out_name + '.json'), 'w') as outfile:
        json.dump(data, outfile, indent=2)

In [5]:
# Run the extraction over the sample articles; results are saved as 'samples.json'
capture(in_path, out_path, 'samples')