# Project Psyched: A Closer Look Into Reproducibility In Psychological Research

## Data Cleaning and Transformation Script
This script for data cleaning and transformation after data has been scraped from TDM Studio.

Author: Yuyang Zhong (2020). This work is licensed under a [Creative Commons BY-NC-SA 4.0 International
License][cc-by].

![CC BY-NC-SA 4.0][cc-by-shield]

[cc-by]: http://creativecommons.org/licenses/by/4.0/
[cc-by-shield]: https://img.shields.io/badge/license-CC--BY--NC--SA%204.0-blue

#### Setup & Imports

In [1]:
import itertools
import numpy as np
import pandas as pd
import re
import ast

from scipy import stats

### Part 1 Data Transformations

In [2]:
in_path = "../data/raw/"
out_path = "../data/"
corpus1_path_meta_p = 'corpus1_metadata_pval.csv'

In [3]:
corpus1_pt1 = pd.read_csv(in_path + corpus1_path_meta_p, index_col=0)
corpus1_pt1.head()

Unnamed: 0,Title,Date Published,Peer Review,DOI,Author,Keywords,Methodology,References,Journal,Volume,Issue,Pages,P-Values,P-Values-SN
614337945.xml,Induced mood and preschoolers' behavior: Isola...,1987-03-01,True,10.1037/0022-3514.52.3.620,"['Ridgeway, Doreen', 'Waters, Everett']",['hedonic tone of induced mood & level of arou...,['Empirical Study'],27.0,Journal of Personality and Social Psychology,52.0,3.0,620-625,"['< .01', '< .01', '< .01', '< .01', '< .01', ...",[]
1647028895.xml,The equilibrium model of relationship maintena...,2015-01-01,True,10.1037/pspi0000004,"['Murray, Sandra L.', 'Holmes, John G.', 'Grif...","['relationships', 'risk', 'threat-mitigation',...","['Empirical Study', 'Longitudinal Study', 'Qua...",90.0,Journal of Personality and Social Psychology,108.0,1.0,93-113,"['< .00001', '= .27', '= .63', '= .13', '= .00...",[]
614404963.xml,The statistical analysis of data from small gr...,2002-07-01,True,10.1037/0022-3514.83.1.126,"['Kenny, David A.', 'Mannetti, Lucia', 'Pierro...","['statistical analysis', 'small group data', ""...",[],38.0,Journal of Personality and Social Psychology,83.0,1.0,126-137,"['= .0048', '= .382', '< .001', '< .001', '= ....",[]
614332724.xml,Personality differences predict health-risk be...,1997-11-01,True,10.1037/0022-3514.73.5.1052,"['Caspi, Avshalom', 'Begg, Dot', 'Dickson, Nig...","['personality traits, health risk behaviors, 2...","['Empirical Study', 'Longitudinal Study']",72.0,Journal of Personality and Social Psychology,73.0,5.0,1052-1063,"['= .05', '< .001', '< .001', '= .02', '< .001...",[]
614304222.xml,Victim and perpetrator accounts of interperson...,1990-11-01,True,10.1037/0022-3514.59.5.994,"['Baumeister, Roy F.', 'Stillwell, Arlene', 'W...","['autobiographical accounts of anger, college ...",['Empirical Study'],42.0,Journal of Personality and Social Psychology,59.0,5.0,994-1005,"['= .051', '< .02', '< .001', '< .01', '< .02'...",[]


#### Metadata only

In [4]:
corpus1_meta_only = corpus1_pt1.drop(['P-Values', 'P-Values-SN'], axis=1)

#### Exploding p-values

In [5]:
# Excluded all scientific notation based p-values
corpus1_pt1['P-Values-SN'].value_counts()

[]    6049
Name: P-Values-SN, dtype: int64

In [6]:
corpus1 = corpus1_pt1[['Date Published', 'P-Values']]

## evaluate as list from string
corpus1['P-Values'] = corpus1['P-Values'].apply(eval)

## exploding P-values
corpus1 = corpus1.explode('P-Values')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


#### Date

In [7]:
corpus1['Year'] = pd.to_datetime(corpus1['Date Published']).dt.year

## Dropping full date published
corpus1 = corpus1.drop('Date Published', axis=1)
corpus1.head()

Unnamed: 0,P-Values,Year
614337945.xml,< .01,1987
614337945.xml,< .01,1987
614337945.xml,< .01,1987
614337945.xml,< .01,1987
614337945.xml,< .01,1987


In [8]:
corpus1.shape

(193318, 2)

#### Export as stand alone file

In [9]:
corpus1.to_csv(out_path + 'pvals_corpus1.csv')

### Meta Data Concatenating

In [7]:
corpus2_path_meta = 'corpus2_metadata.csv'

corpus2_meta_only = pd.read_csv(in_path + corpus2_path_meta, index_col=0)
corpus2_meta_only = corpus2_meta_only.drop('Full Text', axis=1)
corpus2_meta_only.head()

Unnamed: 0,Title,Date Published,Peer Review,DOI,Author,Keywords,Methodology,References,Journal,Volume,Issue,Pages
2388986342.xml,The trainer matters: Cross-classified models o...,2020-04-13,True,10.1037/apl0000503,"['Glerum, David R.', 'Joseph, Dana L.', 'McKen...","['training evaluation', 'training effectivenes...","['Empirical Study', 'Quantitative Study']",126.0,Journal of Applied Psychology,,,
1266149906.xml,Psychology is still a problematic science and ...,2012-12-01,True,10.1037/a0030084,"['Teo, Thomas']","['bias', 'misconceptions', 'psychology', 'psyc...",[],9.0,American Psychologist,67.0,9.0,807-808
614504953.xml,Achieving a new dimension: Children integrate ...,2009-05-01,True,10.1037/a0014616,"['Ebersbach, Mirjam']","['information integration', 'volume', 'intuiti...","['Empirical Study', 'Quantitative Study']",37.0,Developmental Psychology,45.0,3.0,877-883
1620027748.xml,The impact of time at work and time off from w...,2015-05-01,True,10.1037/a0038067,"['Dai, Hengchen', 'Milkman, Katherine L.', 'Ho...","['fatigue', 'workplace compliance', 'job deman...","['Empirical Study', 'Longitudinal Study', 'Qua...",77.0,Journal of Applied Psychology,100.0,3.0,846-862
614340287.xml,Some aspects of the executive personality\n,1955-10-01,True,10.1037/h0045488,"['Miner, John B.', 'Culver, John E.']",['executive personality'],"['Empirical Study', 'Quantitative Study']",8.0,Journal of Applied Psychology,39.0,5.0,348-353


#### Appending metadata from corpus 1 & 2

In [8]:
all_metadata = corpus1_meta_only.append(corpus2_meta_only)
all_metadata.head()

Unnamed: 0,Title,Date Published,Peer Review,DOI,Author,Keywords,Methodology,References,Journal,Volume,Issue,Pages
614337945.xml,Induced mood and preschoolers' behavior: Isola...,1987-03-01,True,10.1037/0022-3514.52.3.620,"['Ridgeway, Doreen', 'Waters, Everett']",['hedonic tone of induced mood & level of arou...,['Empirical Study'],27.0,Journal of Personality and Social Psychology,52.0,3,620-625
1647028895.xml,The equilibrium model of relationship maintena...,2015-01-01,True,10.1037/pspi0000004,"['Murray, Sandra L.', 'Holmes, John G.', 'Grif...","['relationships', 'risk', 'threat-mitigation',...","['Empirical Study', 'Longitudinal Study', 'Qua...",90.0,Journal of Personality and Social Psychology,108.0,1,93-113
614404963.xml,The statistical analysis of data from small gr...,2002-07-01,True,10.1037/0022-3514.83.1.126,"['Kenny, David A.', 'Mannetti, Lucia', 'Pierro...","['statistical analysis', 'small group data', ""...",[],38.0,Journal of Personality and Social Psychology,83.0,1,126-137
614332724.xml,Personality differences predict health-risk be...,1997-11-01,True,10.1037/0022-3514.73.5.1052,"['Caspi, Avshalom', 'Begg, Dot', 'Dickson, Nig...","['personality traits, health risk behaviors, 2...","['Empirical Study', 'Longitudinal Study']",72.0,Journal of Personality and Social Psychology,73.0,5,1052-1063
614304222.xml,Victim and perpetrator accounts of interperson...,1990-11-01,True,10.1037/0022-3514.59.5.994,"['Baumeister, Roy F.', 'Stillwell, Arlene', 'W...","['autobiographical accounts of anger, college ...",['Empirical Study'],42.0,Journal of Personality and Social Psychology,59.0,5,994-1005


#### Exporting metadata file

In [9]:
corpus1_meta_only.to_csv(out_path + 'metadata_corpus1.csv')
all_metadata.to_csv(out_path + 'metadata_all.csv')

### Part 2 Data Transformation

In [13]:
corpus1_stat_path = 'corpus1_stat_full.csv'
corpus2_stat_path = 'corpus2_stat_full.csv'

corpus1_stat = pd.read_csv(in_path + corpus1_stat_path, index_col = 0)
corpus2_stat = pd.read_csv(in_path + corpus2_stat_path, index_col = 0)

In [14]:
corpus1_stat.head()

Unnamed: 0,F_stats,F_stats_SN,F_stats_ns,t_scores,t_scores_SN,t_scores_ns
614337945.xml,[],[],[],"['t (41) = 4.10, p < .01', 't (41) ...",[],[]
1647028895.xml,[],[],[],[],[],[]
614404963.xml,"['F(1, 6) = 0.89, p = .382', 'F(1, 63) = 17.02...",[],[],[],[],[]
614332724.xml,"['F(10, 776) = 6.84, p < .001', 'F(10, 722) ...",[],[],[],[],[]
614304222.xml,[],[],[],[],[],[]


In [15]:
corpus1_stat['F_stats_SN'].value_counts()

[]    6048
Name: F_stats_SN, dtype: int64

In [16]:
corpus1_stat['t_scores_SN'].value_counts()

[]    6048
Name: t_scores_SN, dtype: int64

In [17]:
corpus2_stat.head()

Unnamed: 0,F_stats,F_stats_SN,F_stats_ns,t_scores,t_scores_SN,t_scores_ns
2388986342.xml,[],[],[],[],[],[]
1266149906.xml,[],[],[],[],[],[]
614504953.xml,[],[],[],[],[],[]
1620027748.xml,[],[],[],[],[],[]
614323566.xml,[],[],[],[],[],[]


In [18]:
corpus2_stat['F_stats_SN'].value_counts()

[]                                                                                                                                              21040
['= 1.09e−03', '= 3.12e−05', '= 3.41e−03', '= 9.05e−15', '= 1.02e−06', '= 5.88e−03', '= 1.25e−02', '= 3.64e−02', '= 2.90e−03', '= 1.69e−03']        1
Name: F_stats_SN, dtype: int64

In [19]:
corpus2_stat['t_scores_SN'].value_counts()

[]                                          21039
['= 1.05e−03', '= 3.2e−03', '= 3.4e−03']        1
['= 2.2e-16', '= 2.2e-16']                      1
Name: t_scores_SN, dtype: int64

#### Given the small amount of p-values represented in scientific notations, they will be excluded from analysis.

In [20]:
corpus1_stat = corpus1_stat.drop(['F_stats_SN', 't_scores_SN'], axis=1)
corpus2_stat = corpus2_stat.drop(['F_stats_SN', 't_scores_SN'], axis=1)

#### Append all test statistics

In [21]:
stats_all = corpus1_stat.append(corpus2_stat)
stats_all.head()

Unnamed: 0,F_stats,F_stats_ns,t_scores,t_scores_ns
614337945.xml,[],[],"['t (41) = 4.10, p < .01', 't (41) ...",[]
1647028895.xml,[],[],[],[]
614404963.xml,"['F(1, 6) = 0.89, p = .382', 'F(1, 63) = 17.02...",[],[],[]
614332724.xml,"['F(10, 776) = 6.84, p < .001', 'F(10, 722) ...",[],[],[]
614304222.xml,[],[],[],[]


#### Transforming & capturing parameters and statistics

In [22]:
df_stats = pd.DataFrame(columns=['File', 'Original', 'Type', 'Sign', 'Reported p-value', 'Recalculated p-value'])

#### Helper functions

In [23]:
def extract_f(s):
    """
    Takes in a string of reported F statistics with p-value and extract the numeric parameters.
    
    Returns: df1, df2, x, p
    """
    extract = re.findall(
        r'Fs?\s*\(\s*(\d+)\s*\,\s*(\d+)\s*\)\s*[\<|\>|\=]\s*(\d*\.?\d*)\s*\,\s*p\s*([\<|\>|\=])\s*(\d*\.\d+)',
        s)[0]    
    
    df1 = float(extract[0])
    df2 = float(extract[1])
    
    x = float(extract[2])
    ineq = extract[3]
    p = float(extract[4])
    
    return df1, df2, x, ineq, p

In [24]:
def extract_t(s):
    """
    Takes in a string of reported t score with p-value and extract the numeric parameters.
    
    Returns: df, x, p
    """
    
    s = re.sub(r'−\s*', "-", s)
    extract = re.findall(
        r't\s*\((\s*\d*\s*,)?\s*(\d+)\s*\)\s*[\<|\>|\=]\s*([\−|\-]?\s*\d*\.?\d*)\s*,\s*p\s*([\<|\>|\=])\s*(\d?\.\d+)',
        s)[0]
    
    df = float(extract[1])
    x = float(extract[2].replace(" ", ""))
    ineq = extract[3]
    p = float(extract[4])
    
    return df, x, ineq, p

In [25]:
def extract_f_compute_add(test_type, index, s):
    ex = extract_f(s)
    return {'File': index, 
            'Original': s, 
            'Type': test_type,
            'Sign': ex[3],
            'Reported p-value': ex[4], 
            'Recalculated p-value': 1-stats.f.cdf(ex[2], ex[0], ex[1])}

In [26]:
def extract_t_compute_add(test_type, index, s):
    ex = extract_t(s)
    return {'File': index, 
            'Original': s, 
            'Type': test_type, 
            'Sign': ex[2],
            'Reported p-value': ex[3], 
            'Recalculated p-value': 1-stats.t.cdf(ex[1], ex[0])}

#### Evaluate Lists

In [27]:
for col in stats_all.columns:
    stats_all[col] = stats_all[col].apply(ast.literal_eval)

#### Script to iterate through all statistics

In [28]:
for index, row in stats_all.iterrows():
    
    if len(row['F_stats']) > 0:
        for s in row['F_stats']:
            try:
                extract = extract_f_compute_add('f', index, s)
            except:
                pass
            else:
                df_stats = df_stats.append(extract, ignore_index=True)
                        
    if len(row['t_scores']) > 0:
        for s in row['t_scores']:
            try:
                extract = extract_t_compute_add('t', index, s)
            except:
                pass
            else:
                df_stats = df_stats.append(extract, ignore_index=True)

In [29]:
df_stats.head()

Unnamed: 0,File,Original,Type,Sign,Reported p-value,Recalculated p-value
0,614337945.xml,"t (41) = 4.10, p < .01",t,<,0.01,9.531027e-05
1,614337945.xml,"t (41) = −3.56, p < .01",t,<,0.01,0.9995224
2,614337945.xml,"t (41) = 8.21, p < .01",t,<,0.01,1.708961e-10
3,614337945.xml,"t (41) = 4.82, p < .01",t,<,0.01,9.9876e-06
4,614337945.xml,"t (41) = −2.57, p < .01",t,<,0.01,0.9930493


In [30]:
df_stats.shape

(212589, 6)

#### Export all recalculated p-values

In [31]:
df_stats.to_csv(out_path + 'stats_all.csv')