# Project Psyched: A Closer Look Into Reproducibility In Psychological Research

## Data Analysis & Visualization Script: Part 2 - Test Statistics & Recalculating P-Values
This script performs data analysis and visualization after the data has been scraped from TDM Studio. This part of the project uses the full corpus of both #1 and #2.

Author: Yuyang Zhong (2020). This work is licensed under a [Creative Commons BY-NC-SA 4.0 International
License][cc-by].

![CC BY-NC-SA 4.0][cc-by-shield]

[cc-by]: http://creativecommons.org/licenses/by-nc-sa/4.0/
[cc-by-shield]: https://img.shields.io/badge/license-CC--BY--NC--SA%204.0-blue

#### Setup & Imports

In [1]:
import json
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import ast

from scipy import stats

In [2]:
# Directory and file name of the scraped statistics table, relative to this notebook.
in_path, in_name = "../data/", "stats_all.csv"

In [3]:
# Read the extracted test statistics; the CSV's second column becomes the
# index and the 'Unnamed: 0' column is dropped.
df = pd.read_csv(in_path + in_name, index_col=1)
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0_level_0,Original,Type,Sign,Reported p-value,Recalculated p-value
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
614337945.xml,"t (41) = 4.10, p < .01",t,<,0.01,9.531027e-05
614337945.xml,"t (41) = −3.56, p < .01",t,<,0.01,0.9995224
614337945.xml,"t (41) = 8.21, p < .01",t,<,0.01,1.708961e-10
614337945.xml,"t (41) = 4.82, p < .01",t,<,0.01,9.9876e-06
614337945.xml,"t (41) = −2.57, p < .01",t,<,0.01,0.9930493


In [17]:
# Number of rows (extracted statistics) per index label, i.e. per source file.
df.index.value_counts()

1640024140.xml    404
2316529621.xml    320
2259585220.xml    224
1824548299.xml    170
614498611.xml     168
                 ... 
614344891.xml       1
614329345.xml       1
1314700380.xml      1
2084021455.xml      1
614332887.xml       1
Name: File, Length: 13220, dtype: int64

#### Import Metadata

In [4]:
# Load the article metadata (first CSV column as index) and keep only the
# journal name and the publication date.
meta = pd.read_csv(in_path + "metadata_all.csv", index_col=0)[['Journal', 'Date Published']]
meta.head()

Unnamed: 0,Journal,Date Published
614337945.xml,Journal of Personality and Social Psychology,1987-03-01
1647028895.xml,Journal of Personality and Social Psychology,2015-01-01
614404963.xml,Journal of Personality and Social Psychology,2002-07-01
614332724.xml,Journal of Personality and Social Psychology,1997-11-01
614304222.xml,Journal of Personality and Social Psychology,1990-11-01


#### Standardize Journal of Abnormal Psychology

In [5]:
# Inspect the distinct journal names to spot spelling variants of the same journal.
meta['Journal'].value_counts()

American Psychologist                                                  7935
Journal of Applied Psychology                                          6421
Journal of Personality and Social Psychology                           6048
Developmental Psychology                                               2074
Journal of Abnormal Psychology                                         1758
The Journal of Abnormal Psychology                                      610
Journal of Experimental Psychology: Learning, Memory, and Cognition     169
Journal of Experimental Social Psychology                                 1
Name: Journal, dtype: int64

In [6]:
# Fold the "The ..." spelling into the plain journal title so both variants
# are counted as one journal, then re-check the counts.
journal_aliases = {'The Journal of Abnormal Psychology': 'Journal of Abnormal Psychology'}
meta['Journal'] = meta['Journal'].replace(journal_aliases)
meta['Journal'].value_counts()

American Psychologist                                                  7935
Journal of Applied Psychology                                          6421
Journal of Personality and Social Psychology                           6048
Journal of Abnormal Psychology                                         2368
Developmental Psychology                                               2074
Journal of Experimental Psychology: Learning, Memory, and Cognition     169
Journal of Experimental Social Psychology                                 1
Name: Journal, dtype: int64

#### Merge metadata with statistics data frame

In [34]:
# Index-on-index join of the statistics with the metadata. Because the join
# is 'outer', labels present in only one of the two frames are kept and the
# other frame's columns are left as NaN for them.
df_merged = pd.merge(df, meta, how='outer', left_index=True, right_index=True)
df_merged.head()

Unnamed: 0,Original,Type,Sign,Reported p-value,Recalculated p-value,Journal,Date Published
1011297993.xml,"F(1, 137) = 0.01, p = .906",f,=,0.906,0.9204905,,
1011297993.xml,"F(1, 139) = 2.46, p = .119",f,=,0.119,0.1190525,,
1011297999.xml,"F(1, 40) = 7.90, p < .01",f,<,0.01,0.007617572,Journal of Personality and Social Psychology,2012-07-01
1011297999.xml,"F(1, 40) = 8.85, p < .01",f,<,0.01,0.004950714,Journal of Personality and Social Psychology,2012-07-01
1011297999.xml,"F(1, 113) = 66.53, p < .0001",f,<,0.0001,5.296874e-13,Journal of Personality and Social Psychology,2012-07-01


In [35]:
# Rows per journal after the merge (rows with a missing journal are not counted).
df_merged['Journal'].value_counts()

Journal of Personality and Social Psychology                           100341
American Psychologist                                                    7935
Journal of Applied Psychology                                            6421
Journal of Abnormal Psychology                                           2368
Developmental Psychology                                                 2074
Journal of Experimental Psychology: Learning, Memory, and Cognition       169
Journal of Experimental Social Psychology                                   1
Name: Journal, dtype: int64

In [36]:
# Distinct index labels (files) whose rows have no journal after the merge.
df_merged.index[df_merged['Journal'].isna().to_numpy()].unique()

Index(['1011297993.xml', '1011298051.xml', '1011856265.xml', '1011856272.xml',
       '1011856414.xml', '1011857743.xml', '1011857817.xml', '1011857821.xml',
       '1011858438.xml', '1011860155.xml',
       ...
       '953199137.xml', '953199778.xml', '964198061.xml', '964198107.xml',
       '964198125.xml', '964198148.xml', '964198559.xml', '964198585.xml',
       '964198590.xml', '993315768.xml'],
      dtype='object', length=8625)

In [None]:
# `df_stats` is used by this and the following cells but is not assigned in
# any earlier cell, so a fresh "Restart & Run All" would stop here with a
# NameError.
# NOTE(review): bound here to the statistics frame loaded from stats_all.csv
# (`df`); confirm that this, rather than `df_merged`, is the intended source.
df_stats = df

# Sanity check: rows whose recalculated p-value exceeds 1.
df_stats[df_stats['Recalculated p-value'] > 1].head()

In [None]:
# Sanity check: rows whose reported p-value exceeds 1.
df_stats.loc[df_stats['Reported p-value'].gt(1)].head()

In [None]:
# Reported vs. recalculated p-values, restricted to rows where both are below 1.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Reported p-value', y='Recalculated p-value',
                data=df_stats[(df_stats['Reported p-value'] < 1) & (df_stats['Recalculated p-value'] < 1)],
                alpha=0.1, hue='Type')
# Identity line (reported == recalculated). x and y are passed by keyword:
# seaborn deprecated positional x/y in 0.11 and rejects them from 0.12 on.
diagonal = np.linspace(0, 1, 100)
sns.lineplot(x=diagonal, y=diagonal, color='green', linewidth=3, alpha=0.3);

#### Naive correction for one- and two-tail t-test p-values

In [None]:
# Naive one-tailed -> two-tailed correction, applied to t-tests only.
#
# The recalculated value is treated as a one-tailed p-value that may have been
# taken from the wrong tail, so it is folded onto the smaller tail and doubled:
#     p_two_tailed = 2 * min(p, 1 - p)
# The previous rule flipped the tail whenever p > 0.05 (instead of p > 0.5),
# which turned every p in (0.05, 0.5] into a "p-value" between 1 and 1.9.
# For p <= 0.05 and p > 0.5 the result is unchanged.
#
# Done column-wise instead of iterrows() + DataFrame.append(): the row-by-row
# version is quadratic, degrades the column dtypes to object, and
# DataFrame.append no longer exists in pandas >= 2.0.
p_recalc = df_stats['Recalculated p-value']
is_t_test = df_stats['Type'] == 't'
p_two_tailed = 2 * np.minimum(p_recalc, 1 - p_recalc)

# Non-t rows keep their recalculated value; NaN values stay NaN.
df_stats_corrected = df_stats.assign(
    **{'Recalculated p-value': p_recalc.mask(is_t_test, p_two_tailed)}
)

In [None]:
# Same comparison as before, now on the corrected p-values. Both halves of the
# row filter are built from df_stats_corrected; the earlier version took the
# 'Reported p-value' mask from df_stats while indexing df_stats_corrected.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Reported p-value', y='Recalculated p-value',
                data=df_stats_corrected[(df_stats_corrected['Reported p-value'] < 1)
                                        & (df_stats_corrected['Recalculated p-value'] < 1)],
                alpha=0.1, hue='Type')
# Identity line (reported == recalculated). x and y are passed by keyword:
# seaborn deprecated positional x/y in 0.11 and rejects them from 0.12 on.
diagonal = np.linspace(0, 1, 100)
sns.lineplot(x=diagonal, y=diagonal, color='green', linewidth=3, alpha=0.3);

#### More Plots

In [None]:
# Zoom in on results reported as p <= 0.05, keeping any recalculated value below 1.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Reported p-value', y='Recalculated p-value',
                data=df_stats_corrected[(df_stats_corrected['Reported p-value'] <= 0.05)
                                        & (df_stats_corrected['Recalculated p-value'] < 1)],
                alpha=0.1, hue='Type')
# Identity line over the plotted x-range. x and y are passed by keyword:
# seaborn deprecated positional x/y in 0.11 and rejects them from 0.12 on.
diagonal = np.linspace(0, 0.05, 100)
sns.lineplot(x=diagonal, y=diagonal, color='red', linewidth=3, alpha=1);

In [None]:
# Results reported as p <= 0.05 whose recalculated p-value is also below 0.05.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Reported p-value', y='Recalculated p-value',
                data=df_stats_corrected[(df_stats_corrected['Reported p-value'] <= 0.05)
                                        & (df_stats_corrected['Recalculated p-value'] < 0.05)],
                alpha=0.1, hue='Type')
# Identity line over the plotted range. x and y are passed by keyword:
# seaborn deprecated positional x/y in 0.11 and rejects them from 0.12 on.
diagonal = np.linspace(0, 0.05, 100)
sns.lineplot(x=diagonal, y=diagonal, color='red', linewidth=3, alpha=1);

In [None]:
# Histogram of recalculated p-values for rows reported at exactly 0.05 whose
# recalculated value lies strictly between 0.05 and 1.
plt.figure(figsize=(10, 8))
reported_at_threshold = df_stats_corrected['Reported p-value'].eq(0.05)
recalculated = df_stats_corrected['Recalculated p-value']
in_open_range = recalculated.lt(1) & recalculated.gt(0.05)
sns.histplot(recalculated[reported_at_threshold & in_open_range])