Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to calculate "n(%)" percentages over a row. Ref #108 #110

Merged
merged 2 commits into from
Jan 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
version = u'0.7.9'
version = u'0.7.10'
# The full version, including alpha/beta/rc tags.
release = u'0.7.9'
release = u'0.7.10'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.7.9',
version='0.7.10',

description='TableOne',
long_description=long_description,
Expand Down
2 changes: 1 addition & 1 deletion tableone/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .tableone import TableOne, load_dataset, tableone

__author__ = "Tom Pollard <tpollard@mit.edu>, Alistair Johnson, Jesse Raffa"
__version__ = "0.7.9"
__version__ = "0.7.10"
67 changes: 48 additions & 19 deletions tableone/tableone.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ class TableOne(object):
overall : bool, optional
If True, add an "overall" column to the table. Smd and p-value
calculations are performed only using stratified columns.
row_percent : bool, optional
If True, compute "n (%)" percentages for categorical variables across
"groupby" rows rather than columns.
display_all : bool, optional
If True, set pandas display options to show all columns and rows.
(default: False)
Expand Down Expand Up @@ -206,9 +209,8 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
order: Optional[dict] = None, remarks: bool = False,
label_suffix: bool = True, decimals: Union[int, dict] = 1,
smd: bool = False, overall: bool = True,
display_all: bool = False,
dip_test: bool = False,
normal_test: bool = False,
row_percent: bool = False, display_all: bool = False,
dip_test: bool = False, normal_test: bool = False,
tukey_test: bool = False) -> None:

# labels is now rename
Expand Down Expand Up @@ -348,6 +350,7 @@ def __init__(self, data: pd.DataFrame, columns: Optional[list] = None,
self._decimals = decimals
self._smd = smd
self._overall = overall
self._row_percent = row_percent

# display notes and warnings below the table
self._warnings = {}
Expand Down Expand Up @@ -928,14 +931,14 @@ def _create_cont_describe(self, data, groupby):

return df_cont

def _format_cat(self, row):
def _format_cat(self, row, col):
var = row.name[0]
if var in self._decimals:
n = self._decimals[var]
else:
n = 1
f = '{{:.{}f}}'.format(n)
return f.format(row.percent)
return f.format(row[col])

def _create_cat_describe(self, data, groupby, groupbylvls):
"""
Expand All @@ -945,6 +948,10 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
----------
data : pandas DataFrame
The input dataset.
groupby : str
    Variable to group by.
groupbylvls : list
    List of levels in the groupby variable.

Returns
----------
Expand All @@ -953,46 +960,64 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
"""
group_dict = {}

cat_slice = data[self._categorical].copy()

for g in groupbylvls:
if groupby:
d_slice = data.loc[data[groupby] == g, self._categorical]
df = cat_slice.loc[data[groupby] == g, self._categorical]
else:
d_slice = data[self._categorical].copy()
df = cat_slice.copy()

# create a dataframe with freq, proportion
df = d_slice.copy()
# create n column and null count column
# must be done before converting values to strings
ct = df.count().to_frame(name='n')
ct.index.name = 'variable'
nulls = df.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'

# convert to str to handle int converted to boolean. Avoid nans.
# Convert to str to handle int converted to boolean in the index.
# Also avoid nans.
for column in df.columns:
df[column] = [str(row) if not pd.isnull(row)
else None for row in df[column].values]
cat_slice[column] = [str(row) if not pd.isnull(row)
else None for row
in cat_slice[column].values]

# create a dataframe with freq, proportion
df = df.melt().groupby(['variable',
'value']).size().to_frame(name='freq')

df['percent'] = df['freq'].div(df.freq.sum(level=0),
level=0).astype(float) * 100

# add row percent
df['percent_row'] = df['freq'].div(cat_slice[self._categorical]
.melt()
.groupby(['variable', 'value'])
.size()) * 100

# set number of decimal places for percent
if isinstance(self._decimals, int):
n = self._decimals
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)
elif isinstance(self._decimals, dict):
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1)
df.loc[:, 'percent_str'] = df.apply(self._format_cat, axis=1,
args=['percent'])
df.loc[:, 'percent_row_str'] = df.apply(self._format_cat,
axis=1,
args=['percent_row'])
else:
n = 1
f = '{{:.{}f}}'.format(n)
df['percent_str'] = df['percent'].astype(float).map(f.format)
df['percent_row_str'] = df['percent_row'].astype(float).map(f.format)

# add n column, listing total non-null values for each variable
ct = d_slice.count().to_frame(name='n')
ct.index.name = 'variable'
# join count column
df = df.join(ct)

# add null count
nulls = d_slice.isnull().sum().to_frame(name='Missing')
nulls.index.name = 'variable'
# only save null count to the first category for each variable
# do this by extracting the first category from the df row index
levels = df.reset_index()[['variable',
Expand All @@ -1004,8 +1029,12 @@ def _create_cat_describe(self, data, groupby, groupbylvls):
df = df.join(nulls)

# add summary column
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')
if self._row_percent:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_row_str.map(str)+')')
else:
df['t1_summary'] = (df.freq.map(str) + ' ('
+ df.percent_str.map(str)+')')

# add to dictionary
group_dict[g] = df
Expand Down
158 changes: 155 additions & 3 deletions test_tableone.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import warnings

from nose.tools import (with_setup, assert_raises, assert_equal,
assert_almost_equal, assert_list_equal)
assert_almost_equal, assert_list_equal,
assert_count_equal)
import numpy as np
import pandas as pd
from scipy import stats
Expand Down Expand Up @@ -1097,8 +1098,6 @@ def test_min_max_for_nonnormal_variables(self):
# optionally, a categorical variable for stratification
groupby = ['death']

self.data_pn

t1 = TableOne(self.data_pn, columns=columns, categorical=categorical,
groupby=groupby, nonnormal=nonnormal, decimals=decimals,
min_max=['Age'])
Expand All @@ -1110,3 +1109,156 @@ def test_min_max_for_nonnormal_variables(self):
for c, e in zip(t1_columns, expected):
cell = t1.tableone.loc[k][group][c].values[0]
assert_equal(cell, e)

@with_setup(setup, teardown)
def test_row_percent_false(self):
    """
    Check that with row_percent=False the categorical "n (%)" cells
    report percentages computed within each column (the default).
    """
    # variables to summarize, with two categoricals and one nonnormal
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']
    categorical = ['ICU', 'MechVent']
    nonnormal = ['Age']
    decimals = {"Age": 0}  # zero decimal places for Age only
    groupby = ['death']
    group = "Grouped by death"

    # build the table with column-wise percentages
    table = TableOne(self.data_pn, columns=columns,
                     categorical=categorical, groupby=groupby,
                     nonnormal=nonnormal, decimals=decimals,
                     row_percent=False)

    # expected rows, keyed by table label, in category order
    expected = {
        "MechVent, n (%)": [
            [0, '540 (54.0)', '468 (54.2)', '72 (52.9)'],
            ['', '460 (46.0)', '396 (45.8)', '64 (47.1)'],
        ],
        "ICU, n (%)": [
            [0, '162 (16.2)', '137 (15.9)', '25 (18.4)'],
            ['', '202 (20.2)', '194 (22.5)', '8 (5.9)'],
            ['', '380 (38.0)', '318 (36.8)', '62 (45.6)'],
            ['', '256 (25.6)', '215 (24.9)', '41 (30.1)'],
        ],
    }

    for label, rows in expected.items():
        observed = table.tableone.loc[label][group].values
        for idx, want in enumerate(rows):
            assert_list_equal(list(observed[idx]), want)

@with_setup(setup, teardown)
def test_row_percent_true(self):
    """
    Check that with row_percent=True the categorical "n (%)" cells
    report percentages computed across the row rather than the column.
    """
    # variables to summarize, with two categoricals and one nonnormal
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']
    categorical = ['ICU', 'MechVent']
    nonnormal = ['Age']
    decimals = {"Age": 0}  # zero decimal places for Age only
    groupby = ['death']
    group = "Grouped by death"

    # build the table with row-wise percentages
    table = TableOne(self.data_pn, columns=columns,
                     categorical=categorical, groupby=groupby,
                     nonnormal=nonnormal, decimals=decimals,
                     row_percent=True)

    # expected rows, keyed by table label, in category order;
    # the overall column should always show 100.0 per row
    expected = {
        "MechVent, n (%)": [
            [0, '540 (100.0)', '468 (86.7)', '72 (13.3)'],
            ['', '460 (100.0)', '396 (86.1)', '64 (13.9)'],
        ],
        "ICU, n (%)": [
            [0, '162 (100.0)', '137 (84.6)', '25 (15.4)'],
            ['', '202 (100.0)', '194 (96.0)', '8 (4.0)'],
            ['', '380 (100.0)', '318 (83.7)', '62 (16.3)'],
            ['', '256 (100.0)', '215 (84.0)', '41 (16.0)'],
        ],
    }

    for label, rows in expected.items():
        observed = table.tableone.loc[label][group].values
        for idx, want in enumerate(rows):
            assert_list_equal(list(observed[idx]), want)

@with_setup(setup, teardown)
def test_row_percent_true_and_overall_false(self):
    """
    Check that row_percent=True combined with overall=False reports
    row-wise "n (%)" percentages with no "Overall" column, so each
    row's percentages are taken only over the stratified columns.
    """
    # variables to summarize, with two categoricals and one nonnormal
    columns = ['Age', 'SysABP', 'Height', 'MechVent', 'ICU', 'death']
    categorical = ['ICU', 'MechVent']
    nonnormal = ['Age']
    decimals = {"Age": 0}  # zero decimal places for Age only
    groupby = ['death']
    group = "Grouped by death"

    # build the table with row-wise percentages and no overall column
    t1 = TableOne(self.data_pn, columns=columns, overall=False,
                  categorical=categorical, groupby=groupby,
                  nonnormal=nonnormal, decimals=decimals,
                  row_percent=True)

    # expected rows, keyed by table label, in category order;
    # note: without the overall column, each row has only the
    # stratified (death=0 / death=1) cells after the Missing cell
    expected = {
        "MechVent, n (%)": [
            [0, '468 (86.7)', '72 (13.3)'],
            ['', '396 (86.1)', '64 (13.9)'],
        ],
        "ICU, n (%)": [
            [0, '137 (84.6)', '25 (15.4)'],
            ['', '194 (96.0)', '8 (4.0)'],
            ['', '318 (83.7)', '62 (16.3)'],
            ['', '215 (84.0)', '41 (16.0)'],
        ],
    }

    for label, rows in expected.items():
        observed = t1.tableone.loc[label][group].values
        for idx, want in enumerate(rows):
            assert_list_equal(list(observed[idx]), want)