Skip to content

Commit

Permalink
replace loop with vector op
Browse files: browse the repository at this point in the history
  • Loading branch information
tompollard committed Feb 2, 2018
1 parent 85f5097 commit 60ac9d5
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 26 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='0.4.2',
version='0.4.3',

description='TableOne',
long_description=long_description,
Expand Down
47 changes: 22 additions & 25 deletions tableone.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""

__author__ = "Tom Pollard <tpollard@mit.edu>, Alistair Johnson"
__version__ = "0.4.2"
__version__ = "0.4.3"

import pandas as pd
import csv
Expand Down Expand Up @@ -161,30 +161,27 @@ def _create_cat_describe(self,data):
d_slice = data
cats = {}

for v in self.categorical:
ds = d_slice[v].astype('category')
levels = ds[ds.notnull()].unique().categories.sort_values()
df = pd.DataFrame(index = levels)
# clean later
# add descriptive details
df['n'] = ds.count()
df['isnull'] = data[v].isnull().sum()
df['level'] = levels
df = df.merge(ds.value_counts(dropna=True).to_frame().rename(columns= {v:'freq'}),
left_on='level',right_index=True, how='left')
df['freq'].fillna(0,inplace=True)
df['percent'] = (df['freq'] / df['n']) * 100
# set level as index to df
df.set_index('level', inplace=True)
cats[v] = df

cats_df = pd.concat(cats)
cats_df.index.rename('variable',level=0, inplace=True)

cats_df['t1_summary'] = cats_df.freq.map(str) \
+ ' (' + cats_df.percent.apply(round, ndigits=2).map(str) + ')'

group_dict[g] = cats_df
# create a dataframe with freq, proportion
df = d_slice[self.categorical].copy()
df = df.melt().groupby(['variable','value']).size().to_frame(name='freq')
df.index.set_names('level', level=1, inplace=True)
df['percent'] = df['freq'].div(df.freq.sum(level=0),level=0)* 100

# add n column, listing total non-null values for each variable
ct = d_slice.count().to_frame(name='n')
ct.index.name = 'variable'
df = df.join(ct)

# add null count
nulls = d_slice.isnull().sum().to_frame(name='isnull')
nulls.index.name = 'variable'
df = df.join(nulls)

# add summary column
df['t1_summary'] = df.freq.map(str) + ' (' + df.percent.apply(round, ndigits=2).map(str) + ')'

# add to dictionary
group_dict[g] = df

df_cat = pd.concat(group_dict,axis=1)

Expand Down

0 comments on commit 60ac9d5

Please sign in to comment.