<a href="https://colab.research.google.com/github/suzannefox/burrow/blob/main/burrow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
pip install palmerpenguins



In [None]:
import pandas as pd
import numpy as np
from palmerpenguins import load_penguins

In [None]:
# Supporting Functions

def excelcol(number):
  """Translate a 1-based column position into its Excel column label.

  1 -> 'A', 26 -> 'Z', 27 -> 'AA', ... 16384 -> 'XFD'.
  Out-of-range input does not raise: the string '16384 max' is returned for
  values above 16384 and 'must be > 0' for values below 1.
  """
  if number > 16384:
    return '16384 max'
  if number < 1:
    return 'must be > 0'

  # Peel off the least-significant letter each pass (bijective base 26),
  # collecting letters right-to-left and reversing at the end.
  label_chars = []
  remaining = number
  while remaining:
    remaining, offset = divmod(remaining - 1, 26)
    label_chars.append(chr(ord('A') + offset))

  return ''.join(reversed(label_chars))

def custom_min(variable):
  """Return the smallest non-null value of a column.

  Nulls are dropped before the dtype is inspected. Numeric columns give their
  numeric minimum; string columns give the value that sorts first.

  Args:
    variable: a pandas Series (one column of a DataFrame).

  Returns:
    The minimum value, or None when the column is neither numeric nor string,
    or when a string column has no non-null values.
  """
  values = variable.dropna()  # drop nulls once and reuse the result

  if pd.api.types.is_numeric_dtype(values):
    return values.min()

  if pd.api.types.is_string_dtype(values):
    # With nothing left after dropping nulls there is no first value to pick;
    # the previous positional lookup ([0]) raised KeyError in that case.
    if values.empty:
      return None
    return values.min()  # same value as sorting ascending and taking the first

  return None

def custom_max(variable):
  """Return the largest non-null value of a column.

  Nulls are dropped before the dtype is inspected. Numeric columns give their
  numeric maximum; string columns give the value that sorts last.

  Args:
    variable: a pandas Series (one column of a DataFrame).

  Returns:
    The maximum value, or None when the column is neither numeric nor string,
    or when a string column has no non-null values.
  """
  values = variable.dropna()  # drop nulls once and reuse the result

  if pd.api.types.is_numeric_dtype(values):
    return values.max()

  if pd.api.types.is_string_dtype(values):
    # With nothing left after dropping nulls there is no last value to pick;
    # the previous positional lookup ([0]) raised KeyError in that case.
    if values.empty:
      return None
    return values.max()  # same value as sorting descending and taking the first

  return None

def custom_mean(variable):
  """Return the mean of a numeric column rounded to 1 decimal, '-' otherwise.

  Args:
    variable: a pandas Series (one column of a DataFrame).

  Returns:
    The rounded mean of the non-null values for numeric columns, NaN when a
    numeric column has no non-null values, or the string '-' for any
    non-numeric column.
  """
  values = variable.dropna()
  if not pd.api.types.is_numeric_dtype(values):
    return '-'

  average = values.mean()
  # The mean of an empty selection is a missing value that may be a plain
  # Python float without a .round() method, so it is handled before rounding.
  if pd.isna(average):
    return np.nan
  # np.round accepts both NumPy and plain Python scalars.
  return np.round(average, 1)

def custom_skew(variable):
  """Return the skewness of a numeric column rounded to 1 decimal, '-' otherwise.

  Args:
    variable: a pandas Series (one column of a DataFrame).

  Returns:
    The rounded skewness of the non-null values for numeric columns, NaN when
    pandas cannot compute one (too few non-null values), or the string '-'
    for any non-numeric column.
  """
  values = variable.dropna()
  if not pd.api.types.is_numeric_dtype(values):
    return '-'

  skewness = values.skew()
  # With too few observations the result is a missing value that may be a
  # plain Python float without a .round() method, so handle it before rounding.
  if pd.isna(skewness):
    return np.nan
  # np.round accepts both NumPy and plain Python scalars.
  return np.round(skewness, 1)

def custom_type(variable):
  """Classify a column into a coarse, human-readable type label.

  Nulls are dropped first, then the checks run in a fixed order and the first
  match wins: 'numeric', 'string', 'bool', 'datetime', 'object',
  'categorical', 'sparse'.

  Integer, float and complex columns are all reported as 'numeric': the
  numeric check runs first and already covers them, so no separate labels
  for those exist.

  Args:
    variable: a pandas Series (one column of a DataFrame).

  Returns:
    One of the labels above, or None when no check matches
    (for example a timedelta column).
  """
  values = variable.dropna()  # drop nulls once; every check sees the same data
  dtype = values.dtype
  types = pd.api.types

  if types.is_numeric_dtype(values):
    return 'numeric'
  if types.is_string_dtype(values):
    return 'string'
  if types.is_bool_dtype(values):
    return 'bool'
  # The *_any_* variant also recognises timezone-aware datetimes.
  if types.is_datetime64_any_dtype(values):
    return 'datetime'
  if types.is_object_dtype(values):
    return 'object'
  # isinstance checks replace is_categorical_dtype (deprecated) and
  # is_sparse_dtype / is_generic_dtype, which pandas.api.types does not provide
  # and which raised AttributeError whenever they were reached.
  if isinstance(dtype, pd.CategoricalDtype):
    return 'categorical'
  if isinstance(dtype, pd.SparseDtype):
    return 'sparse'
  return None


In [None]:
def dfinfo(df, catalog=None, diagnostics=False, verbose=True):
  """Build a one-row-per-column profile of a DataFrame.

  Args:
    df: the DataFrame to profile.
    catalog: optional DataFrame with a 'Variables' column holding column names
      of df; its remaining columns are left-joined onto the profile. A catalog
      without a 'Variables' column is ignored with a warning. Repeated
      'Variables' entries are reduced to the first occurrence (with a warning).
    diagnostics: accepted for interface compatibility; not used by this function.
    verbose: when True, print a one-line summary with the frame's name and shape.

  Returns:
    A DataFrame with one row per column of df and the columns Variables,
    Order, Excel, any catalog columns, Class, Type, NA_Tot, NA_PC, Unique_Tot,
    Unique_PC, Min, Max, Mean and Skew. Returns None (after printing a
    message) when df is empty.
  """
  if df.empty:
    print('... dataframe is empty')
    return

  if verbose:
    # Best-effort lookup of the variable name the caller used. A frame that is
    # not bound to any global name (an expression, a local variable) gets a
    # placeholder instead of raising IndexError.
    df_name = next(
        (name for name, value in list(globals().items()) if value is df),
        '<unnamed>')
    print(f'... dataframe {df_name} has {df.shape[0]} records and {df.shape[1]} variables')

  def _as_percent(counts):
    # Share of the row count, formatted as a whole-number percentage string.
    return (counts / len(df) * 100).apply(lambda x: f"{x:.0f}%")

  # Main Body
  dfout = pd.DataFrame({'Variables': df.columns})

  dfout['Order'] = range(1, df.shape[1] + 1)
  dfout['Excel'] = dfout['Order'].apply(excelcol)

  if catalog is not None:
    if 'Variables' not in catalog.columns:
      print(" ... WARNING: catalog dataframe missing a 'Variables' column, ignoring")
    else:
      # Every column added below is assigned by position, so the merge must not
      # add rows; repeated catalog keys would do exactly that.
      if catalog['Variables'].duplicated().any():
        print(" ... WARNING: catalog has duplicate 'Variables' entries, keeping the first of each")
        catalog = catalog.drop_duplicates(subset='Variables')
      dfout = dfout.merge(catalog, on='Variables', how='left').fillna('')

  dfout['Class'] = df.dtypes.astype(str).values
  dfout['Type'] = df.apply(custom_type).reset_index(drop=True)

  # Null counts and distinct-value counts, each computed once and reused.
  na_counts = df.isna().sum().reset_index(drop=True)
  dfout['NA_Tot'] = na_counts
  dfout['NA_PC'] = _as_percent(na_counts)

  unique_counts = df.nunique().reset_index(drop=True)
  dfout['Unique_Tot'] = unique_counts
  dfout['Unique_PC'] = _as_percent(unique_counts)

  # Min / Max: numeric extremes for numeric columns, first / last value in
  # sort order for string columns.
  dfout['Min'] = df.apply(custom_min).reset_index(drop=True)
  dfout['Max'] = df.apply(custom_max).reset_index(drop=True)

  # Mean / Skew: numeric columns only; other columns show '-'.
  dfout['Mean'] = df.apply(custom_mean).reset_index(drop=True)
  dfout['Skew'] = df.apply(custom_skew).reset_index(drop=True)

  return dfout

# Mount Google Drive

In [None]:
# Colab-only step: mount Google Drive at /gdrive. The CSV reads later in this
# notebook use paths under /gdrive/MyDrive, so this cell has to run before them.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Penguins

In [None]:
# Load the sample data with palmerpenguins.load_penguins. It is bound to a
# global name because dfinfo looks the frame's name up in globals().
penguins = load_penguins()

In [None]:
# Describe a few of the variables, then profile the penguins frame with those
# descriptions joined on by variable name.
catalog = pd.DataFrame(
    [('species', 'Penguin Species'),
     ('sex', 'Gender'),
     ('island', 'Island they live on')],
    columns=['Variables', 'catalog'],
)
dfinfo(penguins, catalog=catalog)

... dataframe penguins has 344 records and 8 variables


Unnamed: 0,Variables,Order,Excel,catalog,Class,Type,NA_Tot,NA_PC,Unique_Tot,Unique_PC,Min,Max,Mean,Skew
0,species,1,A,Penguin Species,object,string,0,0%,3,1%,Adelie,Gentoo,-,-
1,island,2,B,Island they live on,object,string,0,0%,3,1%,Biscoe,Torgersen,-,-
2,bill_length_mm,3,C,,float64,numeric,2,1%,164,48%,32.1,59.6,43.9,0.1
3,bill_depth_mm,4,D,,float64,numeric,2,1%,80,23%,13.1,21.5,17.2,-0.1
4,flipper_length_mm,5,E,,float64,numeric,2,1%,55,16%,172.0,231.0,200.9,0.3
5,body_mass_g,6,F,,float64,numeric,2,1%,94,27%,2700.0,6300.0,4201.8,0.5
6,sex,7,G,Gender,object,string,11,3%,2,1%,female,male,-,-
7,year,8,H,,int64,numeric,0,0%,3,1%,2007,2009,2008.0,-0.1


# Alzheimer's data

In [None]:
# Reads from the mounted Drive (/gdrive), so the drive.mount cell must have run first.
# The frame is bound to a global name before the call because dfinfo reports that name.
dfalz = pd.read_csv('/gdrive/MyDrive/data/alzheimer_data.csv')
dfinfo(dfalz)

... dataframe dfalz has 284142 records and 31 variables


Unnamed: 0,Variables,Order,Excel,Class,Type,NA_Tot,NA_PC,Unique_Tot,Unique_PC,Min,Max,Mean,Skew
0,RowId,1,A,object,string,0,0%,36046,13%,BRFSS~2015~2015~01~Q01~TNC01~AGE~GENDER,BRFSS~2022~2022~9004~Q46~TOC10~AGE~RACE,-,-
1,YearStart,2,B,int64,numeric,0,0%,8,0%,2015,2022,2018.6,-0.1
2,YearEnd,3,C,int64,numeric,0,0%,8,0%,2015,2022,2018.7,-0.1
3,LocationAbbr,4,D,object,string,0,0%,59,0%,AK,WY,-,-
4,LocationDesc,5,E,object,string,0,0%,59,0%,Alabama,Wyoming,-,-
5,Datasource,6,F,object,string,0,0%,1,0%,BRFSS,BRFSS,-,-
6,Class,7,G,object,string,0,0%,7,0%,Caregiving,Smoking and Alcohol Use,-,-
7,Topic,8,H,object,string,0,0%,39,0%,Arthritis among older adults,Up-to-date with recommended vaccines and scree...,-,-
8,Question,9,I,object,string,0,0%,39,0%,Average of 20 or more hours of care per week p...,Severe joint pain due to arthritis among older...,-,-
9,Data_Value_Unit,10,J,object,string,0,0%,2,0%,%,Number,-,-


# Housing data

In [None]:
# Reads from the mounted Drive (/gdrive), so the drive.mount cell must have run first.
# The frame is bound to a global name before the call because dfinfo reports that name.
dfhou = pd.read_csv('/gdrive/MyDrive/data/housing.csv')
dfinfo(dfhou)

... dataframe dfhou has 1460 records and 81 variables


Unnamed: 0,Variables,Order,Excel,Class,Type,NA_Tot,NA_PC,Unique_Tot,Unique_PC,Min,Max,Mean,Skew
0,Id,1,A,int64,numeric,0,0%,1460,100%,1,1460,730.5,0.0
1,MSSubClass,2,B,int64,numeric,0,0%,15,1%,20,190,56.9,1.4
2,MSZoning,3,C,object,string,0,0%,5,0%,C (all),RM,-,-
3,LotFrontage,4,D,float64,numeric,259,18%,110,8%,21.0,313.0,70.0,2.2
4,LotArea,5,E,int64,numeric,0,0%,1073,73%,1300,215245,10516.8,12.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,MoSold,77,BY,int64,numeric,0,0%,12,1%,1,12,6.3,0.2
77,YrSold,78,BZ,int64,numeric,0,0%,5,0%,2006,2010,2007.8,0.1
78,SaleType,79,CA,object,string,0,0%,9,1%,COD,WD,-,-
79,SaleCondition,80,CB,object,string,0,0%,6,0%,Abnorml,Partial,-,-
