## General Utility 

In [2]:
import numpy as np
from collections import Counter
import math
from main import *

In [26]:
# -----------------------------------------------
#   LOGIC FUNCTIONS
# -----------------------------------------------
def isNone(var,then=None,els=None):
    """
    @Description: Tell whether `var` is None without using `==`, which misbehaves
    on lists/arrays (the comparison becomes element-wise).

    @Parameters:
        - var: value to inspect
        - then: optional replacement returned when `var` is None
        - els: optional replacement returned when `var` is not None.
          It is only consulted when `then` is also provided.

    @Returns:
        - `then` not provided: a plain bool (True when `var` is None)
        - `then` provided and `var` is None: `then`
        - `then` provided and `var` is not None: `els` if provided, otherwise `var`
    """
    var_missing = var is None
    # Without a `then` replacement the function acts as a pure predicate
    if then is None:
        return var_missing
    if var_missing:
        return then
    # `var` holds a value: hand back the alternative if there is one
    return var if els is None else els

def converse(var,choices):
    """
    Return the "other" option of a two-element `choices` sequence.

    `choices[0]` is returned when `var` equals `choices[1]`; in every other case
    (including when `var` matches neither option) `choices[1]` is returned.
    """
    assert len(choices)==2, "The converse of more than 2 choices is ambiguous"
    if var == choices[1]:
        return choices[0]
    return choices[1]


def __test__():
    """Print the result of isNone for a None and a non-None value with various replacements."""
    missing, present = None, "Not None"
    cases = [
        (missing, {}),
        (missing, {"then": 2}),
        (present, {"then": 2}),
        (present, {"then": 2, "els": 3}),
    ]
    for value, options in cases:
        print(isNone(value, **options))

__test__()

True
2
Not None
3


In [70]:
# -----------------------------------------------
#   ARRAY MANIPULATION
# -----------------------------------------------
def is_unique(arr,rate=False):
    """
    Check whether every element of an array-like is distinct.

    Parameters:
        - arr: array-like of any shape; it is flattened before counting so that the
          number of unique values and the number of elements are measured consistently
        - rate: when True, return the fraction of unique elements instead of a bool

    Returns:
        - rate=False: True when no element is repeated
        - rate=True: (number of unique elements) / (number of elements);
          an empty input yields 1.0 (nothing is repeated)
    """
    flat = np.asarray(arr).ravel()
    n_unique = len(np.unique(flat))
    if rate:
        # Guard the division: an empty input has no duplicates
        return n_unique/flat.size if flat.size else 1.0
    return n_unique == flat.size

def overlap(*arrs):
    """
    Intersect any number of iterables.

    Returns an array holding the elements present in every argument.
    The element order follows set iteration and is therefore not guaranteed.
    At least one argument is required (indexing the first one fails otherwise).
    """
    common = set(arrs[0])
    for other in arrs[1:]:
        common = common & set(other)
    return np.array(list(common))

def union(*arrs):
    """
    Merge any number of iterables into one collection of distinct elements.

    Returns an array of every element that appears in at least one argument
    (an empty array when called without arguments). Order is not guaranteed.
    """
    merged = set()
    for members in arrs:
        merged = merged | set(members)
    return np.array(list(merged))

def difference(*arrs,how="outer"):
    """
    Return, as an array, the elements left after removing the overlap set
    (the elements shared by ALL inputs) from a reference set.

    Parameters:
        - how = `"outer"`: reference is the union of all inputs, i.e. the result holds
          the elements that do not appear in every input
        - how = `"left"`: reference is the first input
        - how = `"right"`: reference is the last input

    Any other `how` raises KeyError.
    NOTE(review): for "left"/"right" only the elements common to every input are
    removed, which differs from "minus each other set" when there are 3+ inputs.
    """
    everything = set(union(*arrs))
    shared = set(overlap(*arrs))
    if how == "outer":
        remaining = everything - shared
    elif how == "left":
        remaining = set(arrs[0]) - shared
    elif how == "right":
        remaining = set(arrs[-1]) - shared
    else:
        raise KeyError(how)
    return np.array(list(remaining))

def label_counts(arr,labels=None):
    """ 
    @Description: Count how often each label occurs in a list-like object.
    @Parameters:
        - arr: list-like object (LLO); flattened to 1D before counting
        - labels: labels to be included in the counting, even those not in the LLO
          (those get a count of 0). When omitted, the distinct values of `arr` are used.
          Labels are always reported in sorted order, regardless of the order given.
    @Returns: a `DictObj` named "Label Counts" with, in this order,
        - counts: occurrences of each label, aligned with `labels`
        - labels: the sorted labels
        - total_count: sum of `counts`
        - num_classes: number of labels
    """
    arr_copy = as_1d_array(arr)
    if isNone(labels):
        labels = np.sort(np.unique(arr_copy))
    else:
        labels = np.sort(as_1d_array(labels))
    # Tally once; a Counter returns 0 for labels that never occur
    tally = Counter(arr_copy)
    counts = np.array([tally[lab] for lab in labels])

    lab_cnt_obj = DictObj("Label Counts",
        counts = counts,
        labels = labels,
        total_count = np.sum(counts),
        num_classes = len(labels),
    )
    return lab_cnt_obj


def __test__():
    """Print label_counts results for a seeded integer array and for a list of characters."""
    # Numeric labels
    np.random.seed(1)
    numbers = np.random.randint(3,size=20)
    print(numbers)
    numbers_cnt = label_counts(numbers)
    print(numbers_cnt)
    print(numbers_cnt.values(["counts","labels"]))
    # String labels
    letters = list("abbbabccaabc")
    print(letters)
    print(label_counts(letters))

__test__()

[1 0 0 1 1 0 0 1 0 1 0 2 1 2 0 2 1 2 0 0]
Label Counts {'counts': array([9, 7, 4]), 'labels': array([0, 1, 2]), 'total_count': 20, 'num_classes': 3}
dict_values([array([9, 7, 4]), array([0, 1, 2])])
['a', 'b', 'b', 'b', 'a', 'b', 'c', 'c', 'a', 'a', 'b', 'c']
Label Counts {'counts': array([4, 5, 3]), 'labels': array(['a', 'b', 'c'], dtype='<U1'), 'total_count': 12, 'num_classes': 3}


In [28]:
# -----------------------------------------------
#   DATA TYPES MANIPULATION
# -----------------------------------------------

def dict_subset(dict_obj, keys):
    """
    Build a new dict holding only the requested keys of `dict_obj`.

    `keys` is iterated element by element (so a string is split into characters);
    a key missing from `dict_obj` raises KeyError.
    """
    picked = {}
    for key in list(keys):
        picked[key] = dict_obj[key]
    return picked


In [29]:
def enc_str_fr_np(arr,sep=',',br="[|]"):
    """
    Encode a 1D array-like as a string.

    Parameters:
        - arr: 1D array-like; every element is converted with numpy's `astype(str)`
        - sep: separator placed between elements; None means no separator
        - br: bracket specification; its first and last characters wrap the result.
          None or an empty string means no brackets.

    Returns: the encoded string, e.g. `"[0.0,1.0]"` with the defaults.
    """
    if sep is None:
        sep = ""
    body = sep.join(np.array(arr).astype(str))
    # An empty `br` has no first/last character to index, so treat it like None
    if not br:
        return body
    return f"{br[0]}{body}{br[-1]}"

# def np_fr_str(string,dtype=int):
#     return np.array(list(string)).astype(dtype)

def dec_np_fr_str(string,dtype=int,sep=',',br="[|]"):
    """
    Decode an array from a string.

    Parameters:
        - string: encoded text, e.g. `"[1,2,3]"`
        - dtype: callable applied to every token to convert it (default `int`)
        - sep: separator between tokens; None or "" means every character is a token
        - br: characters stripped from both ends of `string` before splitting
          (passed to `str.strip`)

    Returns: a numpy array of the converted tokens. A string with no payload
    (such as `"[]"`) decodes to an empty array.
    """
    payload = string.strip(br)
    if not sep:
        # No separator: each character is its own token
        tokens = payload
    elif payload == "":
        # "".split(sep) would yield [""] and make dtype("") fail
        tokens = []
    else:
        tokens = payload.split(sep)
    return np.array([dtype(n) for n in tokens])

def __test__():
    """Print sample encodings from enc_str_fr_np and sample decodings from dec_np_fr_str."""
    # Encoding: default separator/brackets first, then neither
    floats = [0.0,1.0,1.0,0.0,1.0,0]
    for enc_args in [(), (None,None)]:
        print(enc_str_fr_np(floats,*enc_args))
    # Decoding: one digit per character, then a bracketed comma-separated list
    digits = "13456"
    print(dec_np_fr_str(digits,sep=None))
    bracketed = "[1,2,3,4]"
    print(dec_np_fr_str(bracketed))

__test__()

[0.0,1.0,1.0,0.0,1.0,0.0]
0.01.01.00.01.00.0
[1 3 4 5 6]
[1 2 3 4]


In [30]:
# -----------------------------------------------
#   MATH FUNCTIONS
# -----------------------------------------------
def clamp(val,lower=0,upper=1,default_nan=0):
    """
    @Description: Clamp a numerical value between lower and upper. 
    @Parameters:
        - val: scalar value to clamp
        - lower, upper: inclusive bounds
        - default_nan: value returned when `val` is NaN
    @Returns: `val` limited to [lower, upper], or `default_nan` for NaN input
    """
    # NaN is the only value that differs from itself; comparisons against it are
    # always False, so min/max would otherwise silently fall back to `lower`
    if val != val:
        return default_nan
    return max(lower,min(val,upper))


# -----------------------------------------------
#   STRING FORMATTING
# -----------------------------------------------

def fmt_time(seconds):
    """
    Convert time in seconds to a string representation of the format hh:mm:ss

    Parameters:
        - seconds: number of seconds; fractional parts are truncated with `int()`

    Returns: zero-padded `hh:mm:ss`. Hours grow beyond two digits when needed
    (e.g. `100:00:00`) and negative durations are prefixed with `-`.
    """
    total = int(seconds)
    # Format the magnitude and re-attach the sign, so that negative input does not
    # produce a negative hour combined with wrapped-around minutes/seconds
    sign = "-" if total < 0 else ""
    minutes, odd_secs = divmod(abs(total), 60)
    hours, minutes = divmod(minutes, 60)
    return f'{sign}{hours:02d}:{minutes:02d}:{odd_secs:02d}'



## Helper Classes

In [31]:
from main import *
import time

# -----------------------------------------------
#   DICT LIKE DUMMY OBJECT
# -----------------------------------------------
class DictObj():
    """
    @Description: Object that serves as a namespace for related attributes within other classes.
    Values passed to the constructor or to `update()` are stored both as instance
    attributes and in the internal dictionary `dict_`. Attributes assigned directly
    (`obj.x = 1`) are NOT recorded in `dict_` and so do not show up in
    `dict()`, `values()`, `to_frame()` or `str()`.
    @Example:
        var1 = DictObj()\n
        var1.update(inputs=[1,2,3])\n
        print(var1.inputs)\n
        >> [1, 2, 3]\n
        var2 = DictObj({'a':1,'b':2})\n
        print(var2.a,var2.b)\n
        >> 1 2\n
        var3 = DictObj("Named",a=1)\n
        print(var3)\n
        >> Named {'a': 1}

    """
    def __init__(self,__name__="My Object",**kwargs):
        """
        Parameters:
            - __name__: display name used by `str()`. For convenience a dict may be
              passed here instead; it is then used as initial values (its keys must
              be strings) and the default name is kept.
            - kwargs: initial key/value pairs
        """
        if isinstance(__name__,dict):
            # Support the documented `DictObj({...})` form
            kwargs = {**__name__,**kwargs}
            __name__ = "My Object"
        self.__name__ = __name__
        self.dict_ = {}
        self.update(**kwargs)

    def update(self,**kwargs):
        """
        Add or overwrite key/value pairs, mirroring each one as an attribute.
        NOTE: a key named like a method (e.g. `values`) replaces that method on this instance.
        """
        if kwargs:
            self.dict_.update(kwargs)
            for key,value in kwargs.items():
                setattr(self,key,value)
    
    def dict(self,keys=None):
        """
        Return the key-values dictionary for all or a subset of attributes.
        With `keys=None` the internal dictionary itself is returned (not a copy).
        """
        if keys is None:
            return self.dict_
        else:
            return dict_subset(self.dict_,keys)

    def to_frame(self,keys=None,index=None):
        """
        Return the DataFrame for all or a subset of attributes.
        `index` defaults to `[0]`, i.e. a single-row frame.
        """
        index = [0] if index is None else index
        return pd.DataFrame(self.dict(keys),index=index)

    def values(self,keys=None):
        """
        Return the values for all or a subset of attributes
        """
        return self.dict(keys).values()

    def __str__(self):
        return f'{self.__name__} {self.dict_}'

def __test__():
    """Print a DictObj after construction and updates, then its dict() and values() views."""
    # Construction and __str__
    obj = DictObj(k1=1,k2=2,k3=3)
    print(obj)
    # update() with keyword arguments, then with an unpacked dict
    obj.update(k1=9,k4=10)
    print(obj)
    obj.update(**{'k2':11})
    print(obj)
    # Full and partial views
    subset = ["k1","k2","k4"]
    print(obj.dict())
    print(obj.dict(subset))
    print(obj.values())
    print(obj.values(subset))

__test__()

My Object {'k1': 1, 'k2': 2, 'k3': 3}
My Object {'k1': 9, 'k2': 2, 'k3': 3, 'k4': 10}
My Object {'k1': 9, 'k2': 11, 'k3': 3, 'k4': 10}
{'k1': 9, 'k2': 11, 'k3': 3, 'k4': 10}
{'k1': 9, 'k2': 11, 'k4': 10}
dict_values([9, 11, 3, 10])
dict_values([9, 11, 10])


In [32]:
# -----------------------------------------------
#   TIMER FOR PROCESSES/TASKS
# -----------------------------------------------

class ProcessTimer:
    """
    Wall-clock timer (based on `time.time()`) that tracks several jobs by id.

    For each job it stores the start time (`start_`), the latest recorded time
    (`curr_`) and the recorded time before that (`prev_`, -1 until `record()` has
    been called once).
    """
    def __init__(self) -> None:
        self.start_ = {}
        self.prev_ = {}
        self.curr_ = {}
        self.NEW_ID = 0

    def start(self,job_id=None):
        """
        Start (or restart) timing a job.

        When `job_id` is None the current value of `NEW_ID` is used. `NEW_ID` is
        incremented on every call. Returns the job id that was used, so that
        auto-assigned ids can be passed to the other methods.
        """
        job_id = self.NEW_ID if job_id is None else job_id
        now = time.time()
        self.start_[job_id] = now
        self.curr_[job_id] = now
        self.prev_[job_id] = -1
        self.NEW_ID += 1
        return job_id

    def record(self,job_id=0):
        """
        Mark a checkpoint: the previous checkpoint becomes `prev_` and now becomes `curr_`.
        Raises KeyError when the job was never started.
        """
        self.prev_[job_id] = self.curr_[job_id]
        self.curr_[job_id] = time.time()

    def execute(self,func,job_id=-1,**func_args):
        """
        Record the time for executing a function.

        Parameters
        ---------
        func: callable invoked as `func(**func_args)`
        job_id: id under which the timing is stored

        Return 
        ---------
        The elapsed time alone when `func` returns None, otherwise the tuple
        `(elapsed time, return value)`
        """
        self.start(job_id)
        return_val = func(**func_args)
        self.record(job_id)
        if return_val is None:
            return self.time_elapsed(job_id)
        else:
            return self.time_elapsed(job_id),return_val

    def step_elapsed(self,job_id=0):
        """
        Seconds between the two most recent checkpoints of a job.
        Returns -1 for an unknown job or when fewer than two checkpoints exist
        (i.e. `record()` has not been called since `start()`).
        """
        if job_id not in self.prev_ or job_id not in self.curr_:
            return -1
        # -1 is the "no previous checkpoint" marker set by start(); subtracting it
        # would produce a meaningless duration
        if self.prev_[job_id] == -1:
            return -1
        return self.curr_[job_id] - self.prev_[job_id]

    def time_elapsed(self,job_id=0):
        """
        Seconds between the start of a job and its latest checkpoint (-1 for an unknown job).
        """
        if job_id not in self.start_ or job_id not in self.curr_:
            return -1
        return self.curr_[job_id] - self.start_[job_id]
        


## Pandas Extension

In [33]:
from typing import Iterable, Tuple
import numpy as np
import pandas as pd
from main import *

#----------------------------------------
#   EXTRACT DATA FRAME INFORMATION
#----------------------------------------

# def summary(df:pd.DataFrame or pd.Series) -> pd.DataFrame or pd.Series:
#     """
#     Summary table for each columns in the dataframe
#     """
#     df_sum = pd.DataFrame(index=df.columns,columns=["dtypes","length","unique","samples","mode","range","mean","std","fill"])
#     df_sum.index.name = "columns"
#     for c in df.columns:
#         df_c_notna = df[c][df[c].notna()]
#         df_sum.loc[c,"dtypes"] = df[c].dtypes
#         df_sum.loc[c,"samples"] = (list(df[c].value_counts(ascending=False).index.values[:5]))
#         df_sum.loc[c,"length"] = len(df[c])
#         df_sum.loc[c,"unique"] = len(df[c].unique())
#         df_sum.loc[c,"mode"] = [] if len(df_c_notna.mode()) == 0 else as_1d_array(df_c_notna.mode())[0]
#         df_sum.loc[c,"fill"] = np.round(len(df_c_notna)/len(df[c]),2)
#         if isinstance(df[c].dtype,(type(np.dtype("float64")),type(np.dtype("int64")))):
#             df_sum.loc[c,"range"] = np.round([df_c_notna.min(),df_c_notna.max()],4)
#             df_sum.loc[c,"mean"] = np.round(df_c_notna.mean(),4)
#             df_sum.loc[c,"std"] = np.round(df_c_notna.std(),4)
#     return df_sum

def summary(df:pd.DataFrame or pd.Series) -> pd.DataFrame or pd.Series:
    """
    Build a per-column overview table of a DataFrame.

    The result is indexed by the column names of `df` and has the columns
    dtypes, length, unique (distinct values, as counted by `Series.unique`),
    samples (up to 5 values taken from `value_counts`), mode (first mode, or `[]`
    when there is none), range/mean/std (only filled for float64/int64 columns,
    computed on non-missing values, rounded to 4 decimals) and fill (share of
    non-missing values, rounded to 2 decimals).
    """
    stat_names = ["dtypes","length","unique","samples","mode","range","mean","std","fill"]
    df_sum = pd.DataFrame(index=df.columns,columns=stat_names)
    df_sum.index.name = "columns"
    numeric_dtype_classes = (type(np.dtype("float64")),type(np.dtype("int64")))

    for c in df.columns:
        column = df[c]
        # Descriptive entries available for every dtype
        df_sum.loc[c,"dtypes"] = column.dtypes
        top_values = column.value_counts(ascending=False).index.values[:5]
        df_sum.loc[c,"samples"] = list(top_values)
        df_sum.loc[c,"length"] = len(column)
        df_sum.loc[c,"unique"] = len(column.unique())
        modes = column.mode()
        if len(modes) == 0:
            df_sum.loc[c,"mode"] = []
        else:
            df_sum.loc[c,"mode"] = as_1d_array(modes)[0]

        # Statistics based on the non-missing values only
        present = column[column.notna()]
        df_sum.loc[c,"fill"] = np.round(len(present)/len(column),2)
        if not isinstance(column.dtype,numeric_dtype_classes):
            continue
        df_sum.loc[c,"range"] = np.round([present.min(),present.max()],4)
        df_sum.loc[c,"mean"] = np.round(present.mean(),4)
        df_sum.loc[c,"std"] = np.round(present.std(),4)
    return df_sum


In [34]:

def compare(df1 : pd.DataFrame or pd.Series, df2: pd.DataFrame or pd.Series,numeric: bool=False) -> pd.DataFrame or pd.Series:
    """
    Compare two DataFrames cell by cell over the rows and columns they share.

    Parameters:
        - numeric = False: every shared column is compared; a cell is True when both
          values are equal or both are missing
        - numeric = True: only columns that are numeric in both frames are kept and
          the result holds `df1 - df2`

    When df1's index (or columns) is fully contained in the shared labels, df1's own
    ordering is used; otherwise the order produced by `overlap` is used.
    """
    shared_rows = overlap(df1.index,df2.index)
    shared_cols = overlap(df1.columns,df2.columns)
    index = shared_rows if set(df1.index) != set(shared_rows) else df1.index
    columns = shared_cols if set(df1.columns) != set(shared_cols) else df1.columns
    left = df1.loc[index,columns]
    right = df2.loc[index,columns]
    df_comp = pd.DataFrame(index=index,columns=columns)

    if not numeric:
        for c in columns:
            lhs = left.loc[index,c]
            rhs = right.loc[index,c]
            same_value = (lhs == rhs)
            # Missing values never compare equal, so treat "both missing" as a match
            both_missing = ((lhs != rhs)
                            & (lhs.isna() == rhs.isna())
                            & lhs.isna()
                            & rhs.isna())
            df_comp.loc[index,c] = same_value | both_missing
        return df_comp.loc[index,columns]

    numeric_cols = []
    for c in columns:
        # Numeric-ness is judged on the full original columns
        if is_numeric(df1[c]) and is_numeric(df2[c]):
            numeric_cols.append(c)
            df_comp.loc[index,c] = (left.loc[index,c] - right.loc[index,c])
    return df_comp.loc[index,numeric_cols]

#----------------------------------------
# DATA FRAME MANIPULATION & TRANSFORMATION
#----------------------------------------

def filter(df: pd.DataFrame or pd.Series, condition: pd.DataFrame or pd.Series, filter_columns: bool=False) -> pd.DataFrame or pd.Series:
    """
    @Description: Keep the rows of a `DataFrame` for which every column of a boolean condition holds. Optionally also restrict the result to the condition's columns.
    @Parameters:
        - df: data to filter (a Series is converted to a one-column DataFrame)
        - condition: boolean Series/DataFrame, e.g. `df['A'] == 1` or `df[['B','C']] == [2,3]`. A row is selected only when all condition columns are True
        - filter_columns: False keeps all columns of `df`; True keeps only the columns named in `condition`
    @Returns: a DataFrame with the selected rows (and columns).
    NOTE(review): selected rows containing a missing value in any kept column are removed by the `dropna()` step; confirm this is intended.
    """
    frame = pd.DataFrame(df)
    mask_frame = pd.DataFrame(condition)
    n_cond_cols = len(mask_frame.columns.values)
    # Collapse a multi-column condition into one boolean per row: all columns must match
    if n_cond_cols >= 1:
        condition = (mask_frame.sum(axis=1) == n_cond_cols)

    # Columns to return, then the surviving row labels
    if filter_columns:
        kept_cols = mask_frame.columns.values
    else:
        kept_cols = frame.columns.values
    selected = frame.loc[:,kept_cols][condition]
    kept_rows = selected.dropna().index
    return frame.loc[kept_rows,kept_cols]

def per_class_sample(inputs: pd.DataFrame, targets: pd.DataFrame or pd.Series, 
                        sampling_dist: str or int or float or Iterable='min',random_state: int=None) -> Tuple[pd.DataFrame or pd.Series, pd.DataFrame or pd.Series]:
    """
    ### Description: 
    Sample inputs based on a distribution of target labels. By default will attempt to sample all classes equally according to the least populous class.

    ### Parameters:
    - inputs: data to sample from; must share its index with `targets`
    - targets: class label of every row
    - sampling_dist:
        - If sampling_dist is `None`: Sample all members of all classes.
        - If sampling_dist is `'min'`: Attempt to sample all classes equally according to the least populous class.
        - If sampling_dist is a single integer (>=1): Attempt to sample each class up to that many members
        - If sampling_dist is a single fraction (within [0,1)): Sample that proportion of each class
        - If sampling_dist is list-like: one entry per class, in the sorted label order reported by `label_counts`.
            - If a class distribution is `None`, all members of that class is sampled
            - If a class distribution is a fraction (within [0,1)), it will be understood as the class proportion
            - If a class distribution is an integer (>=1), it will be understood as class counts (capped at the class size)
    - random_state: seed forwarded to `DataFrame.sample`

    ### Returns:
    The tuple `(inputs, targets)` restricted to the sampled rows.

    ### Raises:
    `ValueError` for an unknown string option or a list whose length differs from the number of classes.
    """
    # Only the per-class counts and the labels are needed from the label summary
    counts,labels = label_counts(targets).values(["counts","labels"])

    # Convert sampling_dist into list of distribution if not already is
    if isNone(sampling_dist):
        sampling_dist = counts
    elif isinstance(sampling_dist,str):
        if sampling_dist != 'min':
            raise ValueError(f"Unknown sampling_dist option: {sampling_dist!r}")
        sampling_dist = min(counts)
    if np.ndim(sampling_dist) == 0:
        # Any scalar (Python or numpy number) applies to every class alike
        sampling_dist = np.full(shape = labels.shape,fill_value = sampling_dist)
    elif len(sampling_dist) != len(labels):
        raise ValueError(f"sampling_dist has {len(sampling_dist)} entries but there are {len(labels)} classes")

    sampled_index = pd.Index([])
    for labels_i,counts_i, sampling_dist_i in zip(labels,counts,sampling_dist):
        # Convert distribution values to actual class counts
        if isNone(sampling_dist_i): 
            dist_i = int(counts_i)
        elif 0 <= sampling_dist_i < 1:
            dist_i = int(counts_i * sampling_dist_i)
        else:
            dist_i = int(clamp(sampling_dist_i,0,counts_i))
        # Obtain samples with the labels_i   
        sampled_targets_i = filter(targets,targets==labels_i).sample(dist_i,random_state=random_state)
        sampled_index = sampled_index.append(sampled_targets_i.index)

    return inputs.loc[sampled_index], targets.loc[sampled_index]


def match(df_in: pd.DataFrame or pd.Series, oper: str ,values: Iterable[int],strict: bool=False) -> pd.DataFrame or pd.Series:
    """
    Apply a comparison operation against a list of values and keep the matching cells.

    Parameters:
        - df_in: DataFrame or Series to test
        - oper: one of "<=", "<", ">=", ">", "==", "!=" (anything else raises KeyError)
        - values: values to compare against. For "==" a cell matches when it equals
          ANY value; for every other operator it must satisfy the comparison for ALL values
        - strict: in strict mode, only keep rows in which every cell matched;
          otherwise drop only the rows in which no cell matched

    Returns: `df_in` masked by the match result (non-matching cells become missing)
    with rows dropped according to `strict`.
    """
    import operator  # local import: only this function needs it

    comparators = {
        "<=": operator.le, "<"  : operator.lt,
        ">=": operator.ge, ">"  : operator.gt,
        "==": operator.eq, "!=" : operator.ne,
    }
    # Look the operator up once so that only the requested comparison is evaluated
    compare_op = comparators[oper]

    if oper == "==":
        # Start from "nothing matches" and OR the individual matches together
        mult_result = (df_in != df_in)
        combine = operator.or_
    else:
        # Start from "everything matches" and AND the individual requirements together
        mult_result = (df_in == df_in)
        combine = operator.and_

    for val in np.array(values):
        mult_result = combine(mult_result,compare_op(df_in,val))

    df_out = df_in[mult_result].dropna(how="any" if strict else "all")
    return df_out


In [54]:
def as_1d_array(arr,dtype=None):
    """
    Convert any value into a 1D numpy array.

    Parameters:
        - arr: an N-dimensional iterable (flattened), or a single value such as a
          number, a string or None (wrapped into a one-element array)
        - dtype: optional dtype to cast the result to; when omitted numpy's own
          inferred dtype is kept

    Returns: a 1D numpy array.
    """
    # Check to see if the element is a ND Iterable that is also not a string
    is_nd_iter = isinstance(arr,Iterable) and len(np.shape(arr)) > 0 and not isinstance(arr,str)

    # Single values are wrapped without forcing `type(arr)` as dtype: that cast
    # fails for None and for arbitrary objects, while inference handles them
    arr_1d = np.ravel(arr) if is_nd_iter else np.array([arr])
    if dtype is None:
        return arr_1d
    return arr_1d.astype(dtype)

def __test__():
    """Print the 1D conversion of a tuple, a string, a nested list, a scalar and a list of characters."""
    samples = [
        (">> Regular iterable: ", (1,2,3,4)),
        (">> String: ", "1,2,3,4"),
        (">> ND-array/list: ", [[1,2,3],[4,5,6]]),
        (">> Non-iterable ", 1234),
        (">> Non numeric list  ", list("abbabccbaab")),
    ]
    for prefix, sample in samples:
        print(f"{prefix}{as_1d_array(sample)}")

__test__()        

>> Regular iterable: [1 2 3 4]
>> String: ['1,2,3,4']
>> ND-array/list: [1 2 3 4 5 6]
>> Non-iterable [1234]
>> Non numeric list  ['a' 'b' 'b' 'a' 'b' 'c' 'c' 'b' 'a' 'a' 'b']


In [71]:
#----------------------------------------
#   DATA TYPES
#----------------------------------------

def is_dtypes(df: pd.DataFrame or pd.Series ,dtypes : str or Iterable) -> bool:
    """
    Check if the data is of one of the given numpy dtypes
    - For `int` dtype, use `"int64"`
    - For `float` dtype, use  `"float64"`
    - For `object` dtype, use  `"O"`
    - For `boolean` dtype, use `"bool"`

    Parameters:
        - df: a DataFrame (every column must match), a Series, or any other
          iterable (converted with `np.array` and judged by the resulting dtype)
        - dtypes: a single dtype specifier or an iterable of them

    Returns: True when the dtype matches one of `dtypes`; False otherwise,
    including for inputs that are not iterable.
    """
    # Normalise to a list of specifiers, then to the numpy dtype classes to test against
    if isinstance(dtypes,(str,type,np.dtype)):
        dtype_specs = [dtypes]
    else:
        dtype_specs = list(dtypes)
    dtype_classes = tuple(type(np.dtype(typ)) for typ in dtype_specs)

    # A DataFrame is itself Iterable, so it has to be recognised before the
    # generic branch; otherwise its columns would be merged into one common dtype
    if isinstance(df, pd.DataFrame):
        return all(isinstance(col_dtype,dtype_classes) for col_dtype in df.dtypes)
    if isinstance(df, pd.Series):
        return isinstance(df.dtype,dtype_classes)
    if isinstance(df, Iterable):
        return isinstance(np.array(df).dtype,dtype_classes)
    return False

def is_numeric(df: pd.DataFrame or pd.Series) -> bool:
    """
    Tell whether the data has a numeric numpy dtype, i.e. float64 or int64
    (delegates to `is_dtypes`).
    """
    numeric_dtype_names = ("float64","int64")
    return is_dtypes(df,numeric_dtype_names)


def __test__():
    """Print is_numeric / is_dtypes verdicts for DataFrame, Series, ndarray, list, iterator and tuple inputs."""
    np.random.seed(1)
    # DataFrame inputs: a 0/1 integer frame and its boolean counterpart
    ints_frame = pd.DataFrame(np.random.randint(2, size=(3,4)))
    print(ints_frame)
    flags_frame = ints_frame == 1
    print(flags_frame)
    print(f">> Should be True: {is_numeric(ints_frame)}")
    print(f">> Should be False: {is_numeric(flags_frame)}")

    # Series inputs: first column of each frame
    ints_col = ints_frame[0]
    flags_col = flags_frame[0]
    print(ints_col)
    print(f">> Should be True: {is_numeric(ints_col)}")

    print(flags_col)
    print(f">> Should be True: {is_dtypes(flags_col,'bool')}")

    # Plain numpy array
    ints_array = ints_col.values
    print(ints_array)
    print(f">> Should be True: {is_numeric(ints_array)}")

    # Python containers; a bare iterator is expected to be rejected
    ints_list = list(ints_col)
    print(ints_list)
    print(f">> Should be True: {is_numeric(ints_list)}")
    print(f">> Should be False: {is_numeric(iter(ints_list))}")
    print(f">> Should be True: {is_numeric(tuple(ints_list))}")


__test__()

   0  1  2  3
0  1  1  0  0
1  1  1  1  1
2  1  0  0  1
      0      1      2      3
0  True   True  False  False
1  True   True   True   True
2  True  False  False   True
>> Should be True: True
>> Should be False: False
0    1
1    1
2    1
Name: 0, dtype: int64
>> Should be True: True
0    True
1    True
2    True
Name: 0, dtype: bool
>> Should be True: True
[1 1 1]
>> Should be True: True
[1, 1, 1]
>> Should be True: True
>> Should be False: False
>> Should be True: True


## Summary Function

In [None]:
# Seed the global numpy generator so the demo data is reproducible
np.random.seed(1)
# 10x15 frame of random 0/1 integers
df_ones = pd.DataFrame(np.random.randint(2, size=(10,15)))
# Boolean counterpart: True wherever the integer frame holds 1
df_bool = df_ones == 1
# Bare expression, left as the last line of the cell to display the frame
df_ones

In [None]:
df_bool

In [None]:
def is_bool(df,binary_allowed=False):
    """
    Check if DataFrame, Series or Iterable is of dtypes boolean.  

    Parameters:
        - df: DataFrame, Series or other iterable
        - binary_allowed: when True, numeric data consisting only of values equal
          to True/False (i.e. 1/0) is accepted as boolean as well

    Returns: True when the dtype is bool, or when the value check is permitted
    (non-numeric data, or `binary_allowed`) and every element equals True or False.
    """
    # First check the dtype information about the object if available
    is_dtype_bool = is_dtypes(df,"bool") 

    # Also accept cases where the object is a binary matrix if binary_allowed
    can_check_count = binary_allowed or not is_numeric(df)  
    count_satisfied = False
    if isinstance(df,pd.DataFrame):
        count_satisfied = ((df == True)|(df == False)).sum().sum() == df.size
    elif isinstance(df,pd.Series):
        count_satisfied = ((df == True)|(df == False)).sum() == df.size
    elif isinstance(df,Iterable):
        arr = np.array(df)
        # Test every element directly. Comparing a sum of the values with the
        # length can be satisfied by non-boolean data (e.g. [-1, 3]) and
        # miscounts multi-dimensional input.
        count_satisfied = bool(np.all((arr == True)|(arr == False)))
    return is_dtype_bool or (can_check_count and count_satisfied) 
 
def __test__():
    """Print is_bool verdicts for integer/boolean data as DataFrame, Series, ndarray and list."""
    np.random.seed(1)
    ints_frame = pd.DataFrame(np.random.randint(4, size=(3,4)))
    print(ints_frame)
    flags_frame = ints_frame == 1
    print(flags_frame)

    # Same data stored with object dtype
    flags_frame_obj = flags_frame.astype(object)
    ints_frame_obj = ints_frame.astype(object)

    # DataFrame inputs
    print(f">> Should be False: {is_bool(ints_frame)}")
    print(f">> Should be True: {is_bool(flags_frame)}")
    print(f">> Should be True: {is_bool(flags_frame_obj)}")
    print(f">> Should be False: {is_bool(ints_frame_obj)}")

    # Series inputs
    ints_col, flags_col = ints_frame[0], flags_frame[0]
    print(f">> Should be False: {is_bool(ints_col)}")
    print(f">> Should be True: {is_bool(flags_col)}")

    # Iterable inputs: numpy arrays first, then plain lists
    ints_array, flags_array = ints_col.values, flags_col.values
    print(f">> Should be False: {is_bool(ints_array)}")
    print(f">> Should be True: {is_bool(flags_array)}")

    print(f">> Should be False: {is_bool(list(ints_array))}")
    print(f">> Should be True: {is_bool(list(flags_array))}")

__test__()

## Write and Read DataFrame

In [67]:
import os

def write_dataframe(df,write_to=None,**kwargs):
    """
    Write the dataframe according to the extension. Create the new path if necessary

    Parameters:
        - df: DataFrame to write
        - write_to: target path (str or path-like). Nothing is written when it is
          None or of another type. The extension (case-insensitive) picks the format:
          `xlsx` -> Excel, `tsv`/`tab` -> tab-separated, anything else -> CSV
        - kwargs: forwarded to `to_excel` / `to_csv`

    Returns: `df` itself, so the call can be chained.
    """
    if isinstance(write_to,(str,os.PathLike)):
        write_to = os.fspath(write_to)
        # A bare file name has no directory part; only create one when present
        directory = os.path.dirname(write_to)
        if directory:
            os.makedirs(directory,exist_ok=True)
        ext = write_to.split(".")[-1].lower() # get the extension
        if ext == 'xlsx':
            df.to_excel(write_to,**kwargs)
        elif ext in ['tsv','tab']:
            df.to_csv(write_to,sep='\t',**kwargs)
        else:
            df.to_csv(write_to,**kwargs)
    return df


def read_dataframe(filename,index=None,unnamed_col=False,**kwargs):
    """
    Read a dataframe according to the file extension.

    Parameters:
        - filename: path to read. The extension (case-insensitive) picks the reader:
          `xlsx` -> Excel, `tsv`/`tab` -> tab-separated, anything else -> CSV
        - index: name of a column to use as index; ignored when the column is absent
        - unnamed_col: what to do with a column called "Unnamed: 0"
          (typically a previously saved index):
            - `True`: keep it as is
            - `False`: drop it
            - a str/int/float: rename it to that value; when a column of that name
              already exists, it is renamed to `"index: <value>"` instead
        - kwargs: forwarded to `pd.read_excel` / `pd.read_csv`

    Returns: the loaded DataFrame.
    """
    ext = filename.split(".")[-1].lower() # get the extension
    if ext == 'xlsx':
        df = pd.read_excel(filename,**kwargs)
    elif ext in ['tsv','tab']:
        df = pd.read_csv(filename,sep='\t',**kwargs)
    else:
        df = pd.read_csv(filename,**kwargs)

    # Set index column if not None
    if index is not None and index in df.columns:
        df = df.set_index(index)

    # Handle the typical 'Unnamed: 0' column
    UNNAMED = "Unnamed: 0"
    if UNNAMED in df.columns:
        # Test for real booleans first: `1 == True` and `0 == False` hold in Python,
        # so equality checks would hijack the numeric column names 1 and 0
        if isinstance(unnamed_col,(bool,np.bool_)):
            if not unnamed_col:
                df = df.drop(columns=[UNNAMED])
        elif isinstance(unnamed_col,(str,int,float)):
            if unnamed_col in df.columns:
                unnamed_col = f"index: {unnamed_col}"
            df = df.rename(columns={UNNAMED:unnamed_col}) 

    return df

def __test__():
    """Print the sample CSV as read with each supported kind of `unnamed_col` value."""
    sample_file = "data/2x3x4.csv"
    # keep / rename to str / drop / rename to unused int / rename to int / rename to clashing str
    for option in [True, "index", False, 10, 3, '3']:
        print(read_dataframe(sample_file,unnamed_col=option))

__test__()

   Unnamed: 0  0  1  2  3
0           0  0  1  1  0
1           1  1  1  1  1
2           2  1  1  1  0
   index  0  1  2  3
0      0  0  1  1  0
1      1  1  1  1  1
2      2  1  1  1  0
   0  1  2  3
0  0  1  1  0
1  1  1  1  1
2  1  1  1  0
   10  0  1  2  3
0   0  0  1  1  0
1   1  1  1  1  1
2   2  1  1  1  0
   3  0  1  2  3
0  0  0  1  1  0
1  1  1  1  1  1
2  2  1  1  1  0
   index: 3  0  1  2  3
0         0  0  1  1  0
1         1  1  1  1  1
2         2  1  1  1  0


In [57]:
isinstance(True,int)

True

In [19]:

def random_dataframe(low=2,high=None,size=None,random_state=None):
    """
    Create a DataFrame of random integers drawn with `np.random.randint`.

    Parameters:
        - low, high: forwarded to `np.random.randint`; with `high=None` the values
          are drawn from [0, low)
        - size: shape of the generated data
        - random_state: seed passed to `np.random.seed`. Note that this reseeds
          numpy's GLOBAL generator on every call, also when it is None.
    """
    np.random.seed(random_state)
    return pd.DataFrame(np.random.randint(low,high=high,size=size))

random_dataframe(2,size=(10,15))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1,0,0,1,0,1,1,1,1,1,0,1,1,1,0
1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1
2,0,0,1,1,0,0,0,1,0,1,0,0,1,1,1
3,1,1,0,0,0,1,0,0,0,1,0,1,1,0,1
4,0,1,1,0,0,1,0,1,1,1,0,1,1,0,0
5,1,1,0,0,0,0,0,1,1,1,1,0,1,1,1
6,1,1,0,1,0,1,0,0,0,1,1,1,0,0,1
7,0,1,1,1,0,0,0,0,0,1,1,0,1,0,0
8,1,1,0,0,1,1,1,0,0,1,0,0,0,1,1
9,0,1,0,1,1,1,0,0,1,1,1,1,0,0,0


In [40]:
# Build a reproducible 3x4 frame of 0/1 values (seed 0) ...
df = random_dataframe(2,size=(3,4),random_state=0)
# ... and save it as the CSV file that the read_dataframe demo loads
write_dataframe(df,"data/2x3x4.csv")

Unnamed: 0,0,1,2,3
0,0,1,1,0
1,1,1,1,1
2,1,1,1,0


In [41]:
# Same shape as the previous fixture but generated with seed 1
df = random_dataframe(2,size=(3,4),random_state=1)
# Stored under a separate file name so both fixtures are kept
write_dataframe(df,"data/2x3x4_1.csv")

Unnamed: 0,0,1,2,3
0,1,1,0,0
1,1,1,1,1
2,1,0,0,1


In [51]:
# Read the fixture three ways: keep, drop, and rename the "Unnamed: 0" column.
# NOTE(review): presumably only the last expression is displayed by the notebook;
# the first two results are discarded.
read_dataframe("data/2x3x4.csv",unnamed_col=True)
read_dataframe("data/2x3x4.csv",unnamed_col=False)
read_dataframe("data/2x3x4.csv",unnamed_col="index")

Unnamed: 0,index,0,1,2,3
0,0,0,1,1,0
1,1,1,1,1,1
2,2,1,1,1,0
