In [None]:
## Preload libraries
import numpy as np
import pandas as pd
from ipython import display as display
from ipython import pretty_display as pdisplay
from matplotlib import pyplot as plt

## Create RNG generator
rng = np.random.default_rng()
try:
    rng_integers = rng.integers
except AttributeError:
    rng_integers = rng.randint

## Preset numpy printing_options
np.set_printoptions(suppress=True)      # disable scientific notation
np.set_printoptions(edgeitems=6)        # show more elements

In [None]:
## numpy helper functions
def check_nan(npa):
    result = list(filter(lambda x: (x == True), np.isnan(npa)))
    if len(result) > 0:
        return True
    else:
        return False

def _rng_int_stream(low=0, high=100, count=10):
    """
        Return a stream of random integers
        low: the min integer to generate from pool (inclusive)
        high: the max integer to generate from pool (exclusive)
        count: the number of integers to generate
        method: how to provide the values back to the calling function
            - list: return a list() to the caller
            - yield: return values using the yield keyword
    """
    # coerce arguments to int()
    low, high, count = [int(x) for x in [low, high, count]]
    for _ in range(0,count):
        yield rng_integers(low,high)

def rng_int_iter(low=0, high=100, count=10):
    return list(_rng_int_stream(low, high, count))

def rng_int_gen(low=0, high=100, count=10):
    return _rng_int_stream(low, high, count)

def npa_details(npa):
    dic = {
        'type': type(npa),
        'ndim': npa.ndim,
        'shape': npa.shape,
        'size': npa.size,
        'dtype': npa.dtype,
        'itemsize': npa.itemsize,
        'nbytes': npa.nbytes,
        'data': npa.data,
    }
    for k,v in dic.items():
        print(f"The {k} of the numpy array is: {v}")
    print("print(np_array):")
    print(npa)
    print()
    return dic

def np2d_rows(npa):
    rows = [i for i in npa]
    for idx,item in enumerate(rows):
        print(f"Index({idx}): {item}")
    return rows

In [None]:
## pandas helper functions
def pandaColumns(pdf):
    dic = {idx:i for idx,i in enumerate(df.columns)}
    _ = {print(f"Index[{k}] => {v}") for k,v in dic.items()}
    return dic

# Pandas
Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. Pandas provides high performance, fast, easy to use data structures. Pandas is built on the numpy library.

## Differences from numpy
|PANDAS|NUMPY|
|---|---|
|Working with tabular data|Working with numeric data|
|Data frame and Series|Multidimensional Arrays|
|Pandas consumes more memory|Numpy is memory efficient|
|Pandas typically has a better performance when number of rows is 500K or more.|Numpy typically has a better performance when number of rows is 50K or less.|
|Indexing of the pandas series is very slow as compared to numpy arrays.|Indexing of numpy Arrays is very fast.|


In [None]:
import os

def get_file_list(starting_directory=None):
    assert starting_directory is not None
    final_list = list()
    files = os.listdir(starting_directory)
    for f in files:
        file_name = os.path.join(starting_directory, f)
        if os.path.isdir(file_name):
            final_list = final_list + get_file_list(file_name)
        else:
            final_list.append(file_name)
    return final_list

directory = "C:\\Users\\tbennett\\Desktop\\SEC595\\ISO Contents\\Exercises\\data\\Day 2\\Backblaze\\data_Q4_2020\\"
all_files = get_file_list(directory)
print(len(all_files))
print(all_files[:5], all_files[-5:])

Pandas provides the `read_csv()` convenience function on the class.

Lets load some csv data and view it

In [53]:
df = pd.read_csv(all_files[0])
methods = [
    '_repr_html_',
    '_repr_json_',
    '_repr_jpeg_',
    '_repr_png_',
    '_repr_svg_',
]
valid_opts = {}
for m in methods:
    print(f"Trying Bound Method: {m}")
    try:
        # Create a function pointer to the bound method
        a = getattr(df, m)
        # If we don't throw exception, then we can use it
        print(f"[*] Object has bound method {m}")
        valid_opts[m] = a
    except AttributeError:
        print(f"[!] Object does not have bound method {m}")
        pass

#for k,v in valid_opts.items():
#    print(k)
#    print(v())

pdisplay(pd)
#print(valid_opts)
#print(display(df))
#pandaColumns(df)

Trying Bound Method: _repr_html_
[*] Object has bound method _repr_html_
Trying Bound Method: _repr_json_
[!] Object does not have bound method _repr_json_
Trying Bound Method: _repr_jpeg_
[!] Object does not have bound method _repr_jpeg_
Trying Bound Method: _repr_png_
[!] Object does not have bound method _repr_png_
Trying Bound Method: _repr_svg_
[!] Object does not have bound method _repr_svg_


NameError: name 'pdisplay' is not defined