# Statistical Characteristics

[Methods for describing a set of data](https://bookdown.org/thomas_pernet/Tuto/methods-for-describing-a-set-of-data.html)

In [1]:
# Warning control: choose whether warnings are displayed or ignored.
# Switch the filter action between 'default' (display) and 'ignore' (silence).
import warnings

warnings.simplefilter('default')

# Debug flag: True shows extended error messages; set it to False to turn
# debugging mode off.
debug = True

In [2]:
# Load the activity summary CSV into a DataFrame and preview its first rows.
import pandas as pd

file_dir = "../data/"
file_name = "activity_summary_by_procedure.csv"
file_path = file_dir + file_name
# True when the first row of the CSV holds the column names.
first_row_header = True

print("read data from %s" % file_path)
# Honor the header flag instead of leaving it unused: row 0 supplies the column
# names when it is set, otherwise pandas assigns integer column labels.
# With the flag True this matches the read_csv default, so results are unchanged.
data_df = pd.read_csv(file_path, header=0 if first_row_header else None)
# NOTE(review): the saved output shows an all-NaN "Unnamed: 0" column, which
# suggests a leading empty field in the CSV - confirm, and consider dropping it
# before computing statistics.
print(data_df.head(5))

read data from ../data/activity_summary_by_procedure.csv
   Unnamed: 0  Procedure   Charges  Payments  Adjustments  Units  Net Effect  \
0         NaN      90621   6844.45  -2603.29     -2055.93    182     2185.23   
1         NaN      90633   3192.29  -1073.18      -628.35    451     1490.76   
2         NaN      90648     40.07      0.00        -0.06      8       40.01   
3         NaN      90651  29131.58 -11270.92     -1979.75    498    15880.91   
4         NaN      90670  29305.09  -9001.35     -4884.57    738    15419.17   

   % of Net Effect  
0           0.0337  
1           0.0230  
2           0.0006  
3           0.2447  
4           0.2376  


## Statistics of Numerical columns

### import wrangler data stats package
Import the wrangler `stats` module and instantiate its `DataStatistics` class.

__TODO__ enhance wrangler as a package to include all classes as subclasses,
so that it can then be imported as:

```from wrangler import stats as ds```

In [3]:
# Make the local wrangler directory importable and create the statistics helper.
import sys
import json

# Add the wrangler directory to the module search path only once, so that
# re-running this cell does not keep inserting duplicate entries.
if '../wrangler' not in sys.path:
    sys.path.insert(1, '../wrangler')
import stats as ds

# In debug mode, reload the module so that the already-imported `stats` code is
# re-executed (useful while the wrangler source is being edited).
if debug:
    import importlib
    ds = importlib.reload(ds)

data_name = "activity_summary_by_procedure"
cls_stats = ds.DataStatistics(name=data_name)
# List the attributes and methods exposed by the statistics object.
print(dir(cls_stats))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_dist_types', '_distributions', '_dup_method', '_l_dup_methods', 'count_duplicates', 'count_nulls', 'fit_distributions', 'get_central_tendency', 'get_data_types', 'name', 'test_uniformity']


### Data Types
Returns all the variables and their data types

In [9]:
# Reset the shared result dict, then collect the data type of every column
# and print the result as indented JSON.
stat_dict = dict()
stat_dict = cls_stats.get_data_types(data_df)
print(
    json.dumps(stat_dict, indent=4, sort_keys=False)
)

{
    "Data Types": {
        "Unnamed: 0": "float64",
        "Procedure": "int64",
        "Charges": "float64",
        "Payments": "float64",
        "Adjustments": "float64",
        "Units": "int64",
        "Net Effect": "float64",
        "% of Net Effect": "float64"
    }
}


### Count Nulls
Returns the number of Null (NaN) values for each column


In [4]:
# Reset the shared result dict, then count the NaN values per column
# and print the result as indented JSON.
stat_dict = dict()
stat_dict = cls_stats.count_nulls(data_df)
print(
    json.dumps(stat_dict, indent=4, sort_keys=False)
)

{
    "NaN Counts": {
        "Unnamed: 0": 18,
        "Procedure": 0,
        "Charges": 0,
        "Payments": 0,
        "Adjustments": 0,
        "Units": 0,
        "Net Effect": 0,
        "% of Net Effect": 0
    }
}


### Count Duplicate rows/columns
Returns the number of duplicate rows and duplicate columns. Note that this counts duplicates, not empty rows or columns (empty rows or columns are those in which all values are Null).

In [5]:
# Reset the shared result dict, then count duplicates for both rows and columns
# and print the result as indented JSON.
stat_dict = dict()
_l_meth = [
    "rows",
    "columns",
]
stat_dict = cls_stats.count_duplicates(data_df, _l_meth)
print(
    json.dumps(stat_dict, indent=4, sort_keys=False)
)

rows
columns
{
    "Number of Duplicate Rows": 0,
    "Number of Duplicate columns": 0
}


### Central Tendency
Returns measures of central tendency and spread (such as the mean, standard deviation, variance, mode, and median) for all numeric columns

In [6]:
# Reset the shared result dict, then compute the central tendency statistics
# and print the result as indented JSON.
# NOTE(review): in the saved output the "Mode" and "Median" entries echo the
# column names instead of numeric values - this looks like a defect inside
# get_central_tendency; verify against the wrangler source.
stat_dict = dict()
stat_dict = cls_stats.get_central_tendency(data_df)
print(
    json.dumps(stat_dict, indent=4, sort_keys=False)
)

{
    "Mode": {
        "Unnamed: 0": "Unnamed: 0",
        "Procedure": "Procedure",
        "Charges": "Charges",
        "Payments": "Payments",
        "Adjustments": "Adjustments",
        "Units": "Units",
        "Net Effect": "Net Effect",
        "% of Net Effect": "% of Net Effect"
    },
    "Median": {
        "Unnamed: 0": "Unnamed: 0",
        "Procedure": "Procedure",
        "Charges": "Charges",
        "Payments": "Payments",
        "Adjustments": "Adjustments",
        "Units": "Units",
        "Net Effect": "Net Effect",
        "% of Net Effect": "% of Net Effect"
    },
    "Mean": {
        "Unnamed: 0": NaN,
        "Procedure": 90689.2778,
        "Charges": 7101.5317,
        "Payments": -2564.7417,
        "Adjustments": -931.5139,
        "Units": 306.2222,
        "Net Effect": 3605.2761,
        "% of Net Effect": 0.0556
    },
    "Variance": {
        "Unnamed: 0": NaN,
        "Procedure": 1141.0359,
        "Charges": 82828197.7285,
        "Payments"

### Uniformity test
Returns the KS (Kolmogorov–Smirnov) test outputs: the test statistic and the p-value for each column

__TODO__ define what the p-value explains

In [7]:
# Reset the shared result dict, then run the uniformity test on the data
# and print the KS statistic and p-value per column as indented JSON.
stat_dict = dict()
stat_dict = cls_stats.test_uniformity(data_df)
print(
    json.dumps(stat_dict, indent=4, sort_keys=False)
)

{
    "KS Test Statistic": {
        "Unnamed: 0": NaN,
        "Procedure": 1.0,
        "Charges": 1.0,
        "Payments": 0.8889,
        "Adjustments": 0.7778,
        "Units": 1.0,
        "Net Effect": 0.9444,
        "% of Net Effect": 0.5
    },
    "KS Test P-value": {
        "Unnamed: 0": NaN,
        "Procedure": 0.0,
        "Charges": 0.0,
        "Payments": 0.0,
        "Adjustments": 0.0,
        "Units": 0.0,
        "Net Effect": 0.0,
        "% of Net Effect": 0.0001
    }
}


### Distributions
Returns the set of goodness-of-fit indicators for a given set of distributions, the popular distributions, or all distributions

In [8]:
%%capture
import pandas as pd

stat_summary_df = pd.DataFrame([])
stat_summary_df = cls_stats.fit_distributions(data_df,["popular"])

In [9]:
# Show the first five rows of the distribution-fit summary.
print(stat_summary_df.head(n=5))

        distr     score  LLE           loc      scale                    arg  \
0    loggamma  0.007536  NaN  90678.882567  38.671839  (1.7782079782725533,)   
1  genextreme  0.007596  NaN  90681.305437  35.580005  (0.5034800886975362,)   
2    dweibull  0.007635  NaN  90690.812034  29.148464  (1.3874385606091502,)   
3           t  0.007791  NaN  90688.654684  37.015153   (602558314.3380122,)   
4        norm  0.007803  NaN  90689.277778  32.827505                     ()   

  column name  
0   Procedure  
1   Procedure  
2   Procedure  
3   Procedure  
4   Procedure  
