# Statistical Characteristics

[Methods for describing a set of data](https://bookdown.org/thomas_pernet/Tuto/methods-for-describing-a-set-of-data.html)

In [2]:
# WARNING CONTROL: display or ignore all warnings.
# Switch the filter action between 'default' and 'ignore'.
import warnings
warnings.simplefilter('default')

# Debug flag: True shows extended error messages;
# set it to False to turn off debugging mode.
debug = True

In [3]:
# Load data: read the COVID sample CSV into a pandas DataFrame and preview it.
import os

import pandas as pd

# Directory holding the sample data.
# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# machine where this directory exists; consider a project-relative location.
file_dir = "/Users/thiaradealwis/Desktop/Workspace/rezgate/sample_data/"
# Name of the CSV file inside file_dir.
file_name = "covid.csv"
# Full path to the file; os.path.join gives the same result whether or not
# file_dir ends with a path separator.
file_path = os.path.join(file_dir, file_name)
# True: the first row of the file holds the column names.
# False: the file has no header row and columns are numbered instead.
first_row_header = True

print("read data from %s" % file_path)

# Read the data set, honouring first_row_header (previously the flag was set
# but never passed to the reader), then print the first 5 rows.
data_df = pd.read_csv(file_path, header=0 if first_row_header else None)
print(data_df.head(5))

read data from /Users/thiaradealwis/Desktop/Workspace/rezgate/sample_data/covid.csv
      dateRep  day  month  year  cases  deaths countriesAndTerritories geoId  \
0  24/10/2020   24     10  2020     61       2             Afghanistan    AF   
1  23/10/2020   23     10  2020    116       4             Afghanistan    AF   
2  22/10/2020   22     10  2020    135       2             Afghanistan    AF   
3  21/10/2020   21     10  2020     88       2             Afghanistan    AF   
4  20/10/2020   20     10  2020     87       5             Afghanistan    AF   

  countryterritoryCode  popData2019 continentExp  \
0                  AFG   38041757.0         Asia   
1                  AFG   38041757.0         Asia   
2                  AFG   38041757.0         Asia   
3                  AFG   38041757.0         Asia   
4                  AFG   38041757.0         Asia   

   Cumulative_number_for_14_days_of_COVID-19_cases_per_100000  
0                                           2.586631      

## Statistics of Numerical columns

### import wrangler data stats package
Activate and import the wrangler/DataStatistics package

__TODO__ enhance wrangler as a package to include all classes as subclasses
Then be able to

```from wrangler import stats as ds```

In [4]:
import json
import sys

# Put the sibling wrangler directory on the module search path, then load
# its stats module under the alias used throughout this notebook.
sys.path.insert(1, '../wrangler')
import stats as ds

# In debug mode, re-execute the stats module via importlib.reload.
if debug:
    import importlib
    ds = importlib.reload(ds)

# Create the statistics helper used by the following cells.
data_name = "covid stats"
cls_stats = ds.DataStatistics(name=data_name)
# List the attributes and methods available on the instance.
print(dir(cls_stats))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_dist_types', '_distributions', '_dup_method', '_l_dup_methods', 'count_duplicates', 'count_nulls', 'fit_distributions', 'get_central_tendency', 'get_data_types', 'name', 'test_uniformity']


### Data Types
Returns all the variables and their data types

In [5]:
# Reset the result holder before the call.
stat_dict = dict()
# Ask the wrangler DataStatistics instance for the data type of every variable.
stat_dict = cls_stats.get_data_types(data_df)
print(json.dumps(stat_dict, indent=4, sort_keys=False))

{
    "Data Types": {
        "dateRep": "object",
        "day": "int64",
        "month": "int64",
        "year": "int64",
        "cases": "int64",
        "deaths": "int64",
        "countriesAndTerritories": "object",
        "geoId": "object",
        "countryterritoryCode": "object",
        "popData2019": "float64",
        "continentExp": "object",
        "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000": "float64"
    }
}


### Count Nulls
Returns the number of Nulls (NaN) values for each Column


In [6]:
# Reset the result holder before the call.
stat_dict = dict()
# Count the null (NaN) values per column with the wrangler helper; any nulls
# found would have to be taken into account in the analysis.
stat_dict = cls_stats.count_nulls(data_df)
print(json.dumps(stat_dict, indent=4, sort_keys=False))

{
    "NaN Counts": {
        "dateRep": 0,
        "day": 0,
        "month": 0,
        "year": 0,
        "cases": 0,
        "deaths": 0,
        "countriesAndTerritories": 0,
        "geoId": 224,
        "countryterritoryCode": 64,
        "popData2019": 64,
        "continentExp": 0,
        "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000": 2790
    }
}


### Count Duplicate rows/columns
Returns the number of duplicate rows and columns. A duplicate row or column is one whose values are all identical to those of another row or column.

In [7]:
# Reset the result holder before the call.
stat_dict = dict()
# Duplicate checks to run: across rows and across columns.
_l_meth = ["rows", "columns"]
# Count duplicate rows/columns with the wrangler helper; duplicates may be
# cleaned from the dataset before analysis.
stat_dict = cls_stats.count_duplicates(data_df, _l_meth)
print(json.dumps(stat_dict, indent=4, sort_keys=False))

rows
columns
{
    "Number of Duplicate Rows": 0,
    "Number of Duplicate columns": 0
}


### Central Tendency
Returns the mean, standard deviation, mode, and median for all numeric columns. The cell below prints only the mean, variance, and standard deviation.

In [20]:
# Reset the result holder before the call.
stat_dict = dict()
# Compute the central-tendency statistics with the wrangler helper.
stat_dict = cls_stats.get_central_tendency(data_df)
# Show only the mean, variance and standard deviation entries of the result.
# NOTE(review): in the recorded output the column labels look shifted against
# the values (e.g. "month" has a mean of 2019.9987 and the object-typed
# "dateRep" has a mean at all) -- verify how get_central_tendency labels
# its results.
print(
    json.dumps(
        dict(
            (label, stat_dict[label])
            for label in ("Mean", "Variance", "Standard Deviation")
        ),
        indent=4,
        sort_keys=False,
    )
)

{
    "Mean": {
        "dateRep": 15.7593,
        "day": 6.2029,
        "month": 2019.9987,
        "year": 828.3193,
        "cases": 22.4387,
        "deaths": 41976917.4098,
        "countriesAndTerritories": 41.2137
    },
    "Variance": {
        "dateRep": 76.0458,
        "day": 6.2298,
        "month": 0.0013,
        "year": 23035342.0977,
        "cases": 15669.1221,
        "deaths": 2.4220148296667096e+16,
        "countriesAndTerritories": 9215.2935
    },
    "Standard Deviation": {
        "dateRep": 8.7204,
        "day": 2.496,
        "month": 0.0362,
        "year": 4799.5148,
        "cases": 125.1764,
        "deaths": 155628237.4657,
        "countriesAndTerritories": 95.9963
    }
}


### Uniformity test
Returns the Kolmogorov–Smirnov (KS) test outputs

__TODO__ define what the p-value explains

In [9]:
# Reset the result holder before the call.
stat_dict = dict()
# Run the wrangler uniformity test on the data set.
stat_dict = cls_stats.test_uniformity(data_df)
# json.dumps writes NaN results as bare NaN tokens (allow_nan defaults to
# True); strict JSON parsers reject that output.
print(json.dumps(stat_dict, indent=4, sort_keys=False))

{
    "KS Test Statistic": {
        "day": 0.9449,
        "month": 0.9367,
        "year": 1.0,
        "cases": 0.6064,
        "deaths": 0.4998,
        "popData2019": NaN,
        "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000": NaN
    },
    "KS Test P-value": {
        "day": 0.0,
        "month": 0.0,
        "year": 0.0,
        "cases": 0.0,
        "deaths": 0.0,
        "popData2019": NaN,
        "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000": NaN
    }
}


### Distributions
Returns the set of goodness-of-fit indicators for a given set of distributions, the popular distributions, or all distributions

In [21]:
%%capture
import pandas as pd

stat_summary_df = pd.DataFrame([])
stat_summary_df = cls_stats.fit_distributions(data_df,["popular"])

In [22]:
# Preview the first five rows of the distribution-fit summary.
print(stat_summary_df.head(n=5))

        distr     score  LLE          loc      scale  \
0     uniform  0.034731  NaN          1.0       30.0   
1        beta   0.03638  NaN     0.365867  30.634133   
2  genextreme  0.041097  NaN    13.286215   9.114977   
3    dweibull  0.041849  NaN    15.512784   8.348698   
4    loggamma  0.041854  NaN -1092.505631   182.3433   

                                       arg column name  
0                                       ()         day  
1  (0.980385889563673, 0.8400162122089201)         day  
2                   (0.42385266617666784,)         day  
3                    (1.6384588651708496,)         day  
4                    (436.60984085206525,)         day  
