In [None]:
# Do not modify this cell
from base64 import b64decode
import json
# Each payload is a base64-encoded JSON document: decode the base64 text to
# ASCII, then parse it as JSON.
# NOTE(review): "<source_config>" and "<metadata>" look like placeholders that an
# external templating step replaces before the notebook runs — confirm.
source_config = json.loads(b64decode("<source_config>".encode("ascii")).decode("ascii"))
metadata = json.loads(b64decode("<metadata>".encode("ascii")).decode("ascii"))
# Echo both decoded payloads so the effective configuration shows in the cell output.
print("Source Config: {}".format(source_config))
print("Input Tables MetaData: {}".format(metadata))
# Make sure pandas_profiling is importable. If the first import fails, install
# it (plus imagehash) into the amazonei_tensorflow_p36 conda environment and
# import it again.
try:
    import pandas_profiling
except:
    # NOTE(review): the bare `except` also catches failures unrelated to a
    # missing package (e.g. KeyboardInterrupt); `except ImportError` would be
    # narrower.
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 --all -y
    !sudo /home/ec2-user/anaconda3/bin/conda install -c conda-forge -n amazonei_tensorflow_p36 pandas-profiling imagehash -y
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 ipywidgets -y
finally:
    # Runs on both paths: a no-op re-import when the first import succeeded,
    # the real import after the installation. If the package is still missing
    # the import error propagates from here.
    import pandas_profiling

In [None]:
import pandas as pd
import numpy as np

# Variables

### Input data's metadata -  (User input)

In [None]:
# Location that is listed on S3 when the input files are read.
input_data_location = source_config['input_s3_dir']
# Names of the timestamp, target-value and item-id columns; the exploration
# helpers use these constants to select the matching DataFrame columns.
TIMESTAMP_COLUMN_KEY = metadata['timestamp_column']
TARGET_VALUE_COLUMN_KEY = metadata['target_column']
ITEM_ID_COLUMN_KEY = metadata['item_id_column']
# Additional columns that are combined with the item id when unique forecasts
# are counted.
input_forecast_dimensions_column_keys = metadata['forecast_dimension_columns']
# Column names as listed in the metadata.
# NOTE(review): the loading cell reads metadata['columns'] directly, so this
# variable looks unused in the visible cells — confirm before removing.
columns=metadata["columns"]

# Read Data

In [None]:
import s3fs
fs = s3fs.S3FileSystem()

# Read every object listed under the input location into its own DataFrame.
# Loading stays best effort (an object that cannot be parsed as CSV is
# skipped), but each skipped object is reported and remembered instead of
# being dropped silently.
li = []
skipped_files = []
for file in fs.ls(input_data_location):
    print("reading file : ",file)
    try:
        li.append(pd.read_csv("s3://{}".format(file)))
    except Exception as error:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel during a long download is not swallowed.
        skipped_files.append(file)
        print("skipping unreadable file {}: {}".format(file, error))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be read.
if not li:
    raise ValueError(
        "No readable CSV files found under {} (skipped: {})".format(
            input_data_location, skipped_files
        )
    )

# Stack all files into one frame and apply the column names from the metadata;
# pandas raises if the number of names does not match the number of columns.
target_timeseries_df = pd.concat(li, axis=0, ignore_index=True)
target_timeseries_df.columns=metadata['columns']
target_timeseries_df.head()

In [None]:
# Preview the first rows of the combined input frame.
target_timeseries_df.head()

# Exploration

- Time range
- Total datapoints
- Unique items
- Unique forecasts
- Ability to plot timeseries of a particular forecast dimension
- Missing values for all features + target value 
- Statistics target value


### Time range

In [None]:
def explore_time_range(df):
    """Print the smallest and largest value found in the timestamp column.

    Args:
        df: DataFrame containing the column named by ``TIMESTAMP_COLUMN_KEY``.

    Note:
        ``min``/``max`` follow the ordering of the column's dtype, so a column
        holding timestamp strings is compared lexicographically.
    """
    timestamps = df[TIMESTAMP_COLUMN_KEY]
    earliest = timestamps.min()
    latest = timestamps.max()
    print("Time range: {} to {}".format(earliest, latest))

### Total datapoints count

In [None]:
def explore_total_data_points(df):
    """Print how many rows ``df`` contains.

    Args:
        df: DataFrame (or any sized object) whose length is reported.
    """
    row_count = len(df)
    report_line = "Total datapoints: {}".format(row_count)
    print(report_line)

### Unique items count

In [None]:
def explore_unique_items(df, item_id_column=None):
    """Print the number of distinct item identifiers in ``df``.

    Args:
        df: DataFrame holding the item id column.
        item_id_column: Name of the item id column. When omitted, the
            notebook-level ``ITEM_ID_COLUMN_KEY`` taken from the input
            metadata is used, so the count no longer depends on the column
            being literally called ``"item_id"``.

    Raises:
        KeyError: If the column is not present in ``df``.
    """
    if item_id_column is None:
        # Resolved at call time so the function follows the configured key.
        item_id_column = ITEM_ID_COLUMN_KEY
    print("Unique items: {}".format(len(df[item_id_column].unique())))

### Unique forecasts

In [None]:
def explore_unique_forecasts(df, item_id_column=None, forecast_dimension_columns=None):
    """Print the number of distinct (item id, forecast dimensions...) combinations.

    Every key value is converted with ``str`` before comparison, and the
    combinations are compared column by column. Joining the values into one
    ``'_'``-separated string would merge distinct combinations whose values
    themselves contain ``'_'`` (``"A_1" + "2"`` vs ``"A" + "1_2"``).

    Args:
        df: DataFrame holding the item id and forecast dimension columns.
        item_id_column: Name of the item id column. Defaults to the
            notebook-level ``ITEM_ID_COLUMN_KEY``.
        forecast_dimension_columns: Names of the additional dimension columns.
            Defaults to the notebook-level
            ``input_forecast_dimensions_column_keys``.

    Raises:
        KeyError: If one of the columns is not present in ``df``.
    """
    if item_id_column is None:
        item_id_column = ITEM_ID_COLUMN_KEY
    if forecast_dimension_columns is None:
        forecast_dimension_columns = input_forecast_dimensions_column_keys

    key_columns = [item_id_column] + list(forecast_dimension_columns)

    # Positional labels keep the helper frame valid even if a column name is
    # listed twice; `.values` sidesteps index alignment between the columns.
    key_frame = pd.DataFrame(
        {position: df[column].map(str).values for position, column in enumerate(key_columns)}
    )

    print("Unique forecasts: {}".format(len(key_frame.drop_duplicates())))

### Check missing values

In [None]:
def check_missing_values(df):
    """Print every row of ``df`` that has a missing value in at least one column.

    Args:
        df: DataFrame to inspect.
    """
    print("Missing values:")
    # Row-wise flag: True where any cell of the row is NA.
    has_gap = df.isna().any(axis=1)
    incomplete_rows = df[has_gap]
    print(incomplete_rows)

### Target variable stats

In [None]:
def target_variable_statistics(df):
    """Print min, max, mean and standard deviation of the target column.

    Args:
        df: DataFrame containing the column named by ``TARGET_VALUE_COLUMN_KEY``.
    """
    # Each entry pairs the output line with the Series method that produces
    # its value; the values are computed one at a time, right before printing.
    summaries = (
        ("Target Variable min: {}", "min"),
        ("Target Variable max: {}", "max"),
        ("Target Variable mean: {}", "mean"),
        ("Target Variable std dev: {}", "std"),
    )
    for template, statistic in summaries:
        target = df[TARGET_VALUE_COLUMN_KEY]
        print(template.format(getattr(target, statistic)()))

### Check sparsity of target variable

In [None]:
%matplotlib inline
# Histogram of the target column with 150 bins, to see how its values are
# distributed (e.g. how much mass sits in a single bin).
target_timeseries_df[TARGET_VALUE_COLUMN_KEY].hist(bins=150)

### Plot items wrt target value

In [None]:
def clip_time_period(df, start_time, end_time):
    """Restrict ``df`` to rows whose timestamp lies within the given bounds.

    Both bounds are inclusive. A falsy bound (``None``, ``""``, ``0``) means
    "no limit on that side"; with both bounds falsy the input frame is
    returned as-is.

    Args:
        df: DataFrame containing the column named by ``TIMESTAMP_COLUMN_KEY``.
        start_time: Lower bound, compared with ``>=``.
        end_time: Upper bound, compared with ``<=``.

    Returns:
        The filtered DataFrame.
    """
    clipped = df
    if start_time:
        not_too_early = clipped[TIMESTAMP_COLUMN_KEY] >= start_time
        clipped = clipped[not_too_early]
    if end_time:
        not_too_late = clipped[TIMESTAMP_COLUMN_KEY] <= end_time
        clipped = clipped[not_too_late]
    return clipped

def clip_columns(df, threshold):
    """Keep at most the first ``threshold`` columns of ``df``.

    Args:
        df: DataFrame to limit.
        threshold: Maximum number of columns to keep.

    Returns:
        ``df`` itself when it has no more than ``threshold`` columns, otherwise
        a positional slice with the leading ``threshold`` columns. A notice is
        printed when columns are dropped.
    """
    column_count = len(df.columns)
    if not column_count > threshold:
        return df
    print("Too many columns: {}. Truncating to {}".format(column_count, threshold))
    return df.iloc[:, :threshold]
    
def plot_items(df, columns, max_plots=20, start_time=None, end_time=None):
    """Plot the summed target value over time, one subplot per pivoted series.

    Args:
        df: DataFrame with the timestamp and target columns plus ``columns``.
        columns: Column name(s) whose distinct values become the series.
        max_plots: Upper limit on the number of series that are plotted; any
            further series are dropped with a printed notice.
        start_time: Optional inclusive lower timestamp bound.
        end_time: Optional inclusive upper timestamp bound.
    """
    df = clip_time_period(df, start_time, end_time)

    # The string "sum" selects the same aggregation as np.sum, without the
    # deprecation warning newer pandas emits for NumPy callables.
    new_df = pd.pivot_table(
        df,
        values=TARGET_VALUE_COLUMN_KEY,
        index=TIMESTAMP_COLUMN_KEY,
        columns=columns,
        aggfunc="sum",
    )

    new_df = clip_columns(new_df, max_plots)

    # An empty pivot (for example a time window without data) cannot be drawn.
    if new_df.empty:
        print("No data to plot for the selected items and time range")
        return

    # Size the figure by the number of series actually drawn, so a handful of
    # series is not stretched over the height reserved for `max_plots`.
    plot_count = len(new_df.columns)
    new_df.plot(subplots=True, grid=True, legend=True, figsize=(15, 1.5 * plot_count))

### Run explorations

In [None]:
def run_explorations(df):
    """Run all text explorations on ``df`` and then plot the per-item series.

    A separator line is printed between consecutive text reports; the plot
    follows the last report directly.

    Args:
        df: DataFrame with the timestamp, target, item id and forecast
            dimension columns.
    """
    separator = "------------------------"
    text_reports = (
        explore_time_range,
        explore_total_data_points,
        explore_unique_items,
        check_missing_values,
        explore_unique_forecasts,
        target_variable_statistics,
    )
    for position, report in enumerate(text_reports):
        if position > 0:
            print(separator)
        report(df)
    plot_items(df, columns=[ITEM_ID_COLUMN_KEY])

In [None]:
%%time

# Run every exploration on the full input frame; the trailing rule closes the
# text report.
run_explorations(target_timeseries_df)
print("------------------------")

In [None]:
# Build a pandas_profiling report for the whole frame; the report is the cell's
# last expression so that it is shown as the cell output.
data_exploration_profile = pandas_profiling.ProfileReport(target_timeseries_df)
data_exploration_profile

In [None]:
#if you want to save the above data explorations report to your s3 bucket as html file,
#uncomment the code below and populate the required placeholders

# file_name = ""             #string placeholder
# s3_bucket_path = ""           #string placeholder

# file_name = "{}.html".format(file_name)
# data_exploration_profile.to_file(output_file=file_name)
# import boto3
# s3_client = boto3.client('s3')
# response = s3_client.upload_file(file_name, s3_bucket_path, file_name)
# print (response)