In [None]:
# Do not modify this cell
# NOTE(review): "<source_config>" and "<metadata>" look like template placeholders that are
# presumably replaced with base64-encoded JSON before the notebook runs — confirm with the generator.
from base64 import b64decode
import json
# Each payload is base64-decoded to ASCII text and then parsed as JSON.
source_config = json.loads(b64decode("<source_config>".encode("ascii")).decode("ascii"))
metadata = json.loads(b64decode("<metadata>".encode("ascii")).decode("ascii"))
print("Source Config: {}".format(source_config))
print("Input Tables MetaData: {}".format(metadata))
# Make pandas_profiling importable. If the first import fails, the conda commands below update
# the amazonei_tensorflow_p36 environment, install pandas-profiling and imagehash from
# conda-forge, and update ipywidgets.
# NOTE(review): the bare `except:` triggers the install on any error, not only a missing package.
try:
    import pandas_profiling
except:
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 --all -y
    !sudo /home/ec2-user/anaconda3/bin/conda install -c conda-forge -n amazonei_tensorflow_p36 pandas-profiling imagehash -y
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 ipywidgets -y
finally:
    # Runs on both paths, so a still-missing package raises here.
    import pandas_profiling

In [None]:
import time
import pandas as pd
# Raise both display limits to 500 in one call; set_option accepts
# alternating option-name / value pairs.
pd.set_option(
    'display.max_columns', 500,
    'display.max_rows', 500,
)

## Variables

In [None]:
# Location taken from the decoded source config; the "Read Data" section lists the
# objects under it and reads each one as CSV.
input_data_location = source_config['input_s3_dir']

In [None]:
def column_description(df):
    """Summarise every column of ``df`` in a one-row-per-column table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. It may mix numeric and non-numeric columns and
        may contain fewer than three rows.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing``, ``Uniques``, ``Mean``, ``STD``, ``Min``, ``Max``,
        ``First Value``, ``Second Value`` and ``Third Value``; numeric cells
        are rounded to 3 decimals. ``Mean``/``STD`` are NaN for non-numeric
        columns, ``Min``/``Max`` are NaN when a column has no comparable
        values, and the sample-value columns are NaN for rows that do not
        exist.

    Also prints the frame's shape and the elapsed wall-clock time.
    """
    start_time = time.time()
    print(f"Dataset Shape: {df.shape}")

    not_available = float('nan')
    # Walk the columns by position so the per-column statistics always line
    # up with the summary rows, whatever the column labels are.
    columns = [df.iloc[:, position] for position in range(df.shape[1])]

    def _numeric_stat(column, stat):
        # Frame-wide mean()/std() either raise on text columns or silently
        # drop them (depending on the pandas version), which breaks the
        # row alignment; compute per column and blank out non-numeric ones.
        if pd.api.types.is_numeric_dtype(column):
            return getattr(column, stat)()
        return not_available

    def _extreme(column, stat):
        # Missing values are dropped first so text columns containing
        # NaN/None can still be compared; columns holding mutually
        # incomparable objects fall back to NaN instead of raising.
        present = column.dropna()
        if present.empty:
            return not_available
        try:
            return getattr(present, stat)()
        except TypeError:
            return not_available

    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isna().sum().values
    summary['Uniques'] = df.nunique().values
    summary['Mean'] = [_numeric_stat(column, 'mean') for column in columns]
    summary['STD'] = [_numeric_stat(column, 'std') for column in columns]
    summary['Min'] = [_extreme(column, 'min') for column in columns]
    summary['Max'] = [_extreme(column, 'max') for column in columns]

    # Sample rows: frames shorter than three rows get NaN instead of an IndexError.
    for label, position in (('First Value', 0), ('Second Value', 1), ('Third Value', 2)):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = not_available

    print(f'Completed in {time.time()-start_time} seconds..')
    return summary.round(3)

In [None]:
def seperate_categorical_cols(df, threshold):
    """Split the object-dtype columns of ``df`` by cardinality.

    Prints each object column's name with its number of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns are inspected.
    threshold : int
        Largest distinct-value count that still lands in the first list.

    Returns
    -------
    tuple of (list, list)
        Column names with at most ``threshold`` distinct values, followed by
        the column names with more than ``threshold`` distinct values.
    """
    low_cardinality, high_cardinality = [], []
    for name in df.select_dtypes(include='object').columns:
        cardinality = df[name].nunique()
        print(name, cardinality)
        bucket = low_cardinality if cardinality <= threshold else high_cardinality
        bucket.append(name)
    return low_cardinality, high_cardinality

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Switch matplotlib to its 'fivethirtyeight' style sheet for the figures drawn below.
plt.style.use('fivethirtyeight')

In [None]:
"""
Plot columns wrt Target columns
"""
def plot_columns(df, plot_cols, target_col='success'):
    %matplotlib inline
    
    num_cols = len(plot_cols)
    if num_cols > 3:
        print('Max 3 columns supported till now!!')
        return
    
    if num_cols == 1:
        col_name = plot_cols[0]
        df_array = [df[df[target_col]==success_val][col_name] for success_val in df[target_col].unique()]
        plt.hist(df_array, stacked=True, label=df[target_col].unique())
        plt.legend()
    if num_cols == 2:
        df.plot.scatter(x=plot_cols[0], y=plot_cols[1], c=target_col, colormap='viridis')
    if num_cols == 3:
        fig = px.scatter_3d(
            df,
            x=plot_cols[0],
            y=plot_cols[1],
            z=plot_cols[2],
            color=target_col
        )
        fig.show()
    

## Read Data

In [None]:
import s3fs
fs = s3fs.S3FileSystem()

# Read every object under the input location as CSV. Loading stays best-effort:
# objects that cannot be parsed are skipped, but they are recorded and reported
# instead of being dropped silently.
li = []
unreadable_files = []
for file in fs.ls(input_data_location):
    try:
        li.append(pd.read_csv("s3://{}".format(file)))
    except Exception:
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel during a long load still works.
        unreadable_files.append(file)

if unreadable_files:
    print('Skipped {} unreadable file(s): {}'.format(len(unreadable_files), unreadable_files))
if not li:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError('No readable CSV files found under {}'.format(input_data_location))

df = pd.concat(li, axis=0, ignore_index=True)
df.head()

## Explorations

In [None]:
%%time
# Per-column summary of `df`; the %%time cell magic reports how long the cell took.
column_description(df)

In [None]:
df = px.data.iris()

In [None]:
plot_columns(df, ['sepal_length'], 'species_id')

In [None]:
plot_columns(df, ['sepal_length', 'sepal_width'], 'species_id')

In [None]:
plot_columns(df, ['sepal_length', 'sepal_width', 'petal_width'], 'species')

In [None]:
# Build the pandas-profiling report for `df`. Leaving the report object as the
# cell's last expression makes the notebook render it.
data_exploration_profile = pandas_profiling.ProfileReport(df)
data_exploration_profile

In [None]:
# To save the data exploration report above to your S3 bucket as an HTML file,
# uncomment the code below and fill in the placeholders.

# file_name = ""             #string placeholder
# s3_bucket_path = ""           #string placeholder

# file_name = "{}.html".format(file_name)
# data_exploration_profile.to_file(output_file=file_name)
# import boto3
# s3_client = boto3.client('s3')
# response = s3_client.upload_file(file_name, s3_bucket_path, file_name)
# print (response)