In [23]:
# Import the libraries for the data quality step
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import interactive
from ipywidgets import Layout, FileUpload
from IPython.display import Markdown, display
import io
import random
import seaborn as sns

def printmd(string):
    display(Markdown(string))


# Data collection - upload from computer

In [24]:
# ----- For testing purposes ------
file_path = "./OA2_PM25_2013.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,OA11CD,LAD11NM,PM252013me
0,E00024024,Westminster,17.953073
1,E00023833,Westminster,18.183844
2,E00023830,Westminster,18.732058
3,E00023831,Westminster,17.909357
4,E00024021,Westminster,17.185809


In [None]:
# Shows the upload widget
upload = FileUpload(accept='.csv')
upload


In [7]:
try:
    uploaded_filename = next(iter(upload.value))
    content = upload.value[uploaded_filename]['content']
    data = pd.read_csv(io.BytesIO(content), header=0, escapechar='\\', encoding= 'unicode_escape')
except StopIteration:
    printmd("## Please enter a valid file.**")

NameError: name 'upload' is not defined

# Values distribution for a specific column

In [15]:
# Unique values for each column
df_features = pd.DataFrame(index = data.columns)
df_features['Total_Values'] = list(map(lambda x: data[x].count(), data.columns))
df_features['Unique_Values'] = list(map(lambda x: data[x].value_counts().count(), data.columns))
df_features['Most_Frequent_Value'] = list(map(lambda x: data[x].value_counts().sort_values(ascending = False).index[0], data.columns))
df_features['Most_Frequent_Value_Count'] = list(map(lambda x: data[x].value_counts().sort_values(ascending = False).iloc[0], data.columns))
df_features['Type_Data'] = list(map(lambda x: data[x].dtype, data.columns))

def compute_values_dist_column(column_name):
    plt.figure(figsize=(20,5))
    if df_features['Unique_Values'][column_name] > 100:
         fig = px.histogram(data, x=column_name,
            barmode="group",
            title="Values distribution for column " + column_name)
    else:
         fig = px.histogram(data, y=column_name,
            barmode="group",
            title="Values distribution for column " + column_name)
    fig.show()

w = widgets.Dropdown(options = data.columns, description="Column: ", value = None)
def on_change(change):
    plt.clf()
    if change['type'] == 'change' and change['name'] == 'value':
        print(" ---- Values distribution histogram for %s:" % change['new'])
        compute_values_dist_column(change['new'])
        
w.observe(on_change)

display(w)

Dropdown(description='Column: ', options=('OA11CD', 'LAD11NM', 'PM252013me'), value=None)

 ---- Values distribution histogram for PM252013me:


<Figure size 432x288 with 0 Axes>

<Figure size 1440x360 with 0 Axes>

 ---- Values distribution histogram for LAD11NM:


<Figure size 432x288 with 0 Axes>

<Figure size 1440x360 with 0 Axes>

 ---- Values distribution histogram for OA11CD:


<Figure size 432x288 with 0 Axes>

<Figure size 1440x360 with 0 Axes>

#  Unique values for a specific column

In [25]:
def get_unique_values(req_data, req_column):
    return pd.Series(req_data[req_column].value_counts())

def unique_values_dist_column(unique_values, column_name):
    try:
        plt.figure(figsize=(20,5))
        fig = px.bar(unique_values, x=column_name,
                           color_discrete_sequence =['#d595e8'],
                           barmode="group",
                           title="Unique values for column " + column_name)
        fig.show()
    except KeyError:
        printmd("## Please select a column.")
    except IndexError:
        printmd("## Please select a column.")
        
def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        unique_values = get_unique_values(data, change['new']).to_frame()
        unique_values.iloc[:, ::-1]
        unique_values_dist_column(unique_values, change['new'])

w = widgets.Dropdown(options = data.columns, description="Column: ", value = None)
w.observe(on_change)

display(w)

Dropdown(description='Column: ', options=('OA11CD', 'LAD11NM', 'PM252013me'), value=None)

           OA11CD
E00020319       2
E00020309       2
E00020476       2
E00019886       2
E00020372       2
...           ...
E00004296       1
E00010181       1
E00176527       1
E00003590       1
E00002555       1

[25053 rows x 1 columns]


<Figure size 1440x360 with 0 Axes>

                        LAD11NM
Southwark                  1786
Croydon                    1132
Barnet                     1036
Bromley                    1020
Wandsworth                  982
Lambeth                     966
Ealing                      956
Enfield                     905
Lewisham                    887
Brent                       829
Newham                      810
Greenwich                   809
Hillingdon                  789
Westminster                 783
Redbridge                   776
Havering                    754
Haringey                    753
Camden                      749
Hackney                     749
Tower Hamlets               748
Waltham Forest              732
Bexley                      728
Hounslow                    714
Islington                   697
Merton                      643
Harrow                      642
Kensington and Chelsea      631
Hammersmith and Fulham      625
Sutton                      616
Richmond upon Thames        615
Barking 

<Figure size 1440x360 with 0 Axes>