In [None]:
import pandas as pd
import sqlite3

# Read the whole `product` table from the SQLite database into a DataFrame.
# (Swap in "amzn-products.db" to read the alternative database file.)
con = sqlite3.connect("products.db")
try:
    df = pd.read_sql_query("SELECT * from product ", con)
finally:
    # Release the connection even if the query raises, so the database
    # file is never left open by a failed cell run.
    con.close()

# Verify that result of SQL query is stored in the dataframe
print(df.head(5))

### Data Cleaning
#### Trim spaces in all columns in the dataframe

In [None]:
# Trim spaces in all columns in the dataframe

def trim_all_columns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string values (numbers, None, NaN, ...) are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells should be trimmed. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape with stripped string values.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Check the class (not the instance) so that a column
    # that happens to be named "map" cannot be mistaken for the method.
    if hasattr(type(df), "map"):
        return df.map(_strip_if_str)
    return df.applymap(_strip_if_str)


# Apply the whitespace trim to the loaded data and preview the first rows
# (printing the whole frame would flood the cell output).
df = trim_all_columns(df)
print(df.head(5))

In [None]:
def clean_Revenue_Sales_Fees_Net_data(x):
    """Normalise one raw cell so that it can later be cast to a number.

    String input has, in this order, '$' and ',' removed, 'N.A.' and '--'
    replaced by '0', and '< ' replaced by '-'. The result is still a string.
    Anything that is not a string is returned untouched.
    """
    if not isinstance(x, str):
        return x

    # Applied sequentially; the order matters because each step sees the
    # output of the previous one.
    substitutions = (
        ('$', ''),
        (',', ''),
        ('N.A.', '0'),
        ('--', '0'),
        ('< ', '-'),
    )
    cleaned = x
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

### Fixing Rank column

In [None]:
# Show the Python type of each individual value held in the Rank column.
# A mix of types here means the column has not been normalised yet.

rank_value_types = df['Rank'].apply(type)
rank_value_types

In [None]:
# Tally how many Rank values there are of each Python type.

rank_type_per_value = df['Rank'].apply(type)
rank_type_per_value.value_counts()

In [None]:
# Normalise the raw Rank strings, then cast the whole column to float
rank_cleaned = df['Rank'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Rank'] = rank_cleaned.astype('float')

In [None]:
# Re-check Rank after the conversion: every value should now report as float.

rank_type_after_cast = df['Rank'].apply(type)
rank_type_after_cast.value_counts()

In [None]:
# Display the converted Rank column
df['Rank']

### Fixing Price, Est_Monthly_Revenue, Est_Monthly_Sales, Fees, and Net columns

In [None]:
# Show the Python type of each individual value held in Est_Monthly_Revenue.
# A mix of types here means the column has not been normalised yet.

revenue_value_types = df['Est_Monthly_Revenue'].apply(type)
revenue_value_types

In [None]:
# Tally how many Est_Monthly_Revenue values there are of each Python type.

revenue_type_per_value = df['Est_Monthly_Revenue'].apply(type)
revenue_type_per_value.value_counts()

In [None]:
# Normalise the raw Price strings, then cast the whole column to float
price_cleaned = df['Price'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Price'] = price_cleaned.astype('float')


In [None]:
# Normalise the raw Est_Monthly_Revenue strings, then cast the column to float
revenue_cleaned = df['Est_Monthly_Revenue'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Est_Monthly_Revenue'] = revenue_cleaned.astype('float')


In [None]:
# Normalise the raw Est_Monthly_Sales strings, then cast the column to float
sales_cleaned = df['Est_Monthly_Sales'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Est_Monthly_Sales'] = sales_cleaned.astype('float')


In [None]:
# Normalise the raw Fees strings, then cast the whole column to float
fees_cleaned = df['Fees'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Fees'] = fees_cleaned.astype('float')


In [None]:
# Normalise the raw Net strings, then cast the whole column to float
net_cleaned = df['Net'].apply(clean_Revenue_Sales_Fees_Net_data)
df['Net'] = net_cleaned.astype('float')


In [None]:
# Re-check Est_Monthly_Revenue after the conversion: every value should now
# report as float.

revenue_type_after_cast = df['Est_Monthly_Revenue'].apply(type)
revenue_type_after_cast.value_counts()

In [None]:
# Display the converted Est_Monthly_Revenue column
df['Est_Monthly_Revenue']

In [None]:
import altair as alt
# Lift Altair's default row limit so the whole frame can be charted.
alt.data_transformers.disable_max_rows()


# Interval (click-and-drag) selection shared by the two charts below.
brush = alt.selection_interval()

# Scatter of monthly sales against monthly revenue, one circle per product.
points = alt.Chart(df).mark_circle().encode(
    x='Est_Monthly_Sales:Q',
    y='Est_Monthly_Revenue:Q',
    # Points inside the brush keep their category colour; the rest are greyed out.
    color=alt.condition(brush, 'Category:N', alt.value('lightgray')),
    tooltip=['Sellers', 'LQS', 'Reviews', 'Rank', 'Fees', 'Net', 'Est_Monthly_Sales','Est_Monthly_Revenue', 'Category', 'Product_Name']
).properties(width=400, height=200)

# Altair 5 renamed Chart.add_selection to Chart.add_params and deprecated the
# old name; use whichever method the installed version provides.
if hasattr(alt.Chart, 'add_params'):
    points = points.add_params(brush)
else:
    points = points.add_selection(brush)

# Per-category product count, restricted to the points currently brushed.
bars = alt.Chart(df).mark_bar().encode(
    y='Category:N',
    color='Category:N',
    x='count(Category):Q'
).properties(width=400, height=200).transform_filter(
    brush
)


# Stack the scatter plot above the bar chart.
plot_product_scatterchart =  points & bars

plot_product_scatterchart

In [None]:
# Bar chart of Net against LQS, coloured by product category.
product_bar_tooltip = ['Sellers', 'LQS', 'Reviews', 'Rank', 'Fees', 'Net', 'Est_Monthly_Sales','Est_Monthly_Revenue', 'Category', 'Product_Name']

product_bar_base = alt.Chart(df).mark_bar()
product_bar_encoded = product_bar_base.encode(
    x='LQS',
    y='Net:Q',
    color='Category:N',
    tooltip=product_bar_tooltip,
)
plot_product_bar = product_bar_encoded.properties(width=400, height=200)

plot_product_bar

In [None]:
# Line chart of Reviews against Rank, one line per product category.
product_line_tooltip = ['Sellers', 'LQS', 'Reviews', 'Rank', 'Fees', 'Net', 'Est_Monthly_Sales','Est_Monthly_Revenue', 'Category', 'Product_Name']

product_line_base = alt.Chart(df).mark_line()
product_line_encoded = product_line_base.encode(
    x='Rank',
    y='Reviews:Q',
    color='Category:N',
    tooltip=product_line_tooltip,
)
plot_product_line = product_line_encoded.properties(width=400, height=200)

plot_product_line

In [None]:
# Show the Python type of each value held in Date_First_Available
date_value_types = df['Date_First_Available'].apply(type)
date_value_types

In [None]:
# Bar chart of Price by the year a product first became available,
# coloured by category, with pan/zoom enabled.
price_by_year_tooltip = ['year(Date_First_Available)', 'Sellers', 'LQS', 'Reviews', 'Rank', 'Fees', 'Net', 'Est_Monthly_Sales','Est_Monthly_Revenue', 'Category', 'Product_Name']

price_by_year_bars = alt.Chart(df).mark_bar().encode(
    x='year(Date_First_Available):T',
    y='Price',
    color='Category',
    tooltip=price_by_year_tooltip,
)

price_by_year_bars.properties(height=180, width=500).interactive()

In [None]:
# Bar chart of Price by the year/quarter/month a product first became
# available, coloured by category, with pan/zoom enabled.
price_by_month_tooltip = ['yearquartermonth(Date_First_Available)', 'Sellers', 'LQS', 'Reviews', 'Rank', 'Fees', 'Net', 'Est_Monthly_Sales','Est_Monthly_Revenue', 'Category', 'Product_Name']

price_by_month_bars = alt.Chart(df).mark_bar().encode(
    x='yearquartermonth(Date_First_Available):T',
    y='Price',
    color='Category',
    tooltip=price_by_month_tooltip,
)

price_by_month_bars.properties(height=180, width=500).interactive()

In [None]:
# Dimensions of the frame as (number of rows, number of columns)
df.shape

In [None]:
# Number of missing values in each column
df.isna().sum()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) as reported by
# pandas' default describe()
df.describe()