In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

from sklearn.preprocessing import MinMaxScaler

from IPython.display import display

In [2]:
# MySQL connection settings.
# Every value can be overridden through an environment variable of the same
# name; the non-secret settings fall back to the local development defaults.
MYSQL_ADDRESS = os.environ.get('MYSQL_ADDRESS', '127.0.0.1')
MYSQL_PORT = os.environ.get('MYSQL_PORT', '3306')
MYSQL_USERNAME = os.environ.get('MYSQL_USERNAME', 'vegas')
MYSQL_DBNAME = os.environ.get('MYSQL_DBNAME', 'univers')

# The password deliberately has no in-notebook default: it is read from
# MYSQL_PASSWORD, and when that is unset the user is prompted (input hidden).
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')

# Username and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
mysql_str = 'mysql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=quote_plus(MYSQL_USERNAME),
    password=quote_plus(MYSQL_PASSWORD),
    ipaddress=MYSQL_ADDRESS,
    port=MYSQL_PORT,
    dbname=MYSQL_DBNAME,
)
engine = create_engine(mysql_str)

In [3]:
# Id of the business whose rows are loaded from user_sales_table.
appstech_labs_id = 1

# Load every row for that business; txn_date is parsed as a date and used as
# the frame's index.
df = pd.read_sql_query(
    sql=f"SELECT * FROM user_sales_table where business_id='{appstech_labs_id}'",
    con=engine,
    index_col='txn_date',
    parse_dates=['txn_date'],
)
df.head()

Unnamed: 0_level_0,Id,business_id,source,customer_name,customer_id,sales_type,gross_amount,gross_cost,margin,discount_amount,...,business_currency,business_currency_symbol,txn_currency,txn_currency_symbol,due_date,exchange_rate,active,created_on,updated_on,deleted_on
txn_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-08-31,9,1,quickbooks,Amy's Bird Sanctuary,1,INVOICE,100.0,0.0,100.0,0.0,...,USD,$,USD,$,2020-09-30,1.0,1,2020-11-13 14:17:04,2020-11-13 14:17:15,
2020-05-19,10,1,quickbooks,Bill's Windsurf Shop,2,INVOICE,175.0,0.0,175.0,0.0,...,USD,$,USD,$,2020-06-18,1.0,1,2020-11-13 14:17:04,2020-11-13 14:17:15,
2020-08-29,11,1,quickbooks,Dylan Sollfrank,6,SALE,338.0,0.0,338.0,0.0,...,USD,$,USD,$,NaT,1.0,1,2020-11-13 14:17:04,2020-11-13 14:17:15,
2020-08-22,12,1,quickbooks,Cool Cars,3,INVOICE,2194.48,28.0,2166.48,0.0,...,USD,$,USD,$,2020-09-21,1.0,1,2020-11-13 14:17:04,2020-11-13 14:17:15,
2020-08-25,13,1,quickbooks,55 Twin Lane,9,INVOICE,50.0,0.0,50.0,0.0,...,USD,$,USD,$,2020-09-24,1.0,1,2020-11-13 14:17:04,2020-11-13 14:17:15,


In [4]:
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore


def handle_outliers(df, target_feat: str, use_zscore: bool, threshold: int):
    """handle_outliers A function to handle outliers

    Two strategies are available:

    * ``use_zscore=True``: a row is kept only when the absolute z-score of
      every column is below ``threshold`` (``target_feat`` is not used).
    * ``use_zscore=False``: a row is kept only when ``target_feat`` lies
      strictly between its 5th and 95th percentiles (``threshold`` is not
      used).

    Args:
    -----
        df (dataframe): dataframe; every column must be numeric when
            ``use_zscore`` is True
        target_feat (str): target feature
        use_zscore (bool): use zscore model to handle outliers
        threshold (int): outlier threshold (absolute z-score cut-off)

    Returns:
    -------
        dataframe: the rows of ``df`` that are not considered outliers
    """
    if use_zscore:
        z = np.abs(zscore(df.to_numpy(dtype=float)))
        # A NaN z-score (e.g. from a zero-variance column) carries no outlier
        # evidence. Without this step ``NaN < threshold`` is False and every
        # row of the frame would be discarded.
        z = np.where(np.isnan(z), 0.0, z)
        return df.loc[(z < threshold).all(axis=1)]

    upper_lim = df[target_feat].quantile(0.95)
    lower_lim = df[target_feat].quantile(0.05)

    # Strict comparisons: rows sitting exactly on a percentile are dropped.
    return df[(df[target_feat] < upper_lim) & (df[target_feat] > lower_lim)]


def handle_missing_values(df, threshold: float):
    """handle_missing_values A function to handle missing values

    Columns, then rows, whose fraction of missing values is not strictly
    below ``threshold`` are dropped. If any missing value is left afterwards,
    the remaining numeric gaps are imputed with a linear regression (see
    ``generate_null_values``).

    Args:
    -----
        df (dataframe): dataframe
        threshold (float): missing value threshold, as a fraction in [0, 1]

    Returns:
    --------
        dataframe: dataframe
    """
    # Keep only the columns whose null fraction is below the threshold.
    df = df.loc[:, df.isna().mean() < threshold]

    # Same rule row-wise, evaluated on the surviving columns.
    df = df.loc[df.isna().mean(axis=1) < threshold]

    # Impute only when something is actually missing.
    if df.isna().any().any():
        df = generate_null_values(df, LinearRegression())

    return df


def generate_null_values(df, linear_reg: "LinearRegression"):
    """generate_null_values A function to predict missing values

    Each numeric column that contains missing values is predicted from the
    numeric columns that contain none. The model is fitted on the rows where
    the column is present and used to fill only the rows where it is missing.
    Non-numeric columns are left untouched, and a column is left as-is when
    there is nothing to learn from (no complete numeric column, or no observed
    value in the column).

    Args:
    -----
        df (dataframe): dataframe
        linear_reg (LinearRegression): Linear Regression model instance; it
            is re-fitted once per imputed column

    Returns:
    --------
        dataframe: ``df`` itself when nothing was imputed, otherwise a filled
            copy (the input frame is never modified)
    """
    numeric_cols = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype)
    ]
    null_counts = df[numeric_cols].isna().sum()
    na_cols = [col for col in numeric_cols if null_counts[col] > 0]
    # Predictors must be complete, otherwise predict() would receive NaN.
    predictor_cols = [col for col in numeric_cols if null_counts[col] == 0]

    if not na_cols or not predictor_cols:
        return df

    filled = df.copy()
    features = df[predictor_cols].to_numpy(dtype=float)

    for col in na_cols:
        missing = df[col].isna().to_numpy()
        observed = ~missing
        if not observed.any():
            continue

        target = df[col].to_numpy(dtype=float)[observed].reshape(-1, 1)
        linear_reg.fit(X=features[observed], y=target)

        predicted = np.asarray(linear_reg.predict(features[missing])).reshape(-1)
        # Positional boolean mask: the values land on the right rows whatever
        # the frame's index is.
        filled.loc[missing, col] = predicted

    return filled


def get_numeric_feats(df):
    """get_numeric_feats A function numeric features
    This function helps get all numeric features in our dataset

    A column qualifies when pandas reports its dtype as numeric and that
    dtype is not ``int64``. Text, datetime (any unit or timezone), timedelta
    and categorical columns are therefore never returned.

    Args:
    -----
        df (dataframe): dataframe

    Returns:
    --------
        Index: a pandas Index holding the names of the numeric features in
            our dataframe, in column order
    """
    numeric_feats = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype) and dtype != "int64"
    ]

    return pd.Index(numeric_feats)


def get_important_feats(df, numeric_feats: list, target_feat: str, threshold: float):
    """get_important_feats A function to get important features
    This function helps get important features based on the {target_feat}

    A feature is important when its correlation with ``target_feat`` (as
    computed by ``DataFrame.corr``, undefined values counted as 0) is strictly
    greater than ``threshold``. Only positive correlation is considered.

    Args:
    -----
        df (dataframe): dataframe
        numeric_feats (list): list of numeric features in df
        target_feat (str): target feature
        threshold (float): threshold importance for our dataset

    Returns:
    --------
        list: list of important features, with ``target_feat`` always last

    Raises:
    -------
        KeyError: if ``target_feat`` is not part of the correlation matrix
    """
    cormat = df[numeric_feats].corr().fillna(0)

    # Correlation of every feature with the target, looked up by label.
    target_corr = cormat[target_feat]

    imp_feats = [
        feat for feat in cormat.columns
        if feat != target_feat and target_corr[feat] > threshold
    ]

    # The target is always returned, and always in last position.
    imp_feats.append(target_feat)

    return imp_feats

def resample_dataframe_period(df, freq: str):
    """resample_dataframe_period A function to help resample dataframe according to freq

    The frame is summed per period, then periods whose values are all zero
    (for instance periods without any record) are dropped.

    Args:
    -----
        df (dataframe): dataframe indexed by a datetime-like index
        freq (str): frequency period, one of "daily", "weekly", "monthly"
            or "quarterly"

    Returns:
    -----
        dataframe: dataframe

    Raises:
    -----
        ValueError: if ``freq`` is not one of the supported periods
    """
    # Month and quarter ends are given as offset objects rather than the
    # "M"/"Q" string aliases, which newer pandas releases deprecate.
    rules = {
        "daily": "D",
        "weekly": "W",
        "monthly": pd.offsets.MonthEnd(),
        "quarterly": pd.offsets.QuarterEnd(startingMonth=12),
    }
    if freq not in rules:
        raise ValueError(
            f"Unsupported freq {freq!r}; expected one of {sorted(rules)}"
        )

    # Only the requested resampling is computed.
    df = df.resample(rules[freq])[df.columns].sum()
    df = df.loc[(df != 0).any(axis=1)]

    return df

In [8]:
def process_data(df, freq, target_feat):
    """Run the preprocessing steps on ``df`` and return the resulting frame.

    Steps, in order: missing-value handling (threshold 0.7), selection of the
    numeric features judged important for ``target_feat`` (threshold 0.2),
    resampling to ``freq``, and - only when more than 10 rows remain -
    percentile-based outlier removal on ``target_feat``.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    if isinstance(df, pd.DataFrame) is False:
        raise TypeError("Data can only be a pd.DataFrame instance")

    cleaned = handle_missing_values(df, threshold=0.7)

    candidate_cols = get_numeric_feats(df=cleaned)
    selected_cols = get_important_feats(
        df=cleaned,
        numeric_feats=candidate_cols,
        target_feat=target_feat,
        threshold=0.2,
    )

    # Aggregate the selected columns per period.
    resampled = resample_dataframe_period(df=cleaned[selected_cols], freq=freq)

    # Frames of 10 rows or fewer are returned without outlier removal.
    if len(resampled) <= 10:
        return resampled

    return handle_outliers(df=resampled, target_feat=target_feat, use_zscore=False, threshold=3)
    

def create_dataset(df, batch_size, freq, target_feat="gross_amount", enable_catch=False, catch_path=None, train_phase=False):
    """Preprocess ``df`` with ``process_data`` and display the result.

    Only ``df``, ``freq`` and ``target_feat`` are read here; ``batch_size``,
    ``enable_catch``, ``catch_path`` and ``train_phase`` are accepted but not
    used by this function. Nothing is returned.
    """
    processed = process_data(df, freq, target_feat)
    display(processed)

# Run the pipeline on the frame loaded above with daily resampling;
# create_dataset displays the processed frame.
create_dataset(df, batch_size=5, freq="daily")

Unnamed: 0_level_0,gross_cost,margin,discount_amount,tax_amount,gross_amount
txn_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-19,0.0,391.5,0.0,10.5,391.5
2020-06-19,125.0,240.0,0.0,22.0,365.0
2020-06-26,81.0,75.0,0.0,0.0,156.0
2020-07-17,125.0,661.6,0.0,44.4,786.6
2020-07-26,0.0,79.6,0.0,6.4,79.6
2020-08-09,0.0,975.0,0.0,0.0,975.0
2020-08-12,125.0,375.0,0.0,40.0,500.0
2020-08-18,0.0,80.0,0.0,0.0,80.0
2020-08-20,140.0,135.0,30.5,0.0,275.0
2020-08-25,0.0,130.0,0.0,4.0,130.0
