In [None]:
import warnings
warnings.filterwarnings('ignore')

import json
import math
import os
import pickle
from ast import literal_eval
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from keras.models import Sequential
from scipy.special import boxcox, inv_boxcox
from scipy.stats import zscore
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sqlalchemy import create_engine

%matplotlib inline
# plt.style.use('fivethirtyeight')

In [None]:
# Postgres connection settings (address, port, username, password, database).
# Every value can be overridden with an environment variable of the same name;
# the literals are fallbacks so the notebook keeps running exactly as before.
POSTGRES_ADDRESS = os.environ.get('POSTGRES_ADDRESS', 'localhost')

POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_USERNAME = os.environ.get('POSTGRES_USERNAME', 'vegas')
# SECURITY: a credential committed to a notebook should be considered leaked.
# FIXME: rotate this password, export POSTGRES_PASSWORD instead, and delete
# the literal fallback below.
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'VrichCrich99')
POSTGRES_DBNAME = os.environ.get('POSTGRES_DBNAME', 'univers')

In [None]:
# Build the SQLAlchemy connection URL from the POSTGRES_* settings.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside a credential cannot be mistaken for URL delimiters.
postgres_str = 'postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(
    username=quote_plus(POSTGRES_USERNAME),
    password=quote_plus(POSTGRES_PASSWORD),
    ipaddress=POSTGRES_ADDRESS,
    port=POSTGRES_PORT,
    dbname=POSTGRES_DBNAME,
)
# Engine shared by every query in this notebook.
cnx = create_engine(postgres_str)

<h2 style='color: red; font-size: 2em'>Fetch user data according to data count</h2>

In [None]:
appstech_labs_id = 1 # user id for fetching

In [None]:
# define a fetching function according to the appstech_labs_user data size
def fetch_data(_id, small_data_size, medium_data_size, large_data_size):
    """Load every ``user_sales_table`` row that belongs to one user.

    Parameters
    ----------
    _id : int or str
        Value matched against the ``appstech_labs_id`` column.
    small_data_size, medium_data_size, large_data_size : int
        Accepted for interface compatibility; the query does not use them.

    Returns
    -------
    pandas.DataFrame
        Query result indexed by ``txn_date`` (parsed as datetimes).
    """
    # Local import keeps this cell self-contained; sqlalchemy is already a
    # dependency of the notebook (create_engine).
    from sqlalchemy import text

    # Use the `_id` argument (the previous version silently read the global
    # `appstech_labs_id`) and pass it as a bound parameter instead of pasting
    # it into the SQL text.
    query = text("SELECT * FROM user_sales_table WHERE appstech_labs_id = :user_id")
    return pd.read_sql_query(
        query,
        cnx,
        # Bound as a string to mirror the quoted literal of the old query.
        # NOTE(review): confirm this suits the column type with the driver in use.
        params={'user_id': str(_id)},
        index_col='txn_date',
        parse_dates=['txn_date'],
    )

In [None]:
df = fetch_data(appstech_labs_id, small_data_size=530234, medium_data_size=1530234, large_data_size=2203234)
df.index = pd.to_datetime(df.index)
df.head()

<h2 style='color: red; font-size: 2em'>Feature Engineering</h2>

In [None]:
def get_all_numeric_features(df):
    """Return the names of the non-object columns of ``df``, minus the first.

    Columns whose dtype compares unequal to ``'object'`` are kept, in frame
    order, and the first of them is then skipped.
    """
    column_dtypes = df.dtypes
    is_non_object = (column_dtypes != 'object').values
    non_object_columns = column_dtypes.index[is_non_object]
    # Skip the leading non-object column.
    return non_object_columns[1:]

numeric_feats = get_all_numeric_features(df)
numeric_feats = numeric_feats[:-1]
numeric_feats

In [None]:
df.info()

In [None]:
cormat = df[numeric_feats].corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(cormat, vmax=0.9, square=True, cmap='Greens')
plt.show()

<h2 style='color: red; font-size: 2em'>Feature Importance</h2>

In [None]:
# feature importance
def feature_importance(features, df, threshold, *args, **kwargs):
    """Select features whose correlation with a target column exceeds ``threshold``.

    Parameters
    ----------
    features : sequence of str
        Columns of ``df`` used to build the correlation matrix.
    df : pandas.DataFrame
        Source data.
    threshold : float
        A feature is kept when its correlation with the target is strictly
        greater than this value.
    *args
        If the first extra positional argument is truthy, ``'qty'`` and
        ``'margin'`` are always put at the front of the result.
    **kwargs
        include : bool
            Same effect as the truthy positional argument.
        target : str, optional
            Column to correlate against. When omitted, the third-from-last
            row of the correlation matrix is used, as before.

    Returns
    -------
    list
        Selected feature names, without duplicates, forced extras first.

    Raises
    ------
    IndexError
        If no ``target`` is given and the correlation matrix has fewer than
        three rows.
    """
    features = list(features)
    cormat = df[features].corr()

    important_feature = []
    include_extras = bool(kwargs.get('include')) or bool(args and args[0])
    if include_extras:
        important_feature.extend(['qty', 'margin'])

    target = kwargs.get('target')
    if target is None:
        # Explicit positional access: plain `series[-3]` on a label index
        # depends on a deprecated pandas fallback.
        target_scores = cormat.iloc[-3]
    else:
        target_scores = cormat.loc[target]

    for feat in features:
        # Skip names already present so a forced extra that is also strongly
        # correlated does not appear twice (duplicate column labels break
        # later `df[feat]` lookups).
        if target_scores[feat] > threshold and feat not in important_feature:
            important_feature.append(feat)

    return important_feature


In [None]:
imp_features = feature_importance(numeric_feats, df, 0.69, False)
df_feats = df.copy()
df_feats = df_feats[imp_features]
df_feats.head()

In [None]:
cormat = df_feats[imp_features].corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(cormat, vmax=0.9, square=True, cmap='Greens')
plt.show()

<h2 style='color: red; font-size: 2em'>Check For Null Values</h2>

In [None]:
def generate_null_values(df, labels, target, model):
    """Predict the missing values of ``target`` from the other ``labels`` columns.

    ``model`` is fitted on the rows where every column in ``labels`` is
    present, then used to predict ``target`` for the rows where ``target`` is
    null but all predictor columns are available.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    labels : sequence of str
        Columns to use, including ``target``.
    target : str
        Column whose nulls should be predicted.
    model : object
        Estimator exposing ``fit(X=..., y=...)`` and ``predict(X)``.

    Returns
    -------
    pandas.Series
        Copy of ``df[target]`` carrying the same index as ``df``. Nulls are
        replaced where a prediction was possible; rows whose predictors are
        also missing are left null.
    """
    frame = df[list(labels)]
    predictors = [column for column in frame.columns if column != target]
    filled = frame[target].copy()

    # The previous version called dropna() before imputing, which removed
    # every row it was supposed to fill.
    complete_rows = frame.dropna()
    target_missing = frame[target].isnull().values
    predictors_present = frame[predictors].notnull().all(axis=1).values
    fillable = target_missing & predictors_present

    if not predictors or complete_rows.empty or not fillable.any():
        return filled

    train_data_x = complete_rows[predictors].values
    train_data_y = complete_rows[target].values.reshape(-1, 1)
    model.fit(X=train_data_x, y=train_data_y)

    yhat = np.asarray(model.predict(frame.loc[fillable, predictors].values)).reshape(-1)
    # Boolean-mask assignment is positional, so predictions land on the right
    # rows even when the index contains duplicate labels.
    filled.loc[fillable] = yhat
    return filled
    
    
def _fill_with_mode(series):
    """Fill nulls with the most frequent value (unchanged if there is none)."""
    modes = series.mode()
    if modes.empty:
        return series
    # mode() returns a Series; passing it whole to fillna aligns on index and
    # fills nothing, so take the scalar.
    return series.fillna(modes.iloc[0])


def _fill_with_random_statistic(series):
    """Fill nulls with a randomly chosen mean / median / mode (mode only for non-numeric data)."""
    if not pd.api.types.is_numeric_dtype(series):
        return _fill_with_mode(series)
    index = np.random.choice([0, 1, 2], p=[0.34, 0.33, 0.33])
    if index == 0:
        return series.fillna(series.mean())
    if index == 1:
        return series.fillna(series.median())
    return _fill_with_mode(series)


def handle_missing_value(df, features, *args, **kwargs):
    """Impute or drop ``features`` depending on the share of nulls in each.

    Strategy per column (share of null rows):

    * below 10 %  -> mean for numeric data, mode otherwise
    * 10 % - 20 % -> randomly chosen mean / median / mode
    * 20 % - 40 % -> model prediction when requested, else the random
      statistic
    * 40 % and up -> the column is dropped

    Columns without nulls are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; a copy is modified and returned.
    features : sequence of str
        Columns to inspect.
    *args
        A truthy first extra positional argument enables model imputation.
    **kwargs
        use_model : bool
            Same effect as the truthy positional argument.
        drop_column : bool
            When truthy, simply return ``df`` without ``features``.

    Returns
    -------
    pandas.DataFrame
    """
    if kwargs.get('drop_column'):
        return df.drop(features, axis=1)

    use_model = bool(kwargs.get('use_model')) or bool(args and args[0])
    features = list(features)
    df = df.copy()  # never mutate the caller's frame

    for feat in features:
        column = df[feat]
        null_count = int(column.isnull().sum())
        if null_count == 0:
            continue

        n_rows = len(column)
        ten_pct = int(n_rows * 10 / 100)
        twenty_pct = int(n_rows * 20 / 100)
        forty_pct = int(n_rows * 40 / 100)

        if null_count >= forty_pct:  # worst case scenario
            df = df.drop(feat, axis=1)

        elif null_count >= twenty_pct:  # predict missing values if selected else random generate
            if use_model:
                # Only pass columns that are still present; earlier
                # iterations may have dropped some.
                remaining = [name for name in features if name in df.columns]
                generate_null = generate_null_values(df, remaining, feat, LinearRegression())
                if len(generate_null) == len(df) and generate_null.index.equals(df.index):
                    # Row-for-row result: assign positionally so duplicate
                    # index labels cannot overwrite each other.
                    df[feat] = generate_null.values
                else:
                    df[feat] = generate_null[~generate_null.index.duplicated()]
            else:
                df[feat] = _fill_with_random_statistic(column)

        elif null_count >= ten_pct:  # random generate missing values
            df[feat] = _fill_with_random_statistic(column)

        else:  # few missing values: mean (numeric) or mode (other)
            if pd.api.types.is_numeric_dtype(column):
                df[feat] = column.fillna(column.mean())
            else:
                df[feat] = _fill_with_mode(column)

    return df


In [None]:
df_feats = handle_missing_value(df_feats, imp_features, True)
df_feats.isnull().sum()

<h2 style='color: red; font-size: 2em'>Target Engineering</h2>

In [None]:
# handle outliers
def handleOutliers(df, threshold):
    """Keep only the rows whose absolute z-score is below ``threshold`` in every column."""
    abs_scores = np.abs(zscore(df))
    row_is_inlier = (abs_scores < threshold).all(axis=1)
    return df[row_is_inlier]

In [None]:
df_feats_outliers = handleOutliers(df_feats, 3)
print(df_feats_outliers.shape)
df_feats_outliers.head()

In [None]:
sns.distplot(df_feats_outliers['gross_amount'])
plt.show()
plt.hist(df_feats_outliers['gross_amount'])
plt.show()

In [None]:
df_feats_outliers.head()

In [None]:
new_df = df_feats_outliers.copy()
new_df = new_df

new_df = new_df.rename_axis('ds')
new_df = new_df.rename(columns={'gross_amount': 'y'})
new_df = new_df.drop(['sales_tax_amount', 'margin'], axis=1)
new_df = new_df.reset_index()
new_df.head()

In [None]:
new_df.shape

In [None]:
fb_train = new_df[:20]
fb_test = new_df[len(fb_train):]

print(fb_train.shape, fb_test.shape)

In [None]:
from fbprophet import Prophet

In [None]:
m = Prophet()
m.fit(fb_train)
future = m.make_future_dataframe(periods=len(fb_test))
forecast = m.predict(future)

In [None]:
# print(fb_test)
# forecast['yhat'][-len(fb_test):]

In [None]:
r2_score(fb_test['y'].values, forecast['yhat'][-len(fb_test):].values)

In [None]:
df_holidays_events = pd.read_csv("data/datasets_holidays_events.csv")
df_holidays_events.head()

In [None]:
df_holidays_events.shape

In [None]:
holidays = df_holidays_events[df_holidays_events['transferred'] == False][['description', 'date']]
holidays.columns = ['holiday', 'ds']
holidays.head()

In [None]:
m = Prophet(holidays=holidays)
# m.add_seasonality(name='monthyly', period=3, fourier_order=1)
m.fit(fb_train)
future = m.make_future_dataframe(periods=len(fb_test), freq="D", include_history=True)
forecast = m.predict(future)

In [None]:
r2_score(fb_test['y'].values, forecast['yhat'][-len(fb_test):].values)

In [None]:
# print(fb_test)
# forecast['yhat'][-len(fb_test):]

In [None]:
plt.plot(fb_test['y'].values)
plt.plot(forecast['yhat'][-len(fb_test):].values)
plt.show()

In [None]:
LAMBDA = 2.5 # lambda argument of the (currently commented-out) Box-Cox "stats_skew" transform

def normalize_value(df, features, norm_type, boxcox):
    """Apply the transform named by ``norm_type`` to ``df[features]``.

    ``"log1p_skew"`` yields ``log(1 + x)`` and ``"sqrt_skew"`` yields the
    square root; any other name yields ``None``. ``boxcox`` is accepted but
    unused while the Box-Cox option is disabled.
    """
    selected = df[features]
    log_scaled = np.log1p(selected)   # undone with expm1
    root_scaled = np.sqrt(selected)   # undone by squaring
    if norm_type == "log1p_skew":
        return log_scaled
    if norm_type == "sqrt_skew":
        return root_scaled
    return None


def inverser_normalize_value(df, features, norm_type, *args, **kwargs):
    """Undo the transform applied by ``normalize_value`` on ``df[features]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data in the transformed scale.
    features : list of str
        Columns to convert back.
    norm_type : str
        ``"log1p_skew"`` or ``"sqrt_skew"``; any other value returns ``None``.

    Returns
    -------
    pandas.DataFrame or None
        Values in the original scale.
    """
    values = df[features]
    if norm_type == "log1p_skew":
        # The inverse of log1p is expm1; plain exp would return x + 1.
        return np.expm1(values)
    if norm_type == "sqrt_skew":
        # The inverse of the square root is squaring.
        return values * values
    return None

In [None]:
# data normalization: pick the transform that best removes skew, then apply it

def normalize_data(df, features, *arg, **kwargs):
    """Choose between ``log1p`` and ``sqrt`` for ``features`` and apply the winner.

    Each candidate transform is applied to every feature. A candidate is
    discarded if it yields any NaN (for example the square root of a negative
    value, or a null already present in the data) or if its skew cannot be
    computed. Among the remaining candidates, the one with the smallest total
    absolute skew across the features is selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    features : sequence of str
        Columns to normalise.

    Returns
    -------
    tuple
        ``(norm_type, transformed)`` where ``norm_type`` is ``"log1p_skew"``
        or ``"sqrt_skew"`` and ``transformed`` is the transformed
        ``df[features]``. When no candidate is usable the result is
        ``(None, df)`` with ``df`` returned untouched.
    """
    transforms = {"log1p_skew": np.log1p, "sqrt_skew": np.sqrt}
    features = list(features)
    if not features:
        return None, df

    total_abs_skew = {}
    for name, transform in transforms.items():
        total = 0.0
        usable = True
        for feat in features:
            transformed = transform(df[feat])
            if transformed.isna().any():
                usable = False
                break
            total += abs(transformed.skew())
        if usable and not np.isnan(total):
            total_abs_skew[name] = total

    if not total_abs_skew:
        return None, df

    # Compare the skew values themselves; the earlier code took min() over the
    # transform *names*, which is an alphabetical comparison.
    NORM_TYPE = min(total_abs_skew, key=total_abs_skew.get)

    norm_val = transforms[NORM_TYPE](df[features])
    return NORM_TYPE, norm_val


In [None]:
NORM_TYPE, df_feats_norm = normalize_data(df_feats_outliers, ['gross_amount'])
print(NORM_TYPE)
df_feats_norm.head()

In [None]:
# house_price = pd.read_csv('data/train.csv')
# print(house_price.shape)
# house_price = handle_missing_value(house_price, house_price.columns, True)
# house_price_outliers = handleOutliers(house_price[house_price.dtypes[house_price.dtypes != "object"].index], 3)
# NORM_TYPE_1, house_pricing_norm = normalize_data(house_price_outliers, house_price_outliers.columns) # global variable for NORM_TYPE ->>
# print(NORM_TYPE_1)
# house_pricing_norm.head()

In [None]:
# horizontally stack columns
df_feats_norm = df_feats_norm.set_index(df_feats_norm.index).resample("D")[df_feats_norm.columns].sum()
series = df_feats_norm.values

In [None]:
series.shape

In [None]:
train, test = series[:-20], series[-4:]

In [None]:
# make a forecast
def forecast(model, history, n_input):
    """Predict from the last ``n_input`` rows of ``history``.

    ``history`` is converted to a 2-D array, its final ``n_input`` rows are
    reshaped to ``(1, rows, columns)`` and handed to ``model.predict``; the
    first (and only) element of the returned batch is the result.
    """
    observations = np.array(history)
    # most recent window of observations
    window = observations[-n_input:, :]
    n_rows, n_cols = window.shape[0], window.shape[1]
    # add the leading batch axis expected by the model
    batch = window.reshape((1, n_rows, n_cols))
    batch_prediction = model.predict(batch, verbose=0)
    # only the single sample's forecast is of interest
    return batch_prediction[0]
 
# evaluate a single model
def evaluate_model(train, test, n_input):
    """Walk-forward prediction over ``test`` using the notebook-level ``model``.

    Before each test row is revealed, a forecast is made from the running
    history (initially the rows of ``train``); the true test row is then
    appended to the history. Returns the stacked forecasts as an array.
    """
    running_history = list(train)
    collected = []
    step = 0
    while step < len(test):
        # forecast first, then reveal the real observation
        collected.append(forecast(model, running_history, n_input))
        running_history.append(test[step, :])
        step += 1

    return np.array(collected)

In [None]:
def to_supervised(dataset, n_input, n_out):
    """Slice ``dataset`` into sliding input windows and their following targets.

    For every start position whose input window (``n_input`` rows, all
    columns) and output window (the next ``n_out`` rows, last column only)
    both fit inside ``dataset``, one ``(X, y)`` pair is produced. Windows
    advance one row at a time. Returns ``(X, y)`` as arrays.
    """
    total_rows = len(dataset)
    span = n_input + n_out
    valid_starts = [start for start in range(total_rows) if start + span <= total_rows]

    inputs = [dataset[start: start + n_input, :] for start in valid_starts]
    targets = [dataset[start + n_input: start + span, -1] for start in valid_starts]

    return np.array(inputs), np.array(targets)

In [None]:
train_x, train_y = to_supervised(train, 1, 1)
train_y = train_y.reshape(-1, 1)
train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))

print(train_x.shape, train_y.shape)

In [None]:
# define model
model = Sequential()
model.add(LSTM(52, activation='relu', input_shape=(train_x.shape[1], train_x.shape[2])))
model.add(RepeatVector(train_y.shape[1]))

model.add(TimeDistributed(Dense(10, activation='relu')))
model.add(TimeDistributed(Dense(train_y.shape[1])))

model.compile(loss='mse', optimizer='adam')

# model.add(LSTM(50, activation='relu', input_shape=(train_x.shape[1], train_x.shape[2])))
# model.add(Dense(10))
# model.add(Dense(train_y.shape[1]))
# model.compile(optimizer='adam', loss='mse')

# fit network
model.fit(train_x, train_y, epochs=200, batch_size=17, verbose=1)

In [None]:
# Build supervised windows from the hold-out data (same 1-in / 1-out shape as training).
test_x, test_y = to_supervised(test, 1, 1)
test_y = test_y.reshape(-1, 1)
test_y = test_y.reshape((test_y.shape[0], test_y.shape[1], 1))

# Show the shapes of the test windows just built (this used to print the
# training shapes again).
print(test_x.shape, test_y.shape)

# Walk-forward predictions, flattened to one row per test step.
predictions = evaluate_model(train, test, 1)
predictions = predictions.reshape(predictions.shape[0] * predictions.shape[1], predictions.shape[2])

r2_score(np.array(test[:, -1]), predictions)

In [None]:
for i in range(len(test)):
    print(np.array(test[i, -1]), predictions[i][0])

In [None]:
from statsmodels.tsa.arima_model import ARIMA, ARMAResults

In [None]:
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a list."""
    return [
        dataset[position] - dataset[position - interval]
        for position in range(interval, len(dataset))
    ]

def inverse_differece(history, yhat, interval=1):
    """Undo a lag-``interval`` difference by adding back the matching past value."""
    reference = history[-interval]
    return yhat + reference

In [None]:
train_len = int(len(series) * 0.97)
train, test = series[:train_len], series[train_len:]

print(train.shape, test.shape)

In [None]:
# history = [x for x in train[:, -1]]

In [None]:
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order, train_fraction=0.79):
    """Walk-forward evaluation of one ARIMA order on the tail of ``X``.

    The first ``train_fraction`` of ``X`` seeds the history. For every
    remaining observation a new ARIMA model is fitted on the history, a
    one-step forecast is stored, and the true observation is appended to the
    history. The R^2 score is printed and predictions are plotted against the
    actual values.

    Parameters
    ----------
    X : sequence of float
        Univariate series in time order.
    arima_order : tuple
        ``(p, d, q)`` order passed to ``ARIMA``.
    train_fraction : float, optional
        Share of ``X`` used as initial history (default 0.79).

    Returns
    -------
    numpy.ndarray
        One forecast per held-out observation. Note this is the prediction
        array, not an error score.
    """
    # prepare training dataset
    train_size = int(len(X) * train_fraction)
    train, test = X[0:train_size], X[train_size:]
    history = list(train)

    # make predictions, revealing one real observation after each forecast
    predictions = []
    for observed in test:
        model_fit = ARIMA(history, order=arima_order).fit(disp=0, solver='lbfgs')
        predictions.append(model_fit.forecast()[0])
        history.append(observed)

    predictions = np.array(predictions)
    test = np.array(test)

    print(r2_score(test, predictions))

    plt.plot(predictions, color='red')
    plt.plot(test)
    plt.show()

    return predictions

In [None]:
prediction = evaluate_arima_model(series[:, -1], (4, 1, 0))

In [None]:
test

In [None]:
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and report the one with the lowest MSE.

    Parameters
    ----------
    dataset : numpy.ndarray
        Univariate series in time order.
    p_values, d_values, q_values : iterable of int
        Candidate values for each component of the ARIMA order.

    Returns
    -------
    tuple
        ``(best_order, best_mse)``; ``(None, inf)`` if every order failed.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    # evaluate_arima_model returns the forecasts for the tail
                    # of the series, so the score is computed here against
                    # that same tail.
                    predictions = np.asarray(evaluate_arima_model(dataset, order)).reshape(-1)
                    actual = dataset[len(dataset) - len(predictions):]
                    mse = mean_squared_error(actual, predictions)
                except Exception as exc:
                    # Some orders fail to fit; report and move on instead of
                    # hiding the reason.
                    print('ARIMA%s failed: %s' % (order, exc))
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
                print('ARIMA%s MSE=%.3f' % (order, mse))
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
    return best_cfg, best_score

In [None]:
import warnings
from time import time
warnings.filterwarnings('ignore')

# solvers
solvers = ['lbfgs', 'powell']

# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
# evaluate_models(series[:, -1], p_values, d_values, q_values)

In [None]:
4, 1, 0

In [None]:
history = [x for x in train[:, -1]]

In [None]:
# `prediction` holds the forecasts evaluate_arima_model made for the tail of
# `series[:, -1]`, so compare it with that same tail. The notebook-level
# `test` comes from a different split and does not line up with it.
predicted = np.asarray(prediction).reshape(-1)
actual = series[len(series) - len(predicted):, -1]

mse = mean_squared_error(actual, predicted)
rmse = math.sqrt(mse)
print("RMSE: %.3f" % rmse)
print(r2_score(actual, predicted))
plt.plot(actual)
plt.plot(predicted, color="red")
plt.show()

In [None]:
# Residuals of the ARIMA forecasts against the observations they were made
# for, i.e. the tail of the series that evaluate_arima_model held out.
predicted = np.asarray(prediction).reshape(-1)
actual = series[len(series) - len(predicted):, -1]
residuals = pd.DataFrame(actual - predicted)

residuals.head()

In [None]:
residuals.describe()

In [None]:
plt.figure()
plt.subplot(211)
residuals.hist(ax=plt.gca())
plt.subplot(212)
residuals.plot(kind='kde', ax=plt.gca())
plt.show()

In [None]:
# `model_fit` is local to evaluate_arima_model and does not exist at notebook
# level, so fit the (4, 1, 0) model used above on `history` before asking for
# the next-step forecast.
# NOTE(review): confirm `history` (the training tail built earlier) is the
# intended fitting data.
model_fit = ARIMA(history, order=(4, 1, 0)).fit(disp=0, solver='lbfgs')
model_fit.forecast()[0]