fitbit_exploration

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# JSON API
import requests
import json

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from fbprophet import Prophet
import itertools as it
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from statsmodels.tsa.ar_model import AR

from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics

import math


%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from prepare import data_prepped
from prepare import missing_values_col

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv('test_fitbit.csv')
#df = pd.read_csv("saas.csv")

In [3]:
df.tail()

Unnamed: 0,date,calories_burned,steps,distance,floors,minutes_sedentary,minutes_lightly_active,minutes_fairly_active,minutes_very_active,activity_calories,month_set
220,2018-12-02,3957,13257,6.23,195,725,265,13,70,2305,8
221,2018-12-03,3660,12915,6.12,6,782,219,40,51,1993,8
222,2018-12-04,3384,13763,6.49,13,608,199,11,67,1663,8
223,2018-12-05,3670,13865,6.52,12,739,200,12,69,1968,8
224,2018-12-06,3669,14774,6.96,9,647,198,4,78,1962,8


In [None]:
def missing_values_col(df):
    """
    Summarise, column by column, how much of ``df`` is missing.

    Three flavours of "missing" are counted separately:
      * real nulls, as reported by ``df.isnull()``
      * blank strings: ``' '`` or ``''``
      * the literal strings ``'nan'`` or ``'NaN'``

    Each count is also expressed as a percentage of the number of rows.

    Returns a DataFrame indexed by column name with the columns
    num_missing, missing_percentage, num_empty, empty_percentage,
    nan_count and nan_percentage.
    """
    total_rows = df.shape[0]

    def as_percent(counts):
        # Share of all rows, on a 0-100 scale.
        return (counts / total_rows) * 100

    def count_literals(first, second):
        # Per-column number of cells equal to either literal.
        return pd.Series(((df == first) | (df == second)).sum())

    nulls = df.isnull().sum()
    blanks = count_literals(' ', '')
    nan_strings = count_literals('nan', 'NaN')

    summary = {
        'num_missing': nulls,
        'missing_percentage': as_percent(nulls),
        'num_empty': blanks,
        'empty_percentage': as_percent(blanks),
        'nan_count': nan_strings,
        'nan_percentage': as_percent(nan_strings),
    }
    return pd.DataFrame(summary)

missing_values_col(df)

In [None]:
def peekatdata(df):
    """
    Print a plain-text overview of ``df`` to stdout.

    Sections, in order: shape, column labels, ``info()`` report, null count
    per column, ``describe()`` statistics, first five rows, last five rows.
    Nothing is returned.
    """
    print("\n \n SHAPE:")
    print(df.shape)

    print("\n \n COLS:")
    print(df.columns)

    print("\n \n INFO:")
    # info() writes its report itself and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()

    print("\n \n Missing Values:")
    print(df.isnull().sum())

    print("\n \n DESCRIBE:")
    print(df.describe())

    print('\n \n HEAD:')
    print(df.head(5))

    print('\n \n TAIL:' )
    print(df.tail(5))

peekatdata(df)

In [None]:
def obj_to_nums(df, columns=None):
    """
    Cast the given columns of ``df`` to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str, optional
        Columns to cast. Defaults to 'calories_burned', 'steps',
        'minutes_sedentary' and 'activity_calories'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns cast to int.
    """
    if columns is None:
        columns = ['calories_burned', 'steps', 'minutes_sedentary', 'activity_calories']
    # Normalise to a list so a tuple is not mistaken for a single column key.
    columns = list(columns)
    df[columns] = df[columns].astype(int)
    return df

df = obj_to_nums(df)
df.tail()

In [None]:
df.head(2)

In [None]:
def time_convert(df, datetime_format='%Y-%m-%d'):
    """
    Parse the 'date' column of ``df`` into datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'date' column of date strings. It is modified in place
        and also returned.
    datetime_format : str, optional
        strptime-style format of the date strings. The default matches the
        hyphen-separated dates in this dataset (e.g. '2018-12-02'); a
        space-separated format such as '%Y %m %d' does not describe them
        and is rejected when the format is enforced strictly.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'date' converted to datetime.
    """
    df['date'] = pd.to_datetime(df['date'], format=datetime_format)
    return df

df = time_convert(df)
df

In [None]:
def new_index(df):
    """Return a new frame indexed by the 'date' column; ``df`` itself is left unchanged."""
    return df.set_index('date')

df = new_index(df)

In [None]:
print('\nColumn Names:')
print(df.columns)
print('\nTotal number of columns:')
print(len(df.columns))

In [None]:
df.describe()

In [None]:
#df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

In [None]:
df.shape

In [None]:
df.tail()

In [None]:
monthly_resampled_data = df.calories_burned.resample('M').mean()

In [None]:
monthly_resampled_data

In [None]:
aggregation = 'sum'

In [None]:
# Aggregate to weekly values once, THEN split. 'W' bins end on Sunday, and the
# cut-off 2018-10-31 falls mid-week: splitting the daily rows first and
# resampling each half separately leaves a partial-week bin in both train and
# test under the same week-ending label, which distorts a 'sum' aggregation.
weekly_calories = df.calories_burned.resample('W').agg(aggregation)
train = weekly_calories[:'2018-10-31']
test = weekly_calories['2018-11-01':]
# NOTE(review): the first and last weekly bins may still cover fewer than
# seven days if the data does not start/end on a week boundary - confirm.

In [None]:
train.shape

In [None]:
test.shape

In [None]:
print('Observations: %d' % (len(train.values) + len(test.values)))
print('Training: %d' % (len(train.values)))
print('Testing: %d' % (len(test.values)))

In [None]:
pd.concat([train.head(3), train.tail(3)])

In [None]:
pd.concat([test.head(3), test.tail(3)])

In [None]:
plt.plot(train)
plt.plot(test)
plt.show()

In [None]:
y_hat = pd.DataFrame(dict(actual=test))

y_hat['calorie_forecast'] = train.mean()
y_hat.tail()

In [None]:
y_hat.describe()

In [None]:
def plot_data_and_predictions(predictions, label):
    """
    Draw the train series, the test series and one forecast on a single figure.

    ``train`` and ``test`` are read from the notebook's global namespace.
    ``predictions`` is drawn with a thicker line and the legend entry ``label``.
    """
    plt.figure(figsize=(10, 8))

    # (series, legend entry, extra line styling), drawn in this order.
    layers = (
        (train, 'Train', {}),
        (test, 'Test', {}),
        (predictions, label, {'linewidth': 5}),
    )
    for series, legend_entry, styling in layers:
        plt.plot(series, label=legend_entry, **styling)

    plt.legend(loc='best')
    plt.show()


def evaluate(actual, predictions, output=True):
    """
    Compute MSE and RMSE of ``predictions`` against ``actual``.

    With ``output=True`` (the default) both metrics are printed and nothing
    is returned. With ``output=False`` nothing is printed and the tuple
    ``(mse, rmse)`` is returned.
    """
    mse = metrics.mean_squared_error(actual, predictions)
    rmse = math.sqrt(mse)

    if not output:
        return mse, rmse

    for template, value in (('MSE:  {}', mse), ('RMSE: {}', rmse)):
        print(template.format(value))

def plot_and_eval(predictions, actual=None, metric_fmt='{:.2f}', linewidth=4):
    """
    Plot train, test and one or more forecasts, annotated with MSE / RMSE.

    Parameters
    ----------
    predictions : Series or list/tuple of Series
        Forecast(s) to draw. Each must have a ``name``, used as its label.
    actual : Series, optional
        Values the forecasts are scored against. Defaults to the notebook's
        global ``test`` as it is when the function is CALLED.
    metric_fmt : str
        ``str.format`` template applied to each metric value.
    linewidth : number
        Line width used for the forecast lines.

    With a single forecast the metrics go in the figure title; with several,
    each forecast's metrics are appended to its legend entry.
    ``train``, ``test`` and ``evaluate`` are read from the notebook namespace.
    """
    # A default of `actual=test` would be frozen when this cell is run, so a
    # later re-split would be scored against the old test set while the plot
    # below shows the new one. Resolve the default at call time instead.
    if actual is None:
        actual = test

    if not isinstance(predictions, (list, tuple)):
        predictions = [predictions]
    single_forecast = len(predictions) == 1

    plt.figure(figsize=(16, 8))
    plt.plot(train, label='Train')
    plt.plot(test, label='Test')

    for forecast in predictions:
        mse, rmse = evaluate(actual, forecast, output=False)
        name = f'{forecast.name}'
        # Format each metric on its own so the series name is never run
        # through str.format (which would choke on braces in the name).
        annotated = f'{name} -- MSE: {metric_fmt.format(mse)} RMSE: {metric_fmt.format(rmse)}'

        if single_forecast:
            plt.plot(forecast, label=name, linewidth=linewidth)
            plt.title(annotated)
        else:
            plt.plot(forecast, label=annotated, linewidth=linewidth)

    plt.legend(loc='best')
    plt.show()

In [None]:
plot_and_eval(y_hat.calorie_forecast)

In [None]:
# Moving-average forecast: predict every test period with the mean of the
# last `periods` observations of train. train is resampled weekly, so the
# window is counted in weeks.
# NOTE(review): the previous window of 224 looks like a daily row count; a
# window longer than the weekly series yields only NaN. 4 weeks is a
# placeholder - tune as needed.
periods = 4
# .iloc[-1] takes the single most recent rolling mean (a scalar broadcast to
# every row). A slice such as .iloc[:-1] would assign a Series indexed by
# training dates, which do not align with y_hat's test-period index.
y_hat['calorie_forecast'] = train.rolling(periods).mean().iloc[-1]

In [None]:
plot_and_eval(y_hat.calorie_forecast)

In [None]:
def drop_blank_charge(df):
    """
    Drop rows whose 'total_charges' is blank and cast the column to float.

    A value counts as blank when it is empty or whitespace-only once
    converted to a string (so ``' '``, ``''`` and ``'   '`` are all dropped).
    The input frame is not modified, and the surviving rows keep their
    original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'total_charges' column.

    Returns
    -------
    pandas.DataFrame
        New frame without the blank rows, 'total_charges' as float.
    """
    is_blank = df['total_charges'].astype(str).str.strip() == ''
    # Filter first, then let assign() build a fresh frame: this avoids
    # copying rows that are about to be dropped and avoids assigning into a
    # filtered slice of the original.
    return df.loc[~is_blank].assign(
        total_charges=lambda kept: kept['total_charges'].astype(float)
    )