## A concept for forecasting inflation with online listed item prices
Can we use online prices as a predictor of a nation's inflation and CPI?

Resources online indicates that not only is this possible, but it has been done the past decade by those who has the right tools - http://www.mit.edu/~afc/papers/Cavallo_Online_Offline.pdf

While it isn't entirely surprising that online and offline prices are similar.
finding the correct data points in an ever growing pool of resources is the challenge that this model attempts to answer.

Technolgy used: Jupyter Notebook, plot.ly, Python, d3js, Javascript

In [94]:
#Load Libraries for offline use

import os                                           # Fundamental file management libraries

import numpy as np                                  # Base Array library used by Pandas
import pandas as pd                                 # Pandas Matrix library

import scipy as sp                                  # Required as the baseline data science module

from StringIO import StringIO                       # Formulating a string as a filestream

import plotly.tools as tls                          # Auxiliary Tools

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
                                                    # plotly for offline use (i.e. no service fee)
    
from plotly.graph_objs import *                     # Different chart types
init_notebook_mode()                                # Notebook 

import cufflinks as cf                              # Bridge from DataFrames to Plotly
cf.go_offline()                                     # Required to use plotly offline (no account required).

from sklearn import preprocessing                   # For natrix normalization

categorybasepath=os.getcwd()
if 'historical_price' not in categorybasepath:
    categorybasepath = categorybasepath +'/'+ 'willzjc.github.io/python/historical_price/ref/categories/'
else:
    categorybasepath = categorybasepath.split('historical_price')[0] + 'historical_price/ref/categories/'


categorybasepath=categorybasepath.replace('\\','/')
categories=list(set([x[0].replace(categorybasepath+'\\','').split('categories')[1].replace('/','').split('\\')[0].strip() 
                     for x in os.walk(categorybasepath)]))

categories = [c for c in categories if (
              'ipynb_checkpoints' not in c and
              'Misc' not in c and not 'archive' in c
    )]

print '\n'.join([x for x in categories if len(x)>0])



Alcohol
Food and non-alcoholic beverages
Furnitures
Clothing and footwear


###  Priming data into model
Nothing too complex, loading data into matrices

In [95]:
#Reads files
global category,basepath
def readfiles(categorybasepath):
    global category,basepath
    bol_recursive=False
    df=None
    global category
    category='Clothing and footwear'
    category='Food and non-alcoholic beverages'
    category='Furnitures'
    category='Alcohol'
    basepath=categorybasepath+'/'+category+'/'
    
    files=[]

    for str_dirname, lst_subdirs, lst_files in os.walk(basepath):
        if not bol_recursive:
            while len(lst_subdirs) >= 0:

                for file in lst_files:
                    if '.csv' in file and not 'corr' in file and not 'fileread' in file:
#                         print 'Reading:',file
                        with open(basepath+file,'rb') as f:
                            buffer=f.read()
                            files.append(buffer)
                            f.close()
                
                if len(lst_subdirs) > 0: 
                    lst_subdirs.pop()
                    if len(lst_subdirs)==0:
                        break
                else:
                    break
                    
    for f in files:

        headers = {}
        buffer=[]
        headermode = True
        for line in f.split('\n'):
            if headermode and 'Date,' in line:
                headermode=False
            elements=line.strip().split(',')
            if len(elements) < 3 and len(elements) > 1:         # filter out header info
                headers[elements[0].strip()]=elements[1].strip()
            else:
                linein=False
                if len(elements)>1 and not headermode and '0.00 USD,0.00 USD' not in line:
                    buffer.append(line)
                    linein=True
                # print f,linein,line

        # Read file stream CSV
        currentdf = pd.read_csv(StringIO('\n'.join(buffer)))

        # Replace Strings
        currentdf = currentdf.replace('\sUSD', '', regex=True).apply(pd.to_numeric, errors='ignore')
        try:
            if df==None:
                df=pd.DataFrame(columns=['date'])
                df['date'] = currentdf['Date']
        except Exception as e:
            # TODO
            do_nothing=True

        df[headers['Keywords']] = currentdf['Average Selling Price']
        df[headers['Keywords']+"_sales"] = currentdf['Total Sales']
        df[headers['Keywords']+"_weighting"] = currentdf['Total Sales'] /  currentdf['Average Selling Price']

    return df

df = readfiles(categorybasepath)
original_df=df.copy()

df_summary = df.copy()

print '============================'
df_summary=df_summary.sum()[[c for c in df_summary.columns if 'date' not in c]].astype(int)
print df_summary
# dfst=pd.DataFrame(columns=['category','avg_price'])

# for category in [c for c in df_summary.columns if (not 'weight' in c and not 'sales'  in c)]:
# #     value=df_summary.loc[df_summary[category]]
#     print df_summary[category].columns
#     print value
#     dfst.loc[len(dfst)]=[category,value]
    


beer                   31152
beer_sales            552702
beer_weighting         15251
merlot                 38097
merlot_sales          416699
merlot_weighting       12267
scotch                178546
scotch_sales         2161719
scotch_weighting       19252
shiraz                 74148
shiraz_sales         1642793
shiraz_weighting       23868
vodka                  65086
vodka_sales           598886
vodka_weighting         9632
whiskey                93782
whiskey_sales         717210
whiskey_weighting       9655
wine                   35849
wine_sales           3409314
wine_weighting         93145
dtype: int32


# First Plot - Seeing the data first hand

In [96]:
# Worked out mean and sum
# df['sum']=df[[x for x in df.columns if x not in ['date']]].sum(axis=1)   # Sum is useless at this stage given prices are not noralized

def getColumns(df,exclude_cols=None):
    if not exclude_cols==None:
        exclude_cols.append('date')
    else:
        exclude_cols=['date']
    return [x for x in df.columns if x not in exclude_cols]


pricecolumns = [c for c in df.columns if (not 'date' in c and not 'sales' in c and 'weight' not in c)]
salescolumns = [c for c in df.columns if (not 'date' in c and 'sales' in c and not 'weight' in c)]
weighcolumns = [c for c in df.columns if (not 'date' in c and not 'sales' in c and 'weight' in c)]

# df['mean']=df[pricecolumns].mean(axis=1)

print pricecolumns
print salescolumns
print weighcolumns

df.iplot(columns=pricecolumns,title="<b>Price of items</b><br>Outliers not removed - boundary and scaling problems")
df.iplot(columns=salescolumns,title="<b>Sales revenue of items</b><br>Outliers not filtered")
df.iplot(columns=weighcolumns,title="<b>Units Sold</b><br>Outliers not filtered")

['beer', 'merlot', 'scotch', 'shiraz', 'vodka', 'whiskey', 'wine']
['beer_sales', 'merlot_sales', 'scotch_sales', 'shiraz_sales', 'vodka_sales', 'whiskey_sales', 'wine_sales']
['beer_weighting', 'merlot_weighting', 'scotch_weighting', 'shiraz_weighting', 'vodka_weighting', 'whiskey_weighting', 'wine_weighting']


## Data Normalization
This section 
1. Normalizes data
2. Removes any outliters above a certain percentile
3. Smoothes out chart via more suitable bucketing interval

In [97]:
#### Percentiel threshold
percentile=0.95

for c in df.columns:
    if c not in ['date']:
        q = df[c].quantile(percentile)
        df[c] = df[df[c] < q][c]

df.iplot(y=getColumns(df),title='<b>Outliers Filtered</b><br>Period bucketing frequency unaltered')

# Resampling and interpolate

def plotinterpolate(df,columns=None):
    
    if not columns==None:
        if 'date' not in columns:
            columns.append('date')
            
    ndf=df.copy()[columns]
    
    indexer='date'                                # Only do this when the column date exists, set index column as date
    if indexer in df.columns:
        ndf['date']=pd.to_datetime(df['date'])
        ndf=ndf.set_index(pd.DatetimeIndex(df['date']))
    
    ndf = ndf.resample('M').mean()
    ndf = ndf.resample('D')
    tsint = ndf.interpolate(method='cubic')
    return tsint

# interpolate, spine-smooth, and then plot 

plotinterpolate(df,pricecolumns).iplot(title='<b>Price - Normalized data</b><br>Interpolated and bucketing interval set to 1 month')
plotinterpolate(df,salescolumns).iplot(title='<b>Revenue - Normalized data</b><br>Interpolated and bucketing interval set to 1 month')
plotinterpolate(df,weighcolumns).iplot(title='<b>Items Sold - Normalized data</b><br>Interpolated and bucketing interval set to 1 month')

## In this next step - we normalize all prices to be its own scale
Bias is given to those categories which had a higher volume
1. Average price of each category product accounted for
2. As well as being weighted how many items of each category of the product is sold

In [111]:
##### Make a copy of variables first
input_df = df.copy()
input_df = input_df.interpolate(method='linear', axis=0).ffill().bfill()

##### Normalization for price #######

# norm_columns = pricecolumns + weighcolumns             # Noramlize both price and weighting
norm_columns = []                                        # Noramlize both price and weighting

for c in (pricecolumns + weighcolumns):
    if not 'date' in c:
        norm_columns.append(c)
    
x = input_df[norm_columns].values                      # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()          # Scaling
x_scaled = min_max_scaler.fit_transform(x)             # Fits curve
ndf = pd.DataFrame(x_scaled)


#### Normalized Values
ndf['date']=pd.to_datetime(input_df['date'])
ndf=ndf.set_index(pd.DatetimeIndex(input_df['date']))

#### Fixes up columns
ndf=ndf.drop(columns=['date'])
ndf.columns=(norm_columns)

### Assign weighting based on sales
for product in pricecolumns :
    if product not in 'mean':
        ndf[product]=ndf[product] * ndf[product + "_weighting"]    # Weighting Calculation
        ndf=ndf.drop(columns=[product + "_weighting"])

ndf['mean']=ndf[pricecolumns].mean(axis=1)
        
ndf=plotinterpolate(ndf)                                           # Interpolate first
fig = tls.make_subplots(rows=2, cols=1, shared_xaxes=True)         # Sub Plotting, specify how many charts

# for col in [c for c in getColumns(ndf) if c not in ['mean']]:
for col in pricecolumns :
    fig.append_trace({'x': ndf.index, 'y': ndf[col], 'type': 'scatter', 'name': col}, 1, 1)
for col in ['mean']:
    fig.append_trace({'x': ndf.index, 'y': ndf[col], 'type': 'bar', 'name': col}, 2, 1)


fig.layout.title='Weighted Normalized prices chart in comparison to mean price'
iplot(fig)


KeyError: 'date'

## CPI stats
Sourced from rba.gov.au, a csv file can be downloaded and used for our model's benchmark and reference

In [None]:
#Next is to get CPI
cpi=pd.read_csv(basepath+'AUCPI',delimiter='\t')
title='Obtain CPI for the next step, including each point\'s derivative'

cpi['diff'] = cpi.CPI.diff() # Calculating difference from previous year
cpi['diff'] = 100* cpi['diff'] / ((cpi['CPI'] + cpi['CPI'].shift(-1))/2)
cpi.index=cpi['date']

# Plotting two types of charts
def plotdouble(df,metric1,metric2,color1='orange',color2='green'):
    fig1 = df.iplot(columns=[metric1], kind='bar',asFigure=True,width=0.1,color=color1)   
    fig2 = df.iplot(columns=[metric2],  kind='line',secondary_y=[metric2], asFigure=True,colors=color2,width=5)
    fig2['data'].extend(fig1['data'])
    fig2.layout.title=title
    
    return fig2

iplot(plotdouble(cpi,'diff','CPI'))

In [None]:
# Interpolate
icpi=cpi.set_index(pd.DatetimeIndex(cpi.index))
icpi=icpi.resample('D').mean()
# icpi.CPI=icpi.CPI.resample('D').mean()
# icpi['CPI'].iplot()
# print icpi.interpolate()
# combined_df=pd.concat([cpi,df],axis=1)
# combined_df
# interpolation

icpi.CPI=icpi.CPI.interpolate(type='spline')
# icpi['CPI'].iplot(title='First interpolated spline for CPI')

icpi['diff'] = 0
icpi['diff'] = icpi.CPI.diff() # Calculating difference from previous year
icpi['diff'] = 100* icpi['diff'] / ((icpi['CPI'] + icpi['CPI'].shift(-1))/2)

# Filter out null
icpi=icpi[icpi['diff'].notnull()]

# sets index as the date for prices
# also time serializes dataframe so as to allow concatenation

#Either use prices or normalized prices
prices=df.set_index(pd.DatetimeIndex(df['date'])).drop(columns=['date'])
normalized_prices=ndf

#Combine Prices
cdf=pd.concat([icpi,normalized_prices],axis=1,join='inner')

# Get Doubleplot figure
fig=plotdouble(cdf,metric1='mean',metric2='diff')
fig.layout.title='Final Result'

mean_renamed='Price predictor'
cpi_renamed ='CPI %'
cdf=cdf.rename(columns = {'mean': mean_renamed,'diff':cpi_renamed})
fig_resampled=plotdouble(cdf.resample('M').mean(),metric1=cpi_renamed,metric2=mean_renamed)
fig_resampled.layout.title='<b>'+ category + '</b><br>'+'Mean Price and CPI - Monthly'

iplot(fig_resampled)




### Time shift
While the above looks correct, online prices are a lot more adaptive than RBA rates. 
Hence a timeshift is done to, each step is done per day, and the highest correlating shift is automatically chosen

In [None]:
def timeshift_series_plot(icpi,normalized_prices,offset,orientation=0,printoffset=True,sampling_rate='M',color1='orange',color2='blue'):
    if offset==0 :printoffset=False
        
    global mean_renamed
    global cpi_renamed
    # Time shift and filter out NaN values
    ref_icpi=icpi.copy()
    ref_normalized_prices=normalized_prices.copy()
    title='<b>'+ category + '</b><br>'+'Result - Monthly. '
    if orientation == 0:
        title=title+' CPI shift: '
        icpi=icpi.copy().shift(offset)
        icpi=icpi[icpi.CPI.notnull()]
        
        if printoffset: print ref_icpi.head(),'\nShifted:\n', icpi.head()

    else:
        title=title+' Price shift: '
        normalized_prices=normalized_prices.shift(offset)
        normalized_prices=normalized_prices[normalized_prices['mean'].notnull()]
        if printoffset: print ref_normalized_prices.head(),'\n\nShifted:\n', normalized_prices.head()
        

        
    #Combine Dataframes: CPI and Prices
    cdf=pd.concat([icpi,normalized_prices],axis=1,join='inner')
    
    # If we want to change sampling rate
    if not sampling_rate == 'M':
        cdf=cdf.resample(sampling_rate).mean().interpolate(kind='spine')    


    
#     print cdf
    corr=cdf['diff'].corr(cdf['mean'])   # Calculate Correlation
    title= title + str(offset) +' days. Correlation: '+str(round(corr,2))

    # Get Doubleplot figure
    fig=plotdouble(cdf,metric1='mean',metric2='diff')

    cdf=cdf.rename(columns = {'mean': mean_renamed,'diff':cpi_renamed})
    fig_resampled=plotdouble(cdf.resample(sampling_rate).mean(),metric1=cpi_renamed,metric2=mean_renamed,color1=color1,color2=color2)
    fig_resampled.layout.title=title
    
    iplot(fig_resampled)
    return fig_resampled


#find correlation of range
correlation_matrx=pd.DataFrame(columns=['offset','correlation'])
for i in range(-150,150):
    licpi=icpi.copy().shift(i)
    licpi=licpi[licpi.CPI.notnull()]
    

    cdf=pd.concat([licpi,normalized_prices],axis=1,join='inner')
    corr=cdf['diff'].corr(cdf['mean'])   # Calculate Correlation
    

    correlation_matrx.loc[len(correlation_matrx)]=[i,corr]

correlation_matrx.iplot(y='correlation',x='offset',title='<b>'+ category + '</b><br>'+'Days Offset Correlation Spectrum')


In [None]:
scdf=correlation_matrx.copy()
highest = scdf.loc[scdf['offset']> -388].loc[scdf['offset']<55].sort_values(['correlation'],ascending=False).head(n=1)
h_offset = highest['offset'].values[0].astype(int)

double_chart=timeshift_series_plot(icpi=icpi,normalized_prices=normalized_prices,offset=h_offset,orientation=0
                      ,sampling_rate='M'
                      ,color1='yellow'
                      ,color2='purple'
                     )
# print scdf
# for r in scdf.iterrows():
#     print r[1]['offset'],r[1]['correlation']


In [None]:
iplot(double_chart)