## Introduction
In this notebook, we aim to predict how a brand will rank on the interbrand ranking 3 years later based on its past 5 years ranking.

The following files are needed to run the codes:

1. the interbrand ranking by year json: `interbrand_brand2rankvalue.json` (on github)
2. `interbrand_brand2freq.json` (on github)

In [119]:
# import packages
import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np 
import scipy as sp
import scipy.spatial
import scipy.linalg
import json
import pandas as pd
from sklearn.linear_model import LinearRegression

In [103]:
## Create dataset

print('read files')

with open('interbrand_brand2freq.json') as f:
    brand2year2freq = json.load(f)

with open('interbrand_brand2rankvalue.json') as f:
    brand2year2rankvalue = json.load(f)

brands = sorted(list(brand2year2freq))
brands = [b for b in brands]

Ts = range(2005,2017)
rank_if_not_list = 101
n_feature = 14

dataset = []
header = ['brand', 'base_year']  + \
['rank_t-4','rank_t-3', 'rank_t-2', 'rank_t-1', 'rank_t','rank_t+3','isonlist_t-4','isonlist_t-3','isonlist_t-2','isonlist_t-1','isonlist_t','isonlist_t+3',]

# if not on the list, rank=101
for brand in brands:
    for t in Ts:
        rank_tb4 = brand2year2rankvalue[brand].get(str(t-4), (rank_if_not_list,))[0]
        rank_tb3 = brand2year2rankvalue[brand].get(str(t-3), (rank_if_not_list,))[0] 
        rank_tb2 = brand2year2rankvalue[brand].get(str(t-2), (rank_if_not_list,))[0]
        rank_tb1 = brand2year2rankvalue[brand].get(str(t-1), (rank_if_not_list,))[0]
        rank_t = brand2year2rankvalue[brand].get(str(t), (rank_if_not_list,))[0]
        rank_tf3 = brand2year2rankvalue[brand].get(str(t+3), (rank_if_not_list,))[0]

        ison_tb4 = rank_tb4 < rank_if_not_list
        ison_tb3 = rank_tb3 < rank_if_not_list
        ison_tb2 = rank_tb2 < rank_if_not_list
        ison_tb1 = rank_tb1 < rank_if_not_list
        ison_t = rank_t < rank_if_not_list
        ison_tf3 = rank_tf3 < rank_if_not_list

        current_row = [brand, t] + \
        [rank_tb4, rank_tb3, rank_tb2, rank_tb1, rank_t, rank_tf3, ison_tb4, ison_tb3, ison_tb2, ison_tb1, ison_t, ison_tf3]
        current_row = [str(c) for c in current_row]
        assert(len(current_row)==n_feature)
        dataset.append(current_row)


dataset = [','.join(row)+'\n' for row in dataset]
dataset = [','.join(header)+'\n'] + dataset
with open('interbrand_dataset.csv', 'w') as f:
    for row in dataset:
        f.write(row)
print('done')

read files
done


In [104]:
df = pd.read_csv('interbrand_dataset.csv',encoding='latin1')
df

Unnamed: 0,brand,base_year,rank_t-4,rank_t-3,rank_t-2,rank_t-1,rank_t,rank_t+3,isonlist_t-4,isonlist_t-3,isonlist_t-2,isonlist_t-1,isonlist_t,isonlist_t+3
0,3m,2005,101,99,101,101,101,101,False,True,False,False,False,False
1,3m,2006,99,101,101,101,101,101,True,False,False,False,False,False
2,3m,2007,101,101,101,101,101,90,False,False,False,False,False,True
3,3m,2008,101,101,101,101,101,85,False,False,False,False,False,True
4,3m,2009,101,101,101,101,101,77,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,zurich,2012,101,101,94,94,101,101,False,False,True,True,False,False
1808,zurich,2013,101,94,94,101,101,101,False,True,True,False,False,False
1809,zurich,2014,94,94,101,101,101,101,True,True,False,False,False,False
1810,zurich,2015,94,101,101,101,101,101,True,False,False,False,False,False


In [105]:
# consider only brands that having all ranking data in the given period

selected = df[df['isonlist_t-4'] & df['isonlist_t-3'] & df['isonlist_t-2'] & df['isonlist_t-1'] & df['isonlist_t']& df['isonlist_t+3']]
df = selected[['brand', 'base_year', 'rank_t-4', 'rank_t-3', 'rank_t-2', 'rank_t-1', 'rank_t', 'rank_t+3']]
df

Unnamed: 0,brand,base_year,rank_t-4,rank_t-3,rank_t-2,rank_t-1,rank_t,rank_t+3
9,3m,2014,90,85,77,76,66,58
10,3m,2015,85,77,76,66,59,60
11,3m,2016,77,76,66,59,59,64
25,accenture,2006,53,52,50,51,49,45
26,accenture,2007,52,50,51,49,50,47
...,...,...,...,...,...,...,...,...
1795,zara,2012,62,50,48,44,37,30
1796,zara,2013,50,48,44,37,36,27
1797,zara,2014,48,44,37,36,36,24
1798,zara,2015,44,37,36,36,30,25


In [134]:
# Data spliting
from sklearn.model_selection import train_test_split
features_train, features_test, rank_train, rank_test = \
train_test_split(df[['rank_t-4', 'rank_t-3', 'rank_t-2', 'rank_t-1', 'rank_t']], df['rank_t+3'], test_size=0.25, random_state=12580)

In [135]:
import sklearn.metrics as sm
# predict rank_t+3 using AR(5) model

lr = LinearRegression(n_jobs=-1).fit(features_train, rank_train)
prediction = lr.predict(features_test)

mae_e = mean_absolute_error(rank_test, prediction)
R_square = sm.r2_score(rank_test, prediction)
print('mae_a: ', mae_e)
print('R^2: ', R_square)

mae_a:  7.090702897632233
R^2:  0.8224712058400688


In [136]:
from statsmodels.api import OLS
OLS(rank_train,features_train).fit().summary()

0,1,2,3
Dep. Variable:,rank_t+3,R-squared (uncentered):,0.965
Model:,OLS,Adj. R-squared (uncentered):,0.965
Method:,Least Squares,F-statistic:,3263.0
Date:,"Wed, 25 Nov 2020",Prob (F-statistic):,0.0
Time:,16:22:50,Log-Likelihood:,-2141.1
No. Observations:,593,AIC:,4292.0
Df Residuals:,588,BIC:,4314.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
rank_t-4,0.1361,0.089,1.535,0.125,-0.038,0.310
rank_t-3,-0.0969,0.149,-0.649,0.516,-0.390,0.196
rank_t-2,-0.2423,0.155,-1.568,0.117,-0.546,0.061
rank_t-1,-0.4821,0.160,-3.019,0.003,-0.796,-0.169
rank_t,1.6757,0.101,16.513,0.000,1.476,1.875

0,1,2,3
Omnibus:,98.598,Durbin-Watson:,1.917
Prob(Omnibus):,0.0,Jarque-Bera (JB):,421.0
Skew:,0.681,Prob(JB):,3.81e-92
Kurtosis:,6.897,Cond. No.,62.0


In [132]:
# Data spliting with AR(3) instead
from sklearn.model_selection import train_test_split
features_train, features_test, rank_train, rank_test = \
train_test_split(df[['rank_t-2', 'rank_t-1', 'rank_t']], df['rank_t+3'], test_size=0.25, random_state=12580)

In [133]:
OLS(rank_train,features_train).fit().summary()

0,1,2,3
Dep. Variable:,rank_t+3,R-squared (uncentered):,0.965
Model:,OLS,Adj. R-squared (uncentered):,0.965
Method:,Least Squares,F-statistic:,5428.0
Date:,"Wed, 25 Nov 2020",Prob (F-statistic):,0.0
Time:,16:22:01,Log-Likelihood:,-2142.6
No. Observations:,593,AIC:,4291.0
Df Residuals:,590,BIC:,4304.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
rank_t-2,-0.1715,0.098,-1.751,0.081,-0.364,0.021
rank_t-1,-0.4971,0.159,-3.118,0.002,-0.810,-0.184
rank_t,1.6598,0.101,16.469,0.000,1.462,1.858

0,1,2,3
Omnibus:,96.438,Durbin-Watson:,1.922
Prob(Omnibus):,0.0,Jarque-Bera (JB):,449.629
Skew:,0.635,Prob(JB):,2.31e-98
Kurtosis:,7.073,Cond. No.,43.5
