## Introduction
In this notebook, we aim to predict how a brand will rank in the Interbrand ranking three years ahead, based on its rankings over the most recent five years (the base year and the four years before it).

The following files are needed to run the code:

1. The Interbrand ranking by year, as JSON: `interbrand_brand2rankvalue.json` (on GitHub)
2. `interbrand_brand2freq.json` (on GitHub), which is used to obtain the list of brands

In [1]:
# import packages
import csv
import json

import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg
import scipy.spatial
from sklearn.linear_model import LinearRegression

In [2]:
## Create dataset
#
# Build one row per (brand, base year t) holding the brand's rank in the years
# t-4 ... t and t+3, plus a flag per year telling whether the brand was on the
# list at all, and write the table to 'interbrand_dataset.csv'.

print('read files')

with open('interbrand_brand2freq.json') as f:
    brand2year2freq = json.load(f)

with open('interbrand_brand2rankvalue.json') as f:
    brand2year2rankvalue = json.load(f)

# Only the brand names of the frequency file are used in this cell.
brands = sorted(brand2year2freq)

Ts = range(2005,2017)      # base years t
rank_if_not_list = 101     # placeholder rank for a year without an entry
n_feature = 14             # expected number of columns per row

# Year offsets relative to the base year, and the matching column suffixes.
year_offsets = (-4, -3, -2, -1, 0, 3)
offset_labels = ['t-4', 't-3', 't-2', 't-1', 't', 't+3']

header = ['brand', 'base_year'] + \
    ['rank_' + label for label in offset_labels] + \
    ['isonlist_' + label for label in offset_labels]
assert len(header) == n_feature

# `dataset` holds the data rows (without the header), one list per row.
dataset = []
for brand in brands:
    year2rankvalue = brand2year2rankvalue[brand]
    for t in Ts:
        # Each stored entry is indexed with [0] to get the rank (presumably the
        # entry is a (rank, value) pair -- confirm against the JSON file).
        # If not on the list that year, rank = rank_if_not_list.
        ranks = [
            year2rankvalue.get(str(t + offset), (rank_if_not_list,))[0]
            for offset in year_offsets
        ]
        is_on_list = [rank < rank_if_not_list for rank in ranks]

        current_row = [brand, t] + ranks + is_on_list
        assert len(current_row) == n_feature
        dataset.append(current_row)

# csv.writer quotes fields that contain commas or quotes, so unusual brand names
# cannot shift columns. The file is written as latin1 because the cell that
# loads it calls pd.read_csv(..., encoding='latin1'); relying on the platform
# default encoding would garble non-ASCII brand names on UTF-8 systems.
with open('interbrand_dataset.csv', 'w', newline='', encoding='latin1') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(dataset)
print('done')

read files
done


In [3]:
# Load the table written by the dataset-building cell; the last expression
# displays the resulting frame.
dataset_path = 'interbrand_dataset.csv'
df = pd.read_csv(dataset_path, encoding='latin1')
df

Unnamed: 0,brand,base_year,rank_t-4,rank_t-3,rank_t-2,rank_t-1,rank_t,rank_t+3,isonlist_t-4,isonlist_t-3,isonlist_t-2,isonlist_t-1,isonlist_t,isonlist_t+3
0,3m,2005,101,99,101,101,101,101,False,True,False,False,False,False
1,3m,2006,99,101,101,101,101,101,True,False,False,False,False,False
2,3m,2007,101,101,101,101,101,90,False,False,False,False,False,True
3,3m,2008,101,101,101,101,101,85,False,False,False,False,False,True
4,3m,2009,101,101,101,101,101,77,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1807,zurich,2012,101,101,94,94,101,101,False,False,True,True,False,False
1808,zurich,2013,101,94,94,101,101,101,False,True,True,False,False,False
1809,zurich,2014,94,94,101,101,101,101,True,True,False,False,False,False
1810,zurich,2015,94,101,101,101,101,101,True,False,False,False,False,False


In [4]:
# Keep only the rows where the brand was on the list in every year of the
# window (t-4 ... t and t+3), then drop the indicator columns.
# NOTE: `df` is rebound to the filtered frame, so this cell cannot be re-run
# without first re-running the cell that reads the CSV.
isonlist_columns = ['isonlist_t-4', 'isonlist_t-3', 'isonlist_t-2', 'isonlist_t-1', 'isonlist_t', 'isonlist_t+3']
rank_columns = ['rank_t-4', 'rank_t-3', 'rank_t-2', 'rank_t-1', 'rank_t', 'rank_t+3']

selected = df[df[isonlist_columns].all(axis=1)]
df = selected[['brand', 'base_year'] + rank_columns]
df

Unnamed: 0,brand,base_year,rank_t-4,rank_t-3,rank_t-2,rank_t-1,rank_t,rank_t+3
9,3m,2014,90,85,77,76,66,58
10,3m,2015,85,77,76,66,59,60
11,3m,2016,77,76,66,59,59,64
25,accenture,2006,53,52,50,51,49,45
26,accenture,2007,52,50,51,49,50,47
...,...,...,...,...,...,...,...,...
1795,zara,2012,62,50,48,44,37,30
1796,zara,2013,50,48,44,37,36,27
1797,zara,2014,48,44,37,36,36,24
1798,zara,2015,44,37,36,36,30,25


In [5]:
# Data splitting: hold out 25% of the rows as a test set (fixed seed).
# NOTE(review): rows of the same brand in neighbouring base years share rank
# values, and the split is by row, so train and test windows may overlap --
# consider whether a split by brand or by year is more appropriate.
from sklearn.model_selection import train_test_split

ar5_columns = ['rank_t-4', 'rank_t-3', 'rank_t-2', 'rank_t-1', 'rank_t']
features_train, features_test, rank_train, rank_test = train_test_split(
    df[ar5_columns],
    df['rank_t+3'],
    test_size=0.25,
    random_state=12580,
)

In [6]:
import sklearn.metrics as skm
from sklearn.metrics import mean_absolute_error

# Regress rank_t+3 on the lagged ranks (the notebook's "AR(5)" model) and
# score the fit on the held-out rows.

lr = LinearRegression(n_jobs=-1)
lr.fit(features_train, rank_train)
prediction = lr.predict(features_test)

# Mean absolute error and coefficient of determination on the test set.
mae_e = mean_absolute_error(y_true=rank_test, y_pred=prediction)
R_square = skm.r2_score(y_true=rank_test, y_pred=prediction)
print('mae_a: ', mae_e)
print('R^2: ', R_square)


mae_a:  7.090702897632233
R^2:  0.8224712058400688
[ 0.0914439  -0.08210152 -0.22787619 -0.48382107  1.64320625]


In [10]:
# Show the fitted slope coefficients, then the intercept, one per line.
for fitted_parameter in (lr.coef_, lr.intercept_):
    print(fitted_parameter)

[ 0.0914439  -0.08210152 -0.22787619 -0.48382107  1.64320625]
2.826436202612996


In [7]:
from statsmodels.api import OLS
import statsmodels.api as sm

# Refit the regression with statsmodels to get standard errors and p-values.
# statsmodels' OLS does not add an intercept by itself, so a constant column is
# added first. The augmented frame gets its own name: overwriting
# `features_train` would make a later re-run of the LinearRegression cell fit
# on an extra 'const' column.
features_train_const = sm.add_constant(features_train)
OLS(rank_train, features_train_const).fit().summary()

0,1,2,3
Dep. Variable:,rank_t+3,R-squared:,0.887
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,920.9
Date:,"Thu, 10 Dec 2020",Prob (F-statistic):,4.59e-275
Time:,20:14:56,Log-Likelihood:,-2132.5
No. Observations:,593,AIC:,4277.0
Df Residuals:,587,BIC:,4303.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8264,0.682,4.145,0.000,1.487,4.166
rank_t-4,0.0914,0.088,1.037,0.300,-0.082,0.265
rank_t-3,-0.0821,0.147,-0.557,0.578,-0.371,0.207
rank_t-2,-0.2279,0.153,-1.494,0.136,-0.527,0.072
rank_t-1,-0.4838,0.158,-3.072,0.002,-0.793,-0.174
rank_t,1.6432,0.100,16.364,0.000,1.446,1.840

0,1,2,3
Omnibus:,119.784,Durbin-Watson:,1.923
Prob(Omnibus):,0.0,Jarque-Bera (JB):,445.559
Skew:,0.892,Prob(JB):,1.77e-97
Kurtosis:,6.853,Cond. No.,201.0


In [11]:
# Data splitting with AR(3) instead: same target, same test fraction and seed,
# but only the three most recent ranks as features.
from sklearn.model_selection import train_test_split

ar3_columns = ['rank_t-2', 'rank_t-1', 'rank_t']
features_train, features_test, rank_train, rank_test = train_test_split(
    df[ar3_columns],
    df['rank_t+3'],
    test_size=0.25,
    random_state=12580,
)

In [12]:
# Add the intercept column, fit the AR(3) regression with statsmodels and
# display its summary table.
features_train = sm.add_constant(features_train)
ar3_ols_fit = OLS(rank_train, features_train).fit()
ar3_ols_fit.summary()

0,1,2,3
Dep. Variable:,rank_t+3,R-squared:,0.887
Model:,OLS,Adj. R-squared:,0.886
Method:,Least Squares,F-statistic:,1536.0
Date:,"Thu, 10 Dec 2020",Prob (F-statistic):,5.46e-278
Time:,20:21:19,Log-Likelihood:,-2133.1
No. Observations:,593,AIC:,4274.0
Df Residuals:,589,BIC:,4292.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9390,0.672,4.371,0.000,1.618,4.260
rank_t-2,-0.2024,0.097,-2.092,0.037,-0.392,-0.012
rank_t-1,-0.4920,0.157,-3.133,0.002,-0.800,-0.184
rank_t,1.6336,0.099,16.427,0.000,1.438,1.829

0,1,2,3
Omnibus:,118.852,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,461.612
Skew:,0.871,Prob(JB):,5.78e-101
Kurtosis:,6.956,Cond. No.,152.0
