## Introduction
In this notebook, we aim to predict a brand's position in the next year's Interbrand ranking from its past ranking, its frequency in the corpus, and its word embedding.

The following files are needed to run the code:

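1. the word embedding model file: `L10T50G100A1ngV_iter1.p` (shared via Google Drive because it is too large for GitHub)
2. the word-to-id JSON: `w2id_glove_corpora_minc_100.json` (on GitHub)
3. the Interbrand ranking-by-year JSON: `interbrand_brand2rankvalue.json` (on GitHub)
4. the brand frequency-by-year JSON: `interbrand_brand2freq.json`
5. the corpus total word count by year JSON: `glove_corpora_totalwordcount.json`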

In [None]:
# import packages
import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np 
import scipy as sp
import scipy.spatial
import scipy.linalg
import json
import os
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from datetime import datetime
from time import time
import pickle
import sys
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error, plot_roc_curve, precision_recall_fscore_support

In [None]:
## this is the sample code for using the word embedding model
# Locations of the pickled embedding model and of its word-to-id JSON.
# Set the DW2V_DATA_DIR environment variable to point at your own copy of the
# files; when it is unset, the original author's local directory is used.
# NOTE(review): the introduction lists `L10T50G100A1ngV_iter1.p` and
# `w2id_glove_corpora_minc_100.json`, while the names below are the ones that
# are actually loaded -- confirm which pair is current.
file_dir = os.environ.get(
    'DW2V_DATA_DIR',
    '/Users/vincent/GoogleDriveSync/NTUCourse/BerkeleyCourse/Ming Hsu Lab/various_embedding/pierre2/stereotyping_word2vec_nodata/scraping/nytimes_2011/',
)
# os.path.join accepts the directory with or without a trailing separator.
dw2v_filepath = os.path.join(file_dir, 'L10T50G100A1ngV_iter4.p')
w2id_path = os.path.join(file_dir, 'w2id_glove_corpora_1996-2019_minc_100.json')

# define dynamic word vec class 
class DynamicWordVec():
    """Per-period word vectors with similarity helpers.

    Attributes:
        wordvec_matrix: object loaded from the pickle; it is indexed as
            ``wordvec_matrix[period][word_id, :]`` to obtain one word vector.
        num_periods: ``len(wordvec_matrix)``.
        w2id: mapping from word to its row index, loaded from JSON.
    """

    def __init__(self, dw2v_filepath, w2id_path):
        """Load the pickled vectors and the word-to-id JSON.

        Args:
            dw2v_filepath: path of the pickled embedding.
            w2id_path: path of the word-to-id JSON file.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files that come from a trusted source.
        with open(dw2v_filepath, 'rb') as f:
            self.wordvec_matrix = pickle.load(f)
        self.num_periods = len(self.wordvec_matrix)

        with open(w2id_path) as f:
            self.w2id = json.load(f)

    def get_vec(self, w, yr):
        """Return the vector of word ``w`` in period index ``yr``.

        Raises:
            KeyError: if ``w`` is not in the vocabulary.
        """
        return self.wordvec_matrix[yr][self.w2id[w], :]

    def sim_by_vec(self, v1, v2, sim_type='cosim'):
        """Return the similarity of two vectors.

        Args:
            v1, v2: the vectors to compare.
            sim_type: ``'inner'`` for the inner product, ``'cosim'`` for
                cosine similarity (1 - cosine distance).

        Raises:
            ValueError: if ``sim_type`` is neither ``'inner'`` nor ``'cosim'``.
        """
        if sim_type == 'inner':
            return np.inner(v1, v2)
        if sim_type == 'cosim':
            return 1 - sp.spatial.distance.cosine(v1, v2)
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working.
        raise ValueError('sim_type should be either "inner" or "cosim"')

    def sim_by_word_year(self, w1, y1, w2, y2, sim_type='cosim'):
        """Return the similarity between ``w1`` in period ``y1`` and ``w2`` in period ``y2``."""
        return self.sim_by_vec(self.get_vec(w1, y1), self.get_vec(w2, y2), sim_type=sim_type)

    def is_in_vocab(self, w):
        """Return True when ``w`` has an entry in the vocabulary."""
        return w in self.w2id

    def most_similar_words_in_year(self, w1, y1, topn, y2, include_self, sim_type='cosim'):
        """Return the ``topn`` words of period ``y2`` most similar to ``w1`` of period ``y1``.

        Args:
            w1: query word.
            y1: period index of the query word.
            topn: number of results to return.
            y2: period index in which candidate words are looked up.
            include_self: when False, ``w1`` itself is left out of the candidates.
            sim_type: similarity measure, see ``sim_by_vec``.

        Returns:
            A list of ``(word, similarity)`` tuples, most similar first.
        """
        # The query vector does not change across candidates: look it up once.
        query_vec = self.get_vec(w1, y1)
        scores = {}
        for word in self.w2id:
            if word == w1 and not include_self:
                continue
            # Forward sim_type so the caller's choice of measure is honoured.
            scores[word] = self.sim_by_vec(query_vec, self.get_vec(word, y2), sim_type=sim_type)
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]

In [None]:
# Disabled usage example: it is kept inside a string so that it does not
# execute; copy the body into a cell to try it.
'''
dw2v = DynamicWordVec(dw2v_filepath, w2id_path) # load the trained dynamic word embedding model

start_t = 1996 # the first year of the corpora
target_word = 'obama' # target word of your choice
target_word_yr = 2011 # the year of the target word, again, feel free to change it

result_word_yrs = range(2015, 2020)

most_sim_words = [dw2v.most_similar_words_in_year(target_word, target_word_yr-start_t, 10, yr-start_t, True) for yr in result_word_yrs]
for yr, items in zip(result_word_yrs, most_sim_words):
    # words of year `yr` that are most similar to target_word as of target_word_yr
    string_to_print = ['{}({:.2f})'.format(w, sim) for w, sim in items]
    print(yr, string_to_print)
'''

In [None]:
## Create dataset
# Builds one row per (brand, year): the brand's word vector for that year, its
# raw and per-mille corpus frequencies for years t-4..t, and its rank /
# on-list flag for years t and t+1. The table is written to
# interbrand_dataset.csv.

print('read files')
dw2v = DynamicWordVec(dw2v_filepath, w2id_path)

with open('./interbrand_brand2freq.json') as f:
    brand2year2freq = json.load(f)

with open('./interbrand_brand2rankvalue.json') as f:
    brand2year2rankvalue = json.load(f)

with open('./glove_corpora_totalwordcount.json') as f:
    year2count = json.load(f)

# keep only the brands that have a word vector, in alphabetical order
brands = sorted(b for b in brand2year2freq if dw2v.is_in_vocab(b))

Ts = range(2001, 2019)
corpurs_start_t = 1996   # year that corresponds to period index 0 of the embedding
rank_if_not_list = 101   # rank assigned to a brand-year that is absent from the ranking
n_feature = 67           # expected number of columns, checked against header and rows

header = ['brand_year', 'brand', 'year'] + ['wordvec_{}'.format(i) for i in range(1, 51)] + \
    ['freq_t-4', 'freq_t-3', 'freq_t-2', 'freq_t-1', 'freq_t'] + \
    ['ratiopermille_t-4', 'ratiopermille_t-3', 'ratiopermille_t-2', 'ratiopermille_t-1', 'ratiopermille_t'] + \
    ['rank_t', 'rank_t+1', 'isonlist_t', 'isonlist_t+1']
print(len(header))
assert(len(header)==n_feature)

dataset = []  # typed rows; serialisation is left to pandas below
for brand in brands:
    # per-brand lookups do not depend on the year, so do them once
    year2freq = brand2year2freq[brand]
    year2rankvalue = brand2year2rankvalue[brand]
    for t in Ts:
        years_tm4_to_t = [str(s) for s in range(t-4, t+1)]  # JSON keys are strings

        wordvec = dw2v.get_vec(brand, t-corpurs_start_t).tolist() # dim-50
        freq_tm4_to_t = [year2freq[y] for y in years_tm4_to_t]
        totalwordcount_tm4_to_t = [year2count[y] for y in years_tm4_to_t]
        ratiopermille_tm4_to_t = [freq*1000 / count for freq, count in zip(freq_tm4_to_t, totalwordcount_tm4_to_t)]

        # first element of the stored entry is used as the rank; if not on the list, rank=101
        rank_t = year2rankvalue.get(str(t), (rank_if_not_list,))[0]
        rank_tp1 = year2rankvalue.get(str(t+1), (rank_if_not_list,))[0]

        ison_t = rank_t < rank_if_not_list
        ison_tp1 = rank_tp1 < rank_if_not_list

        brandyear = '{}-{}'.format(brand, t)

        current_row = [brandyear, brand, t] + wordvec + freq_tm4_to_t + ratiopermille_tm4_to_t + [rank_t, rank_tp1, ison_t, ison_tp1]
        assert(len(current_row)==n_feature)
        dataset.append(current_row)

# Let pandas write the CSV so that any field containing a comma or quote is
# escaped properly instead of silently shifting columns.
pd.DataFrame(dataset, columns=header).to_csv('interbrand_dataset.csv', index=False)
print('done')

In [None]:
# train/valid/test split
# Rows are assigned to train/valid/test (70% / 20% / remainder) by a seeded
# random permutation, so re-running the notebook reproduces the same split.
# NOTE(review): rows are shuffled individually, so different years of the same
# brand can land in different groups -- confirm this is intended.

df = pd.read_csv('interbrand_dataset.csv')

# consider only brands with 1000+ freq
# NOTE(review): the disabled filter below uses >500, not 1000 -- confirm the threshold.
#brand1000p = [b for b in brands if any([brand2year2freq[b][str(y)]>500 for y in range(2001, 2019)])]
#df = df.loc[df['brand'].isin(brand1000p)]

n_data = len(df)
n_train = int(n_data*0.7)
n_valid = int(n_data*0.2)
n_test = n_data - n_train - n_valid  # remainder, so the three sizes always add up
assert(n_train + n_valid + n_test == n_data)

print(n_train, n_valid, n_test, n_data)

split_seed = 0  # fixed seed: the split and every downstream metric become reproducible
group_labels = np.repeat(['train', 'valid', 'test'], [n_train, n_valid, n_test])
# shuffling the full label vector keeps the exact group sizes computed above
randomized_labels = np.random.RandomState(split_seed).permutation(group_labels)
df['group'] = randomized_labels

df_train = df[df['group']=='train']
df_valid = df[df['group']=='valid']
df_test = df[df['group']=='test']

df_train = df_train.drop(columns=['group'])
df_valid = df_valid.drop(columns=['group'])
df_test = df_test.drop(columns=['group'])

# validation rows split by whether the brand is on the list in year t
df_valid_negt = df_valid.loc[df_valid['isonlist_t'] == False]
df_valid_post = df_valid.loc[df_valid['isonlist_t'] == True]
print(len(df_valid_negt))
print(len(df_valid_post))

In [None]:
# Model A: logistic regression predicting isonlist_t+1 from the five raw
# frequency columns (years t-4 .. t); evaluated on the validation rows.
feature_list_a = [
    'freq_t-4', 'freq_t-3', 'freq_t-2', 'freq_t-1', 'freq_t',
]

# same column selection applied to the training and the validation frame
features_a, valid_features_a = (
    part[feature_list_a].to_numpy() for part in (df_train, df_valid)
)
target_a, valid_target_a = (
    part['isonlist_t+1'].to_numpy() for part in (df_train, df_valid)
)

lr = LogisticRegression(n_jobs=-1)
lr.fit(features_a, target_a)

hat_valid_target_a = lr.predict(valid_features_a)
# keep only the second predict_proba column (the one the original indexed as p[1])
prob_valid_target_a = list(lr.predict_proba(valid_features_a)[:, 1])

acc_a = accuracy_score(valid_target_a, hat_valid_target_a)
auc_a = roc_auc_score(valid_target_a, prob_valid_target_a)

print('acc_a: ', acc_a)
print('auc_a: ', auc_a)

# NOTE(review): plot_roc_curve is deprecated in newer scikit-learn releases in
# favour of RocCurveDisplay.from_estimator -- confirm the pinned version.
plot_roc_curve(lr, valid_features_a, valid_target_a)
plt.savefig('roc_a.png')

precision, recall, f1, support = precision_recall_fscore_support(
    valid_target_a, hat_valid_target_a, average='binary'
)
print('precision:{}, recall:{}, f1:{}, support:{}'.format(precision, recall, f1, support))

In [None]:
# Model B: logistic regression predicting isonlist_t+1 from rank_t alone;
# evaluated on the validation rows.
feature_list_b = ['rank_t']

features_b = df_train[feature_list_b].to_numpy()
target_b = df_train['isonlist_t+1'].to_numpy()

valid_features_b = df_valid[feature_list_b].to_numpy()
valid_target_b = df_valid['isonlist_t+1'].to_numpy()

lr = LogisticRegression(n_jobs=-1).fit(features_b, target_b)
hat_valid_target_b = lr.predict(valid_features_b)
prob_valid_target_b = lr.predict_proba(valid_features_b)
# keep only the second predict_proba column for the AUC
prob_valid_target_b = [p[1] for p in prob_valid_target_b]

acc_b = accuracy_score(valid_target_b, hat_valid_target_b)
auc_b = roc_auc_score(valid_target_b, prob_valid_target_b)

# label the metrics with this model's suffix so the output is not mistaken for model A
print('acc_b: ', acc_b)
print('auc_b: ', auc_b)

plot_roc_curve(lr, valid_features_b, valid_target_b)
plt.savefig('roc_b.png')

precision, recall, f1, support = precision_recall_fscore_support(valid_target_b, hat_valid_target_b, average='binary')
print('precision:{}, recall:{}, f1:{}, support:{}'.format(precision, recall, f1, support))

In [None]:
# Model C (word vector + frequencies): logistic regression predicting
# isonlist_t+1 from the 50 wordvec columns and the five raw frequency columns.
# NOTE(review): the following cells reuse the *_c names and roc_c.png, so their
# results replace the ones produced here -- confirm that is intended.
feature_list_c = ['wordvec_{}'.format(i) for i in range(1, 51)] + ['freq_t-4', 'freq_t-3', 'freq_t-2', 'freq_t-1', 'freq_t']

features_c = df_train[feature_list_c].to_numpy()
target_c = df_train['isonlist_t+1'].to_numpy()

valid_features_c = df_valid[feature_list_c].to_numpy()
valid_target_c = df_valid['isonlist_t+1'].to_numpy()

lr = LogisticRegression(n_jobs=-1).fit(features_c, target_c)
hat_valid_target_c = lr.predict(valid_features_c)
prob_valid_target_c = lr.predict_proba(valid_features_c)
# keep only the second predict_proba column for the AUC
prob_valid_target_c = [p[1] for p in prob_valid_target_c]

acc_c = accuracy_score(valid_target_c, hat_valid_target_c)
auc_c = roc_auc_score(valid_target_c, prob_valid_target_c)

# label the metrics with this model's suffix so the output is not mistaken for model A
print('acc_c: ', acc_c)
print('auc_c: ', auc_c)

plot_roc_curve(lr, valid_features_c, valid_target_c)
plt.savefig('roc_c.png')

precision, recall, f1, support = precision_recall_fscore_support(valid_target_c, hat_valid_target_c, average='binary')
print('precision:{}, recall:{}, f1:{}, support:{}'.format(precision, recall, f1, support))

In [None]:
# Model C (word vector + rank): logistic regression predicting isonlist_t+1
# from the 50 wordvec columns and rank_t.
# NOTE(review): this cell rebinds the *_c names and rewrites roc_c.png used by
# the other "c" cells -- confirm that overwriting is intended.
feature_list_c = ['wordvec_{}'.format(i) for i in range(1, 51)] + ['rank_t']

features_c = df_train[feature_list_c].to_numpy()
target_c = df_train['isonlist_t+1'].to_numpy()

valid_features_c = df_valid[feature_list_c].to_numpy()
valid_target_c = df_valid['isonlist_t+1'].to_numpy()

lr = LogisticRegression(n_jobs=-1).fit(features_c, target_c)
hat_valid_target_c = lr.predict(valid_features_c)
prob_valid_target_c = lr.predict_proba(valid_features_c)
# keep only the second predict_proba column for the AUC
prob_valid_target_c = [p[1] for p in prob_valid_target_c]

acc_c = accuracy_score(valid_target_c, hat_valid_target_c)
auc_c = roc_auc_score(valid_target_c, prob_valid_target_c)

# label the metrics with this model's suffix so the output is not mistaken for model A
print('acc_c: ', acc_c)
print('auc_c: ', auc_c)

plot_roc_curve(lr, valid_features_c, valid_target_c)
plt.savefig('roc_c.png')

precision, recall, f1, support = precision_recall_fscore_support(valid_target_c, hat_valid_target_c, average='binary')
print('precision:{}, recall:{}, f1:{}, support:{}'.format(precision, recall, f1, support))

In [None]:
# Model C (word vector only): logistic regression predicting isonlist_t+1 from
# the 50 wordvec columns.
# NOTE(review): this cell rebinds the *_c names and rewrites roc_c.png used by
# the other "c" cells -- confirm that overwriting is intended.
feature_list_c = ['wordvec_{}'.format(i) for i in range(1, 51)]

features_c = df_train[feature_list_c].to_numpy()
target_c = df_train['isonlist_t+1'].to_numpy()

valid_features_c = df_valid[feature_list_c].to_numpy()
valid_target_c = df_valid['isonlist_t+1'].to_numpy()

lr = LogisticRegression(n_jobs=-1).fit(features_c, target_c)
hat_valid_target_c = lr.predict(valid_features_c)
prob_valid_target_c = lr.predict_proba(valid_features_c)
# keep only the second predict_proba column for the AUC
prob_valid_target_c = [p[1] for p in prob_valid_target_c]

acc_c = accuracy_score(valid_target_c, hat_valid_target_c)
auc_c = roc_auc_score(valid_target_c, prob_valid_target_c)

# label the metrics with this model's suffix so the output is not mistaken for model A
print('acc_c: ', acc_c)
print('auc_c: ', auc_c)

plot_roc_curve(lr, valid_features_c, valid_target_c)
plt.savefig('roc_c.png')

precision, recall, f1, support = precision_recall_fscore_support(valid_target_c, hat_valid_target_c, average='binary')
print('precision:{}, recall:{}, f1:{}, support:{}'.format(precision, recall, f1, support))

In [None]:
# Model D: linear regression predicting rank_t+1 from the 50 wordvec columns
# and rank_t; scored by mean absolute error on the validation rows.
feature_list_d = ['wordvec_{}'.format(i) for i in range(1, 51)] + ['rank_t']

features_d = df_train[feature_list_d].to_numpy()
target_d = df_train['rank_t+1'].to_numpy()

valid_features_d = df_valid[feature_list_d].to_numpy()
valid_target_d = df_valid['rank_t+1'].to_numpy()

lr = LinearRegression(n_jobs=-1).fit(features_d, target_d)
hat_valid_target_d = lr.predict(valid_features_d)

mae_d = mean_absolute_error(valid_target_d, hat_valid_target_d)

# label the metric with this model's suffix so the output is not mistaken for another model
print('mae_d: ', mae_d)

In [None]:
# Model E: linear regression predicting rank_t+1 from the five raw frequency
# columns and rank_t; scored by mean absolute error on the validation rows.
feature_list_e = ['freq_t-4', 'freq_t-3', 'freq_t-2', 'freq_t-1', 'freq_t', 'rank_t']

features_e = df_train[feature_list_e].to_numpy()
target_e = df_train['rank_t+1'].to_numpy()

valid_features_e = df_valid[feature_list_e].to_numpy()
valid_target_e = df_valid['rank_t+1'].to_numpy()

lr = LinearRegression(n_jobs=-1).fit(features_e, target_e)
hat_valid_target_e = lr.predict(valid_features_e)

mae_e = mean_absolute_error(valid_target_e, hat_valid_target_e)

# label the metric with this model's suffix so the output is not mistaken for another model
print('mae_e: ', mae_e)

In [None]:
# Model F: linear regression predicting rank_t+1 from rank_t alone; scored by
# mean absolute error on the validation rows.
feature_list_f = ['rank_t']

features_f = df_train[feature_list_f].to_numpy()
target_f = df_train['rank_t+1'].to_numpy()

valid_features_f = df_valid[feature_list_f].to_numpy()
valid_target_f = df_valid['rank_t+1'].to_numpy()

lr = LinearRegression(n_jobs=-1).fit(features_f, target_f)
hat_valid_target_f = lr.predict(valid_features_f)

mae_f = mean_absolute_error(valid_target_f, hat_valid_target_f)

# label the metric with this model's suffix so the output is not mistaken for another model
print('mae_f: ', mae_f)