In [None]:
import gensim, emoji
import pandas as pd
from tqdm import tqdm
from MulticoreTSNE import MulticoreTSNE as TSNE #!conda install -c conda-forge multicore-tsne -y
from matplotlib import pyplot as plt
import _lookup_tables
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
np = pd.np
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

# NRC lexicon exactly as stored on disk: has columns 'zh', 'en', 'Sad', etc.
nrc_df = pd.read_csv('nrc_selected.csv')
# Keys of emoji.UNICODE_EMOJI as a Series whose index and values are both those keys.
all_emjs = pd.Series(emoji.UNICODE_EMOJI).index.to_series()

# Select Model

In [None]:
# Per-corpus settings: the LIWC file, the NRC column used as the lexicon index,
# and the other-language NRC column that must be dropped.
CORPUS_SETTINGS = {
    'Weibo': {'liwc_fpath': 'liwc_cn.csv', 'nrc_index_col': 'zh', 'nrc_drop_cols': ['en']},
    'TwtUs': {'liwc_fpath': 'liwc_en.csv', 'nrc_index_col': 'en', 'nrc_drop_cols': ['zh']},
}

Model = gensim.models.FastText
# Active corpus. The previous version of this cell ran the Weibo settings and
# then the TwtUs settings back to back; the second pass failed because the 'en'
# column had already been dropped. Set this to 'Weibo' and re-run the notebook
# to produce the Weibo tables (later cells read the tables of both corpora).
corpus_name = 'TwtUs'
num_epochs = 5

_corpus_settings = CORPUS_SETTINGS[corpus_name]
liwc_fpath = _corpus_settings['liwc_fpath']
# Rebuild the lexicon from the CSV instead of transforming `nrc_df` in place,
# so re-running this cell (or switching corpus) always starts from the raw table.
nrc_df = (pd.read_csv('nrc_selected.csv')
          .drop_duplicates(_corpus_settings['nrc_index_col'])
          .drop(_corpus_settings['nrc_drop_cols'], axis=1)
          .set_index(_corpus_settings['nrc_index_col'])
          .astype(bool))

# --------
model_fpath  = f'gensimModels/{corpus_name}2014{Model.__name__}/0'
model        = Model.load(model_fpath)

# Make t-SNE Maps

In [None]:
## Extract Emoji / NRC Vectors and Emoji Counts from Each Epoch's Model (2D compression happens under "Plot t-SNE maps" below)

# Switches for the per-epoch extraction loop in this cell.
IF_WORK_ON_NRC      = True  # build and save the NRC token-vector and category-vector tables
IF_WORK_ON_EMJ      = True  # build and save the emoji vector tables and the emoji count table

def getTokenCnt(token):
    """Return the corpus count of `token` in the currently loaded `model`.

    Looks the token up in `model.wv.vocab` and returns the entry's `count`.
    Returns None when the token is not in the vocabulary.

    Only the "token missing" case maps to None. An AttributeError caused by the
    model itself (no `wv`, no `vocab`, entry without `count`) now propagates
    instead of being swallowed, so an incompatible model cannot silently yield
    an empty count table.
    """
    vocab_entry = model.wv.vocab.get(token)
    if vocab_entry is None:
        return None
    return vocab_entry.count
    
# Per-epoch emoji counts, keyed by epoch number; only collected when the emoji switch is on.
if IF_WORK_ON_EMJ: emj_cnts = dict()

# For every saved epoch of the selected corpus/model: load the model, build a
# token-indexed table of its vectors, and write the requested lookup tables to data.hdf.
for epoch in range(num_epochs): #with tqdm(range(num_epochs), desc='Epochs') as epochs:
    print(f'Loading Model {corpus_name}2014{Model.__name__}/{epoch}...')
    # NOTE: rebinds the global `model`, which getTokenCnt and the helper `_` below read.
    model   = Model.load(f'gensimModels/{corpus_name}2014{Model.__name__}/{epoch}')
    # token -> `index` attribute of its vocab entry (presumably its row in model.wv.vectors -- TODO confirm).
    new_index = pd.Series({k: v.index for k, v in model.wv.vocab.items()})
    vecs_df = pd.DataFrame(model.wv.vectors)
    # Reorder the rows by those positions, then relabel the rows with the tokens themselves.
    vecs_df = vecs_df.reindex(new_index)
    vecs_df.index = new_index.index
    print(f'There are {len(vecs_df)} tokens captured in this corpora.')
    if IF_WORK_ON_NRC:
        print('Querying model for all tokens in NRC...')
        # Make nrc categorical vectors:
        # `_` returns the model's vector for a token, or None when the lookup raises KeyError.
        def _(tkn):
            try: return model.wv.word_vec(tkn)
            except KeyError: return None
        # One row per NRC token that has a vector (tokens mapped to None are dropped), one column per dimension.
        nrc_tkns_vecs_df = nrc_df.index.to_series().apply(_).dropna().apply(pd.Series)
        # For each NRC column: average the vectors of the tokens flagged True in it; after .T each row is one category.
        nrc_ctgy_vecs_df = nrc_df.apply(lambda c: nrc_tkns_vecs_df.reindex(c[c].index).mean()).T
        print('Saving nrc-related vector lookup tables...')
        nrc_tkns_vecs_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_nrc_tkns_vecs_df')
        nrc_ctgy_vecs_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_nrc_ctgy_vecs_df')
    if IF_WORK_ON_EMJ:
        # Keep only the rows whose token is a key of emoji.UNICODE_EMOJI and that have no missing values.
        emj_vecs_df= vecs_df.reindex(emoji.UNICODE_EMOJI).dropna() #getVectorsFromModel(model, tokens=emoji.UNICODE_EMOJI)  # Emoji vectors and coordinates:
        emj_vecs_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_emj_vecs_df') # Save:
        # Counts for this epoch; emojis for which getTokenCnt returns None are dropped.
        emj_cnts[epoch] = all_emjs.apply(getTokenCnt).dropna().astype(int)
if IF_WORK_ON_EMJ:
    # After the transpose: one row per epoch, one column per emoji.
    emj_cnts_df = pd.DataFrame(emj_cnts).T
    emj_cnts_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_emj_cnts_df')

## Plot t-SNE maps

corpora = {'TwtUs', 'Weibo'}

# Emojis that have a count column in each corpus' saved count table.
emjs = dict(
    (corpus_name, set(pd.read_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_emj_cnts_df').columns))
    for corpus_name in corpora
)
# Emojis present in every corpus.
common_emjs = set.intersection(*emjs.values())

# Stack both count tables, keep only the emoji columns without missing values,
# and order the emojis by their total count, largest first.
_stacked_emj_cnts_df = pd.concat(
    [pd.read_hdf('data.hdf', f'TwtUs_{Model.__name__}_emj_cnts_df'),
     pd.read_hdf('data.hdf', f'Weibo_{Model.__name__}_emj_cnts_df')],
    sort=True,
)
_total_emj_cnts = _stacked_emj_cnts_df.dropna(axis=1).sum()
emjs_sorted_by_occurence = _total_emj_cnts.sort_values(ascending=False).index.to_series()

def plotEmojis(emj_coords_df, use_approx=False):
    """Draw every emoji's image at its 2-D coordinates and return the Axes.

    Parameters
    ----------
    emj_coords_df : DataFrame
        Indexed by emoji; the `x` and `y` columns give each emoji's position.
    use_approx : bool, default False
        When True, an emoji without an image is retried with the last
        '-'-separated component of its code-point string removed.

    Returns
    -------
    The matplotlib Axes that the images were added to.

    Emojis that still have no image (or have a missing coordinate) are skipped.
    Relies on `_lookup_tables.emj2unicode` and `_lookup_tables.unicode2img`
    (presumably Series keyed by emoji and by code-point string -- TODO confirm).
    """
    fig, ax = plt.subplots(figsize=(50, 50))
    emjs_inModel = emj_coords_df.index.to_series()
    # Emoji -> code-point string, normalised to '-' separators without leading zeros.
    emjs_inModel_asUnicode = _lookup_tables.emj2unicode[emjs_inModel].str.replace('_', '-').str.lstrip('0')
    assert emjs_inModel_asUnicode.notna().all()
    assert emjs_inModel_asUnicode.nunique()==len(emjs_inModel_asUnicode)==emjs_inModel.nunique()==len(emjs_inModel)
    # Code-point string -> image, then relabel the rows with the emojis themselves.
    emjs_inModel_asImg = _lookup_tables.unicode2img[emjs_inModel_asUnicode]
    emjs_inModel_asImg.index = emjs_inModel
    if use_approx:
        # The image and code-point Series are in the same row order as
        # emjs_inModel, so one positional mask selects the same emojis in each.
        has_no_img = emjs_inModel_asImg.isna().values
        if has_no_img.any():
            unlinked = pd.Series(emjs_inModel.values[has_no_img])
            # Truncate the code-point STRING (not the emoji character, which
            # never contains '-') to get the fallback lookup key.
            unlinked_asUnicode = pd.Series(emjs_inModel_asUnicode.values[has_no_img])
            fallback_keys = unlinked_asUnicode.apply(lambda s: s[:s.rfind('-')] if s.rfind('-')>-1 else None)
            unlinked_rematched = _lookup_tables.unicode2img.reindex(fallback_keys)
            unlinked_rematched.index = unlinked
            unlinked_rematched = unlinked_rematched.dropna()
            emjs_inModel_asImg[unlinked_rematched.index] = unlinked_rematched
    for i, row in emj_coords_df.assign(img=emjs_inModel_asImg).dropna().iterrows():
        ax.add_artist(AnnotationBbox(OffsetImage(row.img), (row.x, row.y), xycoords='data', boxcoords="offset points", frameon=False))
    # Artists added with add_artist do not update the data limits, so set them from the coordinates.
    ax.update_datalim(emj_coords_df.values)
    ax.autoscale()
    return ax

def compressWordVectors(emj_vecs_df, perplexity=30):
    """Reduce the row vectors of `emj_vecs_df` with t-SNE.

    Returns a DataFrame with the same index as `emj_vecs_df`, whose output
    columns 0 and 1 are renamed to 'x' and 'y'. `perplexity` is passed
    straight to the t-SNE estimator; the seed is fixed at 42.
    """
    print('Compressing word vectors to coordinates in 2D via t-SNE...')
    reducer = TSNE(n_jobs=24, random_state=42, n_iter=10000, perplexity=perplexity)
    coords = reducer.fit_transform(emj_vecs_df.values)
    # Put the row labels back, then name the two coordinate columns.
    coords_df = pd.DataFrame(coords, index=emj_vecs_df.index)
    return coords_df.rename(columns={0: 'x', 1: 'y'})

# Load model. We only have to load one epoch.
model = Model.load(f'gensimModels/{corpus_name}2014{Model.__name__}/0')

# Epoch-0 emoji vectors, restricted to the emojis shared by both corpora.
emj_vecs_df = pd.read_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch0_emj_vecs_df')
emj_vecs_df = emj_vecs_df.reindex(common_emjs)

# Get coordinates and keep them for later use.
emj_coords_df = compressWordVectors(emj_vecs_df, perplexity=15)
emj_coords_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_emj_coords_df')

# Plot:
print('Plotting...')
ax = plotEmojis(emj_coords_df)
ax.set_title(f'{corpus_name} {Model.__name__} t-SNE Map')
plt.savefig(f'tSNEs/{corpus_name}_{Model.__name__}_tSNE.png')

# Compute Emoji-to-Emotion Similarities (i.e., Vectorial Projections)

In [None]:
## Customizable parameters:

# Switches read by the projection loop further down in this cell.
IF_REMOVE_AVG_PROJ   = False  # if True, subtract the projection of the mean emoji vector from every emoji's projection
IF_SAVE_TO_HDF       = True   # if True, write the projection tables to data.hdf
IF_AGG_ACROSS_EPOCHS = True   # if True, also compute each emoji's mean and std of projections across epochs

## Utilities: 

from sklearn import preprocessing # l2-normalize the samples (rows). 
def getCommonItems(a, b):
    """Return the items found in both `a` and `b` as a list (duplicates removed, order unspecified)."""
    return list(set(a).intersection(b))
from sklearn.metrics.pairwise import cosine_similarity
def project(tkns_vecs_df, axis_vecs_df, remove_common_components=True, remove_common_components_by_group=False, if_normalize_axis=True, if_normalize_tkns=True):
    """Cosine similarity of every token vector against every axis vector.

    Parameters
    ----------
    tkns_vecs_df, axis_vecs_df : DataFrames with one vector per row and the
        same number of columns.
    remove_common_components : subtract the mean of all token and axis vectors
        pooled together from both tables.
    remove_common_components_by_group : subtract each table's own mean from it.
    if_normalize_axis, if_normalize_tkns : l2-normalise the rows of the
        respective table; only applied when `remove_common_components_by_group`
        is False.

    Returns
    -------
    DataFrame with one row per token and one column per axis.
    """
    n_tkn_dims, n_axis_dims = len(tkns_vecs_df.columns), len(axis_vecs_df.columns)
    assert n_tkn_dims==n_axis_dims, 'Tokens and Axes must be in identical dimentionality.'

    if remove_common_components:
        # Mean over both tables pooled together.
        pooled_mean = pd.concat([tkns_vecs_df, axis_vecs_df], axis=0).mean()
        tkns_vecs_df, axis_vecs_df = tkns_vecs_df - pooled_mean, axis_vecs_df - pooled_mean

    if remove_common_components_by_group:
        # Centre each table on its own mean.
        tkns_vecs_df = tkns_vecs_df - tkns_vecs_df.mean()
        axis_vecs_df = axis_vecs_df - axis_vecs_df.mean()
    else:
        def _unit_rows(vecs_df):
            # l2-normalise every row, keeping the row labels.
            return pd.DataFrame(preprocessing.normalize(vecs_df, norm='l2'), index=vecs_df.index)
        if if_normalize_axis:
            axis_vecs_df = _unit_rows(axis_vecs_df)
        if if_normalize_tkns:
            tkns_vecs_df = _unit_rows(tkns_vecs_df)

    # Calculate projection via cosine similarity:
    similarities = cosine_similarity(tkns_vecs_df, axis_vecs_df)
    return pd.DataFrame(similarities, columns=axis_vecs_df.index, index=tkns_vecs_df.index)

## Actual work:

# For every corpus: project each epoch's emoji vectors onto that epoch's NRC
# category vectors, optionally save the per-epoch table, and optionally
# aggregate the projections across epochs.
# NOTE: this loop rebinds the global `corpus_name`.
for corpus_name in corpora:
    if IF_AGG_ACROSS_EPOCHS: emj2nrcCtgy_proj_dfs = []
    with tqdm(range(num_epochs), desc=f'Epochs in {corpus_name} {Model.__name__}') as epochs:
        for epoch in epochs:
            emj_vecs_df       = pd.read_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_emj_vecs_df')
            # Append the mean emoji vector as an extra row labelled 'avg' so it gets projected too.
            if IF_REMOVE_AVG_PROJ: emj_vecs_df.loc['avg'] = emj_vecs_df.mean()
            nrc_ctgy_vecs_df  = pd.read_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_nrc_ctgy_vecs_df')
            # Each table is centred on its own mean inside project(); no pooled centring.
            emj2nrcCtgy_proj_df = project(tkns_vecs_df=emj_vecs_df, axis_vecs_df=nrc_ctgy_vecs_df, remove_common_components=False, remove_common_components_by_group=True)
            if IF_REMOVE_AVG_PROJ:
                # Subtract the 'avg' row's projection from every row, then remove that helper row.
                emj2nrcCtgy_proj_df -= emj2nrcCtgy_proj_df.loc['avg']
                emj2nrcCtgy_proj_df.drop('avg', inplace=True)
            if IF_SAVE_TO_HDF: emj2nrcCtgy_proj_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_epoch{epoch}_emj2nrcCtgy_proj_df')
            if IF_AGG_ACROSS_EPOCHS: emj2nrcCtgy_proj_dfs.append(emj2nrcCtgy_proj_df)
    if IF_AGG_ACROSS_EPOCHS:
        # Stack the per-epoch tables and group the rows by emoji (the former index, renamed to 'emj').
        g = pd.concat(emj2nrcCtgy_proj_dfs).reset_index().rename(columns={'index': 'emj'}).groupby('emj')
        emj2nrcCtgy_proj_avg_df = g.mean()
        emj2nrcCtgy_proj_std_df = g.std()
        # Filter for stable emojis only: .stack()[_<_.describe()['75%']].unstack(1)
        #_ = emj2nrcCtgy_proj_std_df.stack()
        if IF_SAVE_TO_HDF: # Save to HDF:
            emj2nrcCtgy_proj_avg_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_emj2nrcCtgy_proj_avg_df')
            emj2nrcCtgy_proj_std_df.to_hdf('data.hdf', f'{corpus_name}_{Model.__name__}_emj2nrcCtgy_proj_std_df')

# Plot Radar Plots

In [None]:
# Prepare utilities:
import json
# Read the page skeleton; the context manager closes the file handle right away.
# UTF-8 is assumed because the page is later filled with emoji text -- TODO confirm the template's encoding.
with open('RadarChartsTemplate.html', 'r', encoding='utf-8') as template_file:
    html_template = template_file.read()
# prepare Unicode Category look-up table: strip every non-word character from the category names.
# regex=True is explicit because r'\W' must be treated as a pattern whatever the pandas default is.
emj2ucCtgy = _lookup_tables.emj_ucCtgy.str.replace(r'\W', '', regex=True)
# Turn a Series into a list of {'axis': <label>, 'value': <value>} dicts, one per entry.
row2dict = lambda row: list(row.rename('value').reset_index().rename(columns={'index': 'axis'}).T.to_dict().values())
# Description per emoji: drop the first and last character of the name, turn '_' into spaces, title-case.
emj2desc = pd.Series(emoji.UNICODE_EMOJI).str[1:-1].str.replace('_', ' ').str.title()

# prepare numerical data:
def fixRangeForRadarPlot(proj_df, IF_FILL_TO_ONE = True, IF_STANDARIZE = False):
    """Map projection values onto the 0-100 scale used by the radar charts.

    Parameters
    ----------
    proj_df : DataFrame of similarities (theoretical range -1 to +1). It is
        NOT modified; a new DataFrame is returned.
    IF_FILL_TO_ONE : if True, zoom the values (keeping 0 at 0) so that the
        largest absolute value becomes 1. Skipped when that largest absolute
        value is 0 or missing, which would otherwise turn every value into NaN.
    IF_STANDARIZE : if True, min-max scale each column to 0-1 first.

    Returns
    -------
    DataFrame where -1 maps to 0, 0 maps to 50 and +1 maps to 100.
    """
    scaled_df = proj_df
    if IF_STANDARIZE:
        standardize = lambda s: (s-s.min())/(s.max()-s.min())
        scaled_df = scaled_df.apply(standardize)
    if IF_FILL_TO_ONE:
        peak = scaled_df.abs().max().max()
        # `peak > 0` is False for both 0 and NaN, so degenerate frames are left unscaled.
        if peak > 0:
            scaled_df = scaled_df * (1/peak)
    # Shift -1~+1 to 0~2, compress to 0~1, then express as a percentage.
    return (scaled_df + 1) / 2 * 100
# Epoch-averaged projections per platform, restricted to the shared emojis
# and rescaled for the radar charts.
weibo_projs_df = fixRangeForRadarPlot(
    pd.read_hdf('data.hdf', f'Weibo_{Model.__name__}_emj2nrcCtgy_proj_avg_df').reindex(common_emjs)
)
twitr_projs_df = fixRangeForRadarPlot(
    pd.read_hdf('data.hdf', f'TwtUs_{Model.__name__}_emj2nrcCtgy_proj_avg_df').reindex(common_emjs)
)

# Prepare list of Emojis to plot:
order = 'dissimilarity'#'occurence'
# First five index labels of a frame, as a Series with a fresh 0..n index.
takeTop = lambda df: df.head(5).index.to_series().reset_index(drop=True)
# Unicode categories that are left out of the radar charts.
ctgys_to_skip = {'Flags', 'Activities', 'Symbols'}

if order=='dissimilarity':
    # Rank emojis by the mean absolute Weibo-vs-Twitter difference across emotions, largest first.
    emj_ranking_df = pd.DataFrame({'diff': (weibo_projs_df - twitr_projs_df).abs().mean(axis=1),
                                   'ctgy': _lookup_tables.emj_ucCtgy}).dropna().sort_values('diff', ascending=False)
elif order=='occurence':
    # emjs_sorted_by_occurence is already ordered by total count, largest first.
    emj_ranking_df = emjs_sorted_by_occurence.to_frame().assign(ctgy=_lookup_tables.emj_ucCtgy)
else:
    raise ValueError(f"Unknown order {order!r}; expected 'dissimilarity' or 'occurence'.")

# Walk the categories explicitly instead of chaining groupby.apply(...).drop(...).tolist():
# groupby.apply returns a wide DataFrame (no .tolist()) when every category yields
# the same number of emojis, and .drop raises when a skipped category is absent.
emj_to_plot = [emj
               for ctgy, ctgy_df in emj_ranking_df.groupby('ctgy')
               if ctgy not in ctgys_to_skip
               for emj in takeTop(ctgy_df)]

# paint each:
def _radar_entry(token):
    """Build the radar-chart record for one emoji: category, description, emoji, Twitter axes, Weibo axes."""
    twitr_row = twitr_projs_df.loc[token]
    weibo_row = weibo_projs_df.loc[token]
    # Give the Weibo row the same axis labels as the Twitter row.
    weibo_row.index = twitr_row.index
    return [
        emj2ucCtgy[token], # The Unicode Category that this emoji belongs to
        emj2desc[token]  , # The Unicode description of this emoji
        token,  # This very Emoji itself
        {'className': 'Twitter', 'axes': row2dict(twitr_row)},
        {'className': 'Weibo'  , 'axes': row2dict(weibo_row)}]

data_to_plot = [_radar_entry(token) for token in emj_to_plot]
    
# Categories of the emojis being plotted: sanitized form for element ids, original form for headings.
ctgys_used             = emj2ucCtgy[emj_to_plot].drop_duplicates()
ctgys_used_unsanitized = _lookup_tables.emj_ucCtgy[emj_to_plot].drop_duplicates()
# One <div class="row"> per category, concatenated into a single HTML fragment.
ctgy_rows_html = ''.join('<div class="row" id="'+ctgys_used+'"><h1>'+ctgys_used_unsanitized+'</h1></div>')
radar_html = html_template\
    .replace('%%%%%%', json.dumps(data_to_plot, ensure_ascii=False))\
    .replace('^^^^^^', ctgy_rows_html)
    #.replace('&&&&&&', json.dumps(list(twitr_projs_df.index), ensure_ascii=False)))
# The JSON keeps raw emoji characters (ensure_ascii=False), so write as UTF-8 rather than
# the platform default; the context manager flushes and closes the file.
with open('RadarCharts.html', 'w', encoding='utf-8') as html_file:
    html_file.write(radar_html)

# Heatmaps

In [None]:
model_name = 'FastText'
corpora = {'TwtUs', 'Weibo'}

# Emojis that have a count column in each corpus' saved count table, and those shared by all corpora.
emjs = dict(
    (corpus_name, set(pd.read_hdf('data.hdf', f'{corpus_name}_{model_name}_emj_cnts_df').columns))
    for corpus_name in corpora
)
common_emjs = set.intersection(*emjs.values())

# Epoch-averaged emoji-to-emotion projections, restricted to the shared emojis.
weibo_projs_df = pd.read_hdf('data.hdf', f'Weibo_{model_name}_emj2nrcCtgy_proj_avg_df')
weibo_projs_df = weibo_projs_df.reindex(common_emjs)
twitr_projs_df = pd.read_hdf('data.hdf', f'TwtUs_{model_name}_emj2nrcCtgy_proj_avg_df')
twitr_projs_df = twitr_projs_df.reindex(common_emjs)

In [None]:
# Spearman correlation between emotion columns, computed across emojis, per platform.
weibo_projs_corr_df = weibo_projs_df.corr(method='spearman')
twitr_projs_corr_df = twitr_projs_df.corr(method='spearman')
avg_projs_corr_df = (weibo_projs_corr_df + twitr_projs_corr_df) / 2

# Sort the emotions by average correlation (highest first) and apply that order
# to the rows and columns of all three matrices.
emo_order = avg_projs_corr_df.mean().sort_values(ascending=False).index
weibo_projs_corr_df, twitr_projs_corr_df, avg_projs_corr_df = (
    corr_df.loc[emo_order, emo_order]
    for corr_df in (weibo_projs_corr_df, twitr_projs_corr_df, avg_projs_corr_df)
)


from sklearn.metrics.pairwise import cosine_similarity
def computeAvgEmoSimiDf(corpus_name, num_epochs=5):
    """Average pairwise cosine similarity between emotion-category vectors.

    For each epoch, reads `{corpus_name}_{model_name}_epoch{epoch}_nrc_ctgy_vecs_df`
    from data.hdf, centres the category vectors on their mean, and computes
    their pairwise cosine similarity. The per-epoch matrices are then averaged.

    Parameters
    ----------
    corpus_name : corpus prefix of the HDF keys (e.g. 'Weibo', 'TwtUs').
    num_epochs : number of epochs to average over; epochs 0..num_epochs-1 are
        read. Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    Square DataFrame with the categories on both rows and columns.

    Raises
    ------
    ValueError if `num_epochs` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1.')
    simi_dfs = []
    for epoch in range(num_epochs):
        emos_df = pd.read_hdf('data.hdf', f'{corpus_name}_{model_name}_epoch{epoch}_nrc_ctgy_vecs_df')
        centered_df = emos_df-emos_df.mean()
        # Stack to a (category, category)-indexed Series so epochs can be averaged side by side.
        simi_df = pd.DataFrame(cosine_similarity(centered_df), index=emos_df.index, columns=emos_df.index).stack()
        simi_dfs.append(simi_df)
    return pd.concat(simi_dfs, axis=1).mean(axis=1).unstack()
weibo_emo_simi_df = computeAvgEmoSimiDf('Weibo')
twitr_emo_simi_df = computeAvgEmoSimiDf('TwtUs')
avg_emo_simi_df = (weibo_emo_simi_df + twitr_emo_simi_df) / 2

# Reuse the emotion order derived from the correlation matrices so that all panels line up:
#emo_order = avg_emo_simi_df.mean().sort_values(ascending=False).index
weibo_emo_simi_df, twitr_emo_simi_df, avg_emo_simi_df = (
    simi_df.loc[emo_order, emo_order]
    for simi_df in (weibo_emo_simi_df, twitr_emo_simi_df, avg_emo_simi_df)
)

In [None]:
# 2x2 grid: top row shows the correlation matrices, bottom row the similarity matrices;
# left column is Weibo, right column is Twitter.
fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2,2, figsize=(11, 8))

# (a) Weibo correlations
sns.heatmap(weibo_projs_corr_df, annot=True, ax=ax11, vmin=-1, vmax=1, cmap='RdBu_r')
ax11.set_title('Pairwise Correlation in Weibo (China)')
ax11.set_xticks([])
ax11.set_xlabel('(a)')

# (b) Twitter correlations
sns.heatmap(twitr_projs_corr_df, annot=True, ax=ax12, vmin=-1, vmax=1, cmap='RdBu_r')
ax12.set_title('Pairwise Correlation in Twitter (US)')
ax12.set_xticks([])
ax12.set_yticks([])
ax12.set_xlabel('(b)')
# avg_projs_corr_df is not drawn: the grid has no third column, and the former
# call targeted an undefined `ax13`, which raised NameError.

# (c) Weibo similarities
sns.heatmap(weibo_emo_simi_df.round(2), annot=True, ax=ax21, vmin=-1, vmax=1, cmap='RdBu_r')
ax21.set_title('Pairwise Similarity in Weibo (China)')
ax21.set_xticklabels(ax21.get_xticklabels(), rotation=45)
ax21.set_xlabel('(c)')

# (d) Twitter similarities
sns.heatmap(twitr_emo_simi_df.round(2), annot=True, ax=ax22, vmin=-1, vmax=1, cmap='RdBu_r')
ax22.set_title('Pairwise Similarity in Twitter (US)')
ax22.set_xticklabels(ax22.get_xticklabels(), rotation=45)
ax22.set_yticks([])
ax22.set_xlabel('(d)')

#plt.suptitle('Correlation between Emotions Using Emoji-to-Emotion Projections - All Shared Emojis')
fig.subplots_adjust(hspace=0.2, wspace=0.0)
plt.show()

# Kolmogorov-Smirnov Statistic

In [None]:
from scipy.stats import ks_2samp
# Both tables must list the same emojis in the same order.
assert (weibo_projs_df.index == twitr_projs_df.index).all()

def _ks_for_emotion(emo):
    """Two-sample KS result for one emotion column (Weibo vs. Twitter) as a Series named after the emotion."""
    return pd.Series(ks_2samp(weibo_projs_df[emo], twitr_projs_df[emo]), name=emo)

# One row per emotion after the transpose.
ks_df = pd.concat(list(map(_ks_for_emotion, weibo_projs_df.columns)), axis=1).T
ks_df.columns = ['Kolmogorov-Smirnov Statistic', 'p-Value']
_ks_ranked_df = ks_df.sort_values('Kolmogorov-Smirnov Statistic', ascending=False)
_ks_ranked_df.to_latex('Kolmogorov-Smirnov.tex')

# Violin Plot

In [None]:
# Emotion name -> axis label that also shows the rounded KS statistic D.
y_labels = dict(zip(ks_df.index, ks_df.index+'\n ($D='+ks_df.iloc[:,0].round(3).astype(str)+'$)'))

def _to_long_form(projs_df, platform):
    """Reshape a wide emoji-by-emotion table into (emj, emo, val) rows tagged with the platform name."""
    long_df = projs_df.rename(columns=y_labels).stack().reset_index()
    long_df.columns = ['emj', 'emo', 'val']
    long_df['Platform'] = platform
    return long_df

weibo_projs_longForm_df = _to_long_form(weibo_projs_df, 'Weibo')
twitr_projs_longForm_df = _to_long_form(twitr_projs_df, 'Twitter')
projs_longForm_df = pd.concat([weibo_projs_longForm_df, twitr_projs_longForm_df])

# Plot:
fig, ax = plt.subplots(1,1, figsize=(9, 5))#(5, 10))
sns.violinplot(
    data=projs_longForm_df, ax=ax,
    x='emo', y='val', hue='Platform',
    palette="Set1", split=True, scale="count", bw=.2, inner="quartile",
)#, orient='h')

#plt.title('Emoji-to-Emotion Correlations')
plt.ylabel('Emoji-Emotion Similarity')
plt.xlabel('')#('Similarity')
plt.show()