# Metadata

```yaml
Course:    DS 5001 
Module:    09 Lab
Topic:     Using SVD
Author:    R.C. Alvarado
Date:      28 March 2023 (revised)
```

**Purpose:** We create word vectors by applying a singular value decomposition to a pointwise mutual information word-word matrix. 

# Configuration

In [7]:
import configparser
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [8]:
data_prefix = 'novels'
data_in = f"{data_home}/{data_prefix}"

In [9]:
OHCO = ['genre', 'author', 'book', 'chapter', 'para_num', 'sent_num', 'token_num']
BAG = OHCO[2:5] # Paragraphs

# Word Embedding
window = 3

# Libraries

In [10]:
import pandas as pd
import numpy as np
import scipy as sp

# Process

In [11]:
TOKENS = pd.read_csv(f'{data_in}/{data_prefix}-TOKENS.csv')

In [12]:
TOKENS

Unnamed: 0,book,chapter,para_num,sent_num,token_num,pos,term_str,term_id
0,secretadversary,1,0,1,0,DT,the,24127
1,secretadversary,1,0,1,1,NNP,young,27354
2,secretadversary,1,0,1,2,NNP,adventurers,399
3,secretadversary,1,0,1,3,NNP,ltd,14406
4,secretadversary,1,1,0,0,JJ,tommy,24529
...,...,...,...,...,...,...,...,...
1500412,baskervilles,11,114,1,7,RBR,more,15586
1500413,baskervilles,11,114,1,8,JJ,comfortable,4529
1500414,baskervilles,11,114,1,9,IN,outside,16771
1500415,baskervilles,11,114,1,10,IN,than,24112


In [13]:
PARAS = TOKENS.groupby(BAG)[BAG + ['term_str']]\
    .apply(lambda x: x.term_str.tolist())\
    .reset_index(drop=True)

In [14]:
PARAS.head()

0                            [a, scandal, in, bohemia]
1                                                  [i]
2    [to, sherlock, holmes, she, is, always, the, w...
3    [i, had, seen, little, of, holmes, lately, my,...
4    [one, night, it, was, on, the, twentieth, of, ...
dtype: object

In [15]:
TOKENS.head()

Unnamed: 0,book,chapter,para_num,sent_num,token_num,pos,term_str,term_id
0,secretadversary,1,0,1,0,DT,the,24127
1,secretadversary,1,0,1,1,NNP,young,27354
2,secretadversary,1,0,1,2,NNP,adventurers,399
3,secretadversary,1,0,1,3,NNP,ltd,14406
4,secretadversary,1,1,0,0,JJ,tommy,24529


In [16]:
def get_context_words(x):
    data = []
    id  = x.name
    row = x[0]
    for i in range(len(row)):
        data2 = []
        for j in range(-2,3):
            a = i + j
            if a >= 0 and a < len(row):
                data2.append((j, row[a])) 
        data.append(data2)
    return data

In [17]:
TEST = PARAS.to_frame(0).apply(get_context_words, 1)

In [18]:
TEST

0        [[(0, a), (1, scandal), (2, in)], [(-1, a), (0...
1                                               [[(0, i)]]
2        [[(0, to), (1, sherlock), (2, holmes)], [(-1, ...
3        [[(0, i), (1, had), (2, seen)], [(-1, i), (0, ...
4        [[(0, one), (1, night), (2, it)], [(-1, one), ...
                               ...                        
27336    [[(0, and), (1, now), (2, the)], [(-1, and), (...
27337    [[(0, no), (1, sooner), (2, had)], [(-1, no), ...
27338    [[(0, not), (1, hear), (2, it)], [(-1, not), (...
27339    [[(0, as), (1, if), (2, in)], [(-1, as), (0, i...
27340    [[(0, from), (1, that), (2, chamber)], [(-1, f...
Length: 27341, dtype: object

In [19]:
TEST2 = pd.DataFrame([(i, j, item[0], item[1]) 
     for i, row in enumerate(TEST)
        for j, row2 in enumerate(row)
            for item in row2])

In [20]:
TEST2

Unnamed: 0,0,1,2,3
0,0,0,0,a
1,0,0,1,scandal
2,0,0,2,in
3,0,1,-1,a
4,0,1,0,scandal
...,...,...,...,...
7339002,27340,174,0,of
7339003,27340,174,1,usher
7339004,27340,175,-2,house
7339005,27340,175,-1,of


In [21]:
TEST2.columns = ['bag_id', 'window_id', 'offset', 'term_str']

In [22]:
TEST2

Unnamed: 0,bag_id,window_id,offset,term_str
0,0,0,0,a
1,0,0,1,scandal
2,0,0,2,in
3,0,1,-1,a
4,0,1,0,scandal
...,...,...,...,...
7339002,27340,174,0,of
7339003,27340,174,1,usher
7339004,27340,175,-2,house
7339005,27340,175,-1,of


In [28]:
A = TEST2[TEST2.offset == 0].reset_index(drop=True)
B = TEST2[TEST2.offset != 0].reset_index(drop=True)
skipgrams = A.merge(B, on=['bag_id','window_id'], how='left')\
    .rename(columns={'term_str_x':'target','term_str_y':'probe','offset_y':'dist'})

In [31]:
skipgrams = skipgrams[['target','probe','dist']].dropna().sort_values('target').reset_index(drop=True)
skipgrams['dist'] = skipgrams['dist'].astype(int)

In [32]:
skipgrams.head(10)

Unnamed: 0,target,probe,dist
0,a,scandal,1
1,a,drop,-1
2,a,or,2
3,a,or,2
4,a,will,-2
5,a,feel,-1
6,a,bit,1
7,a,of,2
8,a,in,-2
9,a,such,-1


# Get Unigram Probabilities

We have already computed these in the vocab table.

# Import vocab table

In [36]:
VOCAB = pd.read_csv(f'{data_in}/{data_prefix}-VOCAB.csv')

In [37]:
VOCAB.shape[0]

27397

In [38]:
VOCAB.sort_values('p', ascending=False).head()

Unnamed: 0,term_id,term_str,n,p,port_stem,stop,df,idf,tfidf_sum,tfidf_mean,tfidf_max,pos_max
24127,24127,the,85329,0.05687,the,1,320,0.0,0.0,0.0,0.0,DT
24470,24470,to,45176,0.030109,to,1,320,0.0,0.0,0.0,0.0,TO
862,862,and,44991,0.029986,and,1,320,0.0,0.0,0.0,0.0,CC
16459,16459,of,42638,0.028417,of,1,320,0.0,0.0,0.0,0.0,IN
11947,11947,i,32985,0.021984,i,1,316,0.005463,180.193615,0.563105,3.403384,PRP


# Get $P(x)$

In [39]:
p_x = VOCAB[['term_str', 'p']].reset_index().set_index('term_str')['p']

In [40]:
p_x.sort_values(ascending=False).head()

term_str
the    0.056870
to     0.030109
and    0.029986
of     0.028417
i      0.021984
Name: p, dtype: float64

# Compute Normalized PMI for Skipgrams

**PMI**

$log \dfrac{P(x,y)}{P(x)P(y)}$

**NMPI**

$\dfrac{log\dfrac{P(x,y)}{P(x)P(y)}}{-log P(x,y)}$

See [G. Bouma 2009, eq. 7](https://pdfs.semanticscholar.org/1521/8d9c029cbb903ae7c729b2c644c24994c201.pdf)

# Create compressed skipgram table

In [41]:
skipgrams2 = skipgrams.value_counts(['target','probe']).to_frame('n').sort_index()

In [31]:
skipgrams2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n
target,probe,Unnamed: 2_level_1
a,a,246
a,aback,1
a,abandon,2
a,abandons,1
a,abated,1
a,abatement,1
a,abbess,4
a,abbey,2
a,abbot,3
a,abc,2


In [42]:
skipgrams.query("target == 'a' & probe == 'a'")

Unnamed: 0,target,probe,dist
274,a,a,-2
313,a,a,2
484,a,a,-2
485,a,a,2
4468,a,a,-2
...,...,...,...
112061,a,a,2
112130,a,a,-2
112131,a,a,2
112283,a,a,2


In [43]:
skipgrams.query("target == 'a' & probe == 'a'").dist.value_counts()

dist
-2    123
 2    123
Name: count, dtype: int64

# Compute $P(x,y)$

In [48]:
N = len(skipgrams)

In [49]:
# N = skipgrams2.n.sum()

In [50]:
N

5838574

**MLE**

No smoothing.

In [59]:
skipgrams2['p_xy'] = skipgrams2.n / N

In [60]:
skipgrams2

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1
a,a,246,4.213358e-05
a,aback,1,1.712747e-07
a,abandon,2,3.425494e-07
a,abandons,1,1.712747e-07
a,abated,1,1.712747e-07
...,...,...,...
ça,y,1,1.712747e-07
émeutes,prefect,1,1.712747e-07
émeutes,serious,1,1.712747e-07
émeutes,several,1,1.712747e-07


# Compute $PMI(x;y)$

In [61]:
skipgrams2['pmi_xy'] = skipgrams2.apply(lambda row: np.log(row.p_xy / (p_x.loc[row.name[0]] * p_x.loc[row.name[1]])), 1)

In [65]:
skipgrams2[skipgrams2.n > 100].sort_values('pmi_xy', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gutenberg,tm,112,1.9e-05,7.680727
tm,gutenberg,112,1.9e-05,7.680727
gutenberg,project,178,3e-05,7.59982
project,gutenberg,178,3e-05,7.59982
tm,project,112,1.9e-05,7.59982
project,tm,112,1.9e-05,7.59982
van,helsing,300,5.1e-05,7.07255
helsing,van,300,5.1e-05,7.07255
madame,cheron,138,2.4e-05,6.589633
cheron,madame,138,2.4e-05,6.589633


**Normalize**

In [66]:
skipgrams2['npmi_xy'] = skipgrams2.pmi_xy / -( np.log(skipgrams2.p_xy) )

In [69]:
skipgrams2[skipgrams2.n > 100].sort_values('npmi_xy', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
project,gutenberg,178,3e-05,7.59982,0.730877
gutenberg,project,178,3e-05,7.59982,0.730877
helsing,van,300,5.1e-05,7.07255,0.71612
van,helsing,300,5.1e-05,7.07255,0.71612
gutenberg,tm,112,1.9e-05,7.680727,0.707152


# Keep only positives

Changed since lab.

In [81]:
skipgrams2.loc[skipgrams2.npmi_xy < 0, 'pnpmi_xy'] = 0
skipgrams2.loc[skipgrams2.npmi_xy >= 0, 'pnpmi_xy'] = skipgrams2.npmi_xy

In [82]:
skipgrams2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,pmi_xy,npmi_xy,pnpmi_xy
target,probe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
a,a,246,4.213358e-05,-2.149792,-0.213386,0.0
a,aback,1,1.712747e-07,0.406469,0.026089,0.026089
a,abandon,2,3.425494e-07,-0.487349,-0.032737,0.0
a,abandons,1,1.712747e-07,1.910546,0.122628,0.122628
a,abated,1,1.712747e-07,0.118787,0.007624,0.007624


# Create PNPMI Matrix

In [83]:
SGM = skipgrams2.pnpmi_xy.unstack().fillna(0)

In [84]:
SGM.head()

probe,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,0.0,0.026089,0.0,0.0,0.0,0.0,0.122628,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aback,0.026089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SVD

In [93]:
from scipy import sparse
import scipy.sparse.linalg as linalg

In [94]:
sparse = sparse.csr_matrix(SGM.values)

In [95]:
SVD = linalg.svds(sparse, k=256)

In [96]:
U, S, V = SVD

In [97]:
U.shape, S.shape, V.shape

((27378, 256), (256,), (256, 27378))

In [98]:
word_vecs = U + V.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs * word_vecs, axis=1, keepdims=True))

In [99]:
WE = pd.DataFrame(word_vecs_norm, index=SGM.index)
WE.index.name = 'word_str'

In [100]:
WE.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,-1.6486e-15,0.002683,0.005754,-2.899529e-15,-0.027072,-0.003568,-2.215347e-15,2.135073e-15,-8.133611e-16,0.004005,...,0.186137,-0.011411,-0.33724,0.044213,-7.289362e-15,0.257007,-0.144529,-0.203741,3.91237e-16,-0.123944
aback,-1.137488e-14,-0.023644,0.068729,-3.754992e-14,0.012601,0.09784,-1.365429e-13,3.447292e-14,2.975146e-14,0.129813,...,-0.111089,-0.056426,-0.141189,0.063454,-8.407949e-15,0.035187,-0.001423,0.037886,-8.871224000000001e-17,-0.04964
abaft,-3.620604e-15,-0.059477,0.017736,2.186513e-14,-0.013989,0.058951,-8.670994e-14,3.605224e-15,-1.833681e-14,-0.043857,...,0.043995,-0.028549,0.072132,-0.011558,1.504227e-15,0.070865,-0.01998,-0.069411,3.783639e-16,-0.024004
abandon,-9.979398e-15,-0.196695,-0.046254,1.296956e-13,0.019419,0.126746,-1.744106e-13,1.819485e-14,-2.478065e-15,-0.012258,...,0.013103,0.201316,0.108387,-0.042111,4.260318e-15,-0.032183,-0.077077,0.15536,-9.216447e-16,-0.118835
abandoned,5.598131e-15,0.065519,-0.114384,2.247151e-14,-0.008122,-0.065323,9.454621e-14,-1.214409e-14,1.83772e-14,0.049477,...,-0.048621,0.029533,0.008425,-0.042332,4.206003e-15,-0.047312,-0.056922,0.11167,1.579719e-16,-0.119828


In [101]:
def word_sims(word, n=10):
    try:
        sims = SGM.loc[word].sort_values(ascending=False).head(n).reset_index().values
        return sims
    except KeyError as e:
        print('Word "{}" not in vocabulary.'.format(word))
        return None

In [102]:
print(word_sims('happy'))

[['transit' 0.45355928441921106]
 ['anniversary' 0.45355928441921106]
 ['prosperous' 0.413127131611673]
 ['supremely' 0.4090697268137657]
 ['swain' 0.38155540016356587]
 ['compleatly' 0.338555446335461]
 ['prospero' 0.338555446335461]
 ['bygone' 0.32866130589539844]
 ['thankfulness' 0.32866130589539844]
 ['dauntless' 0.3200906116028747]]


In [105]:
def word_sim_report(word):
    sims = word_sims(word)
    for sim_word, score in sims:
        context = ' '.join(skipgrams2.loc[sim_word].index.values.tolist()[:5])
        print("{} ({}) {}".format(sim_word.upper(), score, context))
        print('-'*80)

In [107]:
word_sim_report('woman')

PROSING (0.4188501650732945) of old this woman
--------------------------------------------------------------------------------
UNMENTIONABLE (0.4188501650732945) presence some stood woman
--------------------------------------------------------------------------------
REFORMED (0.4188501650732945) a be but woman
--------------------------------------------------------------------------------
SHOD (0.4188501650732945) a elderly slip woman
--------------------------------------------------------------------------------
JACKONET (0.4188501650732945) is or the woman
--------------------------------------------------------------------------------
GRABS (0.4188501650732945) at her married woman
--------------------------------------------------------------------------------
SILHOUETTED (0.4188501650732945) against the was woman
--------------------------------------------------------------------------------
LAUNDRY (0.4111158023726) and as at brought folded
---------------------------------

In [108]:
word_sim_report('man')

LEGGED (0.3405039403413688) a alone an and are
--------------------------------------------------------------------------------
CHESTED (0.32646954828747865) deep man short stood
--------------------------------------------------------------------------------
PAMELA (0.32646954828747865) mackenzies man richardsons tatler
--------------------------------------------------------------------------------
TALLISH (0.32646954828747865) a gaslight man walking
--------------------------------------------------------------------------------
COARSELY (0.32646954828747865) as clad man sized
--------------------------------------------------------------------------------
CLOYS (0.32646954828747865) man only possession which
--------------------------------------------------------------------------------
RECOGNIZES (0.32646954828747865) each his man neighbor
--------------------------------------------------------------------------------
BOLSHEVISM (0.32646954828747865) behind in man russia
-------

In [109]:
word_sim_report('young')

ADVENTURERS (0.4218204316609113) a after all and as
--------------------------------------------------------------------------------
LTD (0.416433825110785) adventurers downwards is responded that
--------------------------------------------------------------------------------
CRATCHITS (0.40303898373373426) about again and as at
--------------------------------------------------------------------------------
BERESFORDS (0.39704423413499956) ah being in letter to
--------------------------------------------------------------------------------
BRIGHAM (0.39704423413499956) great has himself our religion
--------------------------------------------------------------------------------
GENTLEMANLIKE (0.3793799118085404) a man very young
--------------------------------------------------------------------------------
MELCHIOR (0.3793799118085404) accepted bascos offer young
--------------------------------------------------------------------------------
RECLUSE (0.3793799118085404) sensible

# Define some semantic functions

Added after lecture.

In [110]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [111]:
def get_word_vector(term_str):
    """Get a numpy array from the glove matrix and shape for input into cosine function"""
    return SGM.loc[term_str].values.reshape(-1, 1).T

def get_nearest_vector(wv, method='cosine', n=1):
    """Get the nearest word vectors to a given word vector"""
    if method == 'cosine':
        sims = cosine_similarity(SGM.values, wv)
    elif method == 'euclidean':
        eds = euclidean_distances(SGM.values, wv)
        sims = 1 - (eds/eds.max())
    else:
        print('Invalid method {}; defaulting to cosine.'.format(method))
        sims = cosine_similarity(SGM.values, wv)
    return pd.DataFrame(sims, index=SGM.index, columns=['score']).sort_values('score',ascending=False).head(n+1).iloc[1:]

def get_sims(term_str, method='cosine', n=10):
    """Get the top n words for a given word based on cosine similarity"""
    wv = get_word_vector(term_str)
    sims =  get_nearest_vector(wv, method=method, n=n) 
    return sims

def get_analogy(a, b, c, method='cosine'):
    """Infer missing analogical term"""
    print()
    try:
        A = get_word_vector(a)
        B = get_word_vector(b)
        C = get_word_vector(c)
        D = np.add(np.subtract(B, A), C)
        X = get_nearest_vector(C, method=method, n=1)
        return X.iloc[0].name
    except ValueError as e:
        print(e)
        return None

In [112]:
get_nearest_vector(get_word_vector('woman'),  n=10)

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
man,0.137132
gentleman,0.121738
girl,0.118498
fellow,0.09908
enough,0.09755
lady,0.096753
women,0.092656
creature,0.088015
friend,0.087107
very,0.086467


In [118]:
def get_opposite(a, b, method='cosine'):
    A = get_word_vector(a)
    B = get_word_vector(b)
    C = np.subtract(A, B)
    X = get_nearest_vector(C, n=1, method=method)
    return X
#     return X.iloc[0].name

In [114]:
get_sims('woman')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
man,0.137132
gentleman,0.121738
girl,0.118498
fellow,0.09908
enough,0.09755
lady,0.096753
women,0.092656
creature,0.088015
friend,0.087107
very,0.086467


In [115]:
test = get_nearest_vector(get_word_vector('king'), n=10)

In [116]:
test

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
versus,0.171022
wargrave,0.168119
rents,0.143497
longitudinal,0.129109
smollet,0.127084
felstein,0.115622
intuitions,0.11444
litre,0.113686
hanover,0.110776
disavowed,0.108533


In [117]:
get_sims('love')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
loved,0.109465
affection,0.104016
your,0.098915
friendship,0.085879
tenderness,0.085541
esteem,0.08511
pity,0.081409
florentine,0.080394
her,0.079545
valancourt,0.079072


In [72]:
get_opposite('man','beard')

Unnamed: 0_level_0,score
word_str,Unnamed: 1_level_1
too,0.13037


In [119]:
get_analogy('man','boy','girl')




'woman'

In [65]:
get_analogy('male', 'king', 'female')




'garbed'

In [66]:
SGM

probe,a,aback,abaft,abandon,abandoned,abandoning,abandons,abasement,abashed,abate,...,zoöphagy,zufalle,zum,zuniga,zusammen,à,æt,ætat,ça,émeutes
word_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,-0.213386,0.026089,0.0,-0.032737,0.0,0.0,0.122628,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aback,0.026089,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abaft,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandon,-0.032737,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abandoned,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
à,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
æt,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ætat,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ça,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
