In [30]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pandas as pd
import pickle
import scipy.spatial as sp
import scipy.stats as ss
import scipy.sparse as ssp

In [2]:
from nltk.corpus import brown

In [3]:
# data pair from Table 1
value1 = ['cord', 'rooster', 'noon', 'fruit', 'autograph', 'automobile', 'mound', 'grin',\
          'asylum', 'asylum', 'graveyard', 'glass', 'boy', 'cushion', 'monk', 'asylum',\
          'coast','grin', 'shore', 'monk', 'boy', 'automobile', 'mound', 'lad', 'forst',\
          'food', 'cemetery', 'shore', 'bird', 'coast', 'furnace', 'crane', 'hill', 'car',\
          'cemetery', 'glass', 'magician', 'crane', 'brother', 'sage', 'oracle', 'bird', 'bird',\
          'food', 'brother', 'asylum', 'furnace', 'magician', 'hill', 'cord', 'glass', 'grin',\
          'serf', 'journey', 'autograph', 'coast', 'forest', 'implement', 'cock', 'boy', 'cushion',\
          'cemetery', 'automobile', 'midday', 'gem']

value2 = ['smile', 'voyage', 'string', 'furnace', 'shore', 'wizard', 'stove', 'implement', 'fruit',\
          'monk', 'madhouse', 'magician', 'rooster', 'jewel', 'slave', 'cemetery', 'forest', 'lad',\
          'woodland', 'oracle', 'sage', 'cushion', 'shore', 'wizard', 'graveyard', 'rooster',\
          'woodland', 'voyage', 'woodland', 'hill', 'implement', 'rooster', 'woodland', 'journey',\
          'mound', 'jewel', 'oracle', 'implement', 'lad', 'wizard', 'sage', 'crane', 'cock', 'fruit',\
          'monk', 'madhouse', 'stove', 'wizard', 'mound', 'string', 'tumbler', 'smile', 'slave',\
          'voyage', 'signature', 'shore', 'woodland', 'tool', 'rooster', 'lad', 'pillow', 'graveyard',\
          'car', 'moon', 'jewel']
similar_human = [0.02, 0.04, 0.04, 0.05, 0.06, 0.11, 0.14, 0.18, 0.19, 0.39, 0.42, 0.44, 0.44, 0.45,\
                 0.57, 0.79, 0.85, 0.88, 0.90, 0.91, 0.96, 0.97, 0.97, 0.99, 1.00, 1.09, 1.18, 1.22,\
                 1.24, 1.26, 1.37, 1.41, 1.48, 1.55, 1.69, 1.78, 1.82, 2.37, 2.41, 2.46, 2.61, 2.63,\
                 2.63, 2.69, 2.74, 3.04, 3.11, 3.21, 3.29, 3.41, 3.45, 3.46, 3.46, 3.58, 3.59, 3.60,\
                 3.65, 3.66, 3.68, 3.82, 3.84, 3.88, 3.92, 3.94, 3.94]

# construct a dataframe reprenseting the data pairs
table1 = {'word1': value1, 'word2':value2, 'similar_human':similar_human}
df_table1 = pd.DataFrame(data=table1)
print(df_table1)

         word1      word2  similar_human
0         cord      smile           0.02
1      rooster     voyage           0.04
2         noon     string           0.04
3        fruit    furnace           0.05
4    autograph      shore           0.06
..         ...        ...            ...
60     cushion     pillow           3.84
61    cemetery  graveyard           3.88
62  automobile        car           3.92
63      midday       moon           3.94
64         gem      jewel           3.94

[65 rows x 3 columns]


In [4]:
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# get brown corpus
b_words = brown.words()

# create tokenizer to remove punctuations
tokenizer = RegexpTokenizer(r'\w+')

# remove stopwords and punctuations
filtered_words = [word.lower() for word in b_words if (word.lower() not in stopwords.words('english')) \
                  & (len(tokenizer.tokenize(word)))]

# count frequency and get the most common 5000
fdist = FreqDist(filtered_words)
top_fivek = fdist.most_common(5000)

# covert top_fivek to dataframe
df_topfivek = pd.DataFrame(top_fivek, columns=['word', 'count'])
print(df_topfivek)

            word  count
0            one   3292
1          would   2714
2           said   1961
3            new   1635
4          could   1601
...          ...    ...
4995      emerge     18
4996     proceed     18
4997  remarkably     18
4998   compelled     18
4999      faster     18

[5000 rows x 2 columns]


In [5]:
# construct an empty df for final result
df_shared = pd.DataFrame(columns=df_table1.columns)

# find the shared pairs
for index, row in df_table1.iterrows():
    if (row['word1'] in df_topfivek.word.values) & (row['word2'] in df_topfivek.word.values):
        df_shared = df_shared.append(row, ignore_index=True)
print(df_shared)

        word1    word2  similar_human
0        noon   string           0.04
1       coast   forest           0.85
2       coast     hill           1.26
3         car  journey           1.55
4        food    fruit           2.69
5       coast    shore           3.60
6  automobile      car           3.92


## Word2vec

In [7]:
# load pretrained word2vec
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [62]:
# add embedding to df_shared
df_shared['word1_embedding'] = df_shared['word1'].apply(lambda x: model[x])
df_shared['word2_embedding'] = df_shared['word2'].apply(lambda x: model[x])
print(df_shared)

        word1    word2  similar_human  \
0        noon   string           0.04   
1       coast   forest           0.85   
2       coast     hill           1.26   
3         car  journey           1.55   
4        food    fruit           2.69   
5       coast    shore           3.60   
6  automobile      car           3.92   

                                     word1_embedding  \
0  [-0.18164062, -0.026123047, -0.20996094, -0.01...   
1  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
2  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
3  [0.13085938, 0.008422852, 0.033447266, -0.0588...   
4  [-0.18164062, 0.16503906, -0.16601562, 0.35742...   
5  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
6  [0.13183594, 0.060546875, 0.0154418945, 0.1347...   

                                     word2_embedding  
0  [-0.042236328, 0.080078125, -0.18652344, -0.07...  
1  [0.33789062, 0.17089844, -0.0028381348, 0.0354...  
2  [0.09472656, 0.24414062, 0.0390625, 0.12158203...  
3 

In [63]:
# calculate cosine distance
for index, row in df_shared.iterrows():
    df_shared.at[index, 'similar_cosine'] = 1 - sp.distance.cosine(row['word1_embedding'], row['word2_embedding'])
print(df_shared)

        word1    word2  similar_human  \
0        noon   string           0.04   
1       coast   forest           0.85   
2       coast     hill           1.26   
3         car  journey           1.55   
4        food    fruit           2.69   
5       coast    shore           3.60   
6  automobile      car           3.92   

                                     word1_embedding  \
0  [-0.18164062, -0.026123047, -0.20996094, -0.01...   
1  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
2  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
3  [0.13085938, 0.008422852, 0.033447266, -0.0588...   
4  [-0.18164062, 0.16503906, -0.16601562, 0.35742...   
5  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
6  [0.13183594, 0.060546875, 0.0154418945, 0.1347...   

                                     word2_embedding  similar_cosine  
0  [-0.042236328, 0.080078125, -0.18652344, -0.07...        0.021655  
1  [0.33789062, 0.17089844, -0.0028381348, 0.0354...        0.236098  
2  [0.094

In [64]:
# faltten the similar_human as it's from 0-4 and similar_cosine is from 0 to 1
df_shared['similar_human'] = df_shared['similar_human']/4
print(df_shared)

        word1    word2  similar_human  \
0        noon   string         0.0100   
1       coast   forest         0.2125   
2       coast     hill         0.3150   
3         car  journey         0.3875   
4        food    fruit         0.6725   
5       coast    shore         0.9000   
6  automobile      car         0.9800   

                                     word1_embedding  \
0  [-0.18164062, -0.026123047, -0.20996094, -0.01...   
1  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
2  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
3  [0.13085938, 0.008422852, 0.033447266, -0.0588...   
4  [-0.18164062, 0.16503906, -0.16601562, 0.35742...   
5  [0.02709961, 0.107910156, -0.31054688, -0.1533...   
6  [0.13183594, 0.060546875, 0.0154418945, 0.1347...   

                                     word2_embedding  similar_cosine  
0  [-0.042236328, 0.080078125, -0.18652344, -0.07...        0.021655  
1  [0.33789062, 0.17089844, -0.0028381348, 0.0354...        0.236098  
2  [0.094

In [68]:
# calculate the pearson correlation
pearson_correlation = ss.pearsonr(df_shared['similar_human'], df_shared['similar_cosine'])[0]
p_value = ss.pearsonr(df_shared['similar_human'], df_shared['similar_cosine'])[1]
print('pearson_correlation : ' + str(pearson_correlation))
print('p-value : ' + str(p_value))

pearson_correlation : 0.9440722068532515
p-value : 0.0013784318897755413


## LSA

In [34]:
# find additional values not in top 5 k
import numpy as np

additionalW = []
for index, row in df_table1.iterrows():
    if (row['word1'] not in df_topfivek.word.values):
        additionalW.append(row['word1'])
    if (row['word2'] not in df_topfivek.word.values):
        additionalW.append(row['word2'])
additionalW = np.unique(additionalW)
print(additionalW)

['asylum' 'autograph' 'cemetery' 'cock' 'cord' 'crane' 'cushion' 'forst'
 'furnace' 'gem' 'graveyard' 'grin' 'implement' 'jewel' 'lad' 'madhouse'
 'magician' 'midday' 'monk' 'mound' 'oracle' 'pillow' 'rooster' 'sage'
 'serf' 'signature' 'stove' 'tumbler' 'voyage' 'wizard' 'woodland']


In [35]:
# frequency for new words
additionalC = []
for aw in additionalW:
    freq = fdist.get(aw)
    additionalC.append(fdist.get(aw))
print(additionalC)

[1, 3, 15, 5, 6, 5, 8, None, 11, 4, 7, 13, 4, 1, 6, 1, 4, 5, 16, 11, 2, 8, 3, 2, None, 6, 15, 2, 17, 3, 2]


In [36]:
# bigram
from nltk.util import ngrams
from collections import Counter

bigrams = ngrams(brown.words(), 2)
bigrams_freq = Counter(bigrams)

In [37]:
# new W
df_additionalW = pd.DataFrame({'word': additionalW, 'count': additionalC}, columns=['word', 'count'])
# remove additional word not in corpus at all
df_additionalW.dropna(subset=['count'], inplace=True)

df_W = df_topfivek
df_W = df_W.append(df_additionalW, ignore_index=True)
print(df_W)

          word   count
0          one  3292.0
1        would  2714.0
2         said  1961.0
3          new  1635.0
4        could  1601.0
...        ...     ...
5024     stove    15.0
5025   tumbler     2.0
5026    voyage    17.0
5027    wizard     3.0
5028  woodland     2.0

[5029 rows x 2 columns]


In [38]:
# # construct M1
# df_M1 = pd.DataFrame(columns=['l_word', 'f_word', 'l_word_freq', 'f_word_freq', 'bigram_freq'])

# for l in df_W.word:
#     for f in df_W.word:
#         if l != f:
#             row = {'l_word':l , 'f_word':f, 'l_word_freq': df_W[df_W['word'] == l]['count'].values[0],\
#                    'f_word_freq': df_W[df_W['word'] == f]['count'].values[0], 'bigram_freq': bigrams_freq[(l, f)]}
#             df_M1 = df_M1.append(row, ignore_index=True)

In [None]:
matrix = np.zeros(shape=(df_W.shape[0],df_W.shape[0]))

for i in range(df_W.shape[0]):
    for j in range(df_W.shape[0]):
        l_word = df_W.iloc[i]['word']
        f_word = df_W.iloc[j]['word']
        bigram_freq = bigrams_freq[(l_word, f_word)]
        matrix[i][j] = bigram_freq

In [42]:
df_W.iloc[0]['word']

'one'

In [75]:
# calculate M1+
import math

M1_PMI = []

for index, row in df_M1.iterrows():
    pmi = row['bigram_freq'] / (row['l_word_freq'] * row['f_word_freq'])
    if not (math.isnan(pmi) or pmi == 0):
        pmi = math.log(pmi)
    M1_PMI.append(pmi)

df_M1['PMI'] = M1_PMI
print(df_M1)

         l_word     f_word  l_word_freq  f_word_freq bigram_freq        PMI
0           one      would       3292.0       2714.0          17 -13.172216
1         would       said       2714.0       1961.0           0   0.000000
2          said        new       1961.0       1635.0           0   0.000000
3           new      could       1635.0       1601.0           0   0.000000
4         could       time       1601.0       1598.0           1 -14.754892
...         ...        ...          ...          ...         ...        ...
5079     pillow   cemetery          8.0         15.0           0   0.000000
5080   cemetery  graveyard         15.0          7.0           0   0.000000
5081  graveyard     midday          7.0          5.0           0   0.000000
5082     midday        gem          5.0          4.0           0   0.000000
5083        gem      jewel          4.0          1.0           0   0.000000

[5084 rows x 6 columns]


In [84]:
# use PCA for r2
from pca import pca

# for word in df_M1.l_word.unique():
#     df_M1[df_M1['l_word'] == word] 
    
# df_M1p = df_M1[['l_word', 'f_word', 'PMI']]
df_M1[df_M1['l_word'] == 'would'] 

# M2100 = pca(n_components=10)
# results_M2100 = M2100.fit_transform(df_M1p)
# M210 = pca(n_components=100)
# M2300 = pca(n_components=300)

Unnamed: 0,l_word,f_word,l_word_freq,f_word_freq,bigram_freq,PMI
1,would,said,2714.0,1961.0,0,0.0


# Q1 part2

In [40]:
# read txt file
df_capital = pd.read_csv('semantic2.txt', sep=" ", header=None)
df_capital.columns = ['value1', 'value2', 'value3', 'value4']
print(df_capital)

      value1 value2     value3   value4
0    Algeria  dinar     Angola   kwanza
1    Algeria  dinar  Argentina     peso
2    Algeria  dinar    Armenia     dram
3    Algeria  dinar     Brazil     real
4    Algeria  dinar   Bulgaria      lev
..       ...    ...        ...      ...
861  Vietnam   dong     Russia    ruble
862  Vietnam   dong     Sweden    krona
863  Vietnam   dong   Thailand     baht
864  Vietnam   dong    Ukraine  hryvnia
865  Vietnam   dong        USA   dollar

[866 rows x 4 columns]


In [41]:
# add embedding to df_shared
df_capital_vector = df_capital.applymap(lambda x: model[x])
print(df_capital_vector)

                                                value1  \
0    [-0.171875, 0.10205078, 0.24316406, 0.10449219...   
1    [-0.171875, 0.10205078, 0.24316406, 0.10449219...   
2    [-0.171875, 0.10205078, 0.24316406, 0.10449219...   
3    [-0.171875, 0.10205078, 0.24316406, 0.10449219...   
4    [-0.171875, 0.10205078, 0.24316406, 0.10449219...   
..                                                 ...   
861  [0.21484375, 0.22167969, 0.007171631, 0.177734...   
862  [0.21484375, 0.22167969, 0.007171631, 0.177734...   
863  [0.21484375, 0.22167969, 0.007171631, 0.177734...   
864  [0.21484375, 0.22167969, 0.007171631, 0.177734...   
865  [0.21484375, 0.22167969, 0.007171631, 0.177734...   

                                                value2  \
0    [0.0703125, 0.19726562, -0.19628906, 0.1708984...   
1    [0.0703125, 0.19726562, -0.19628906, 0.1708984...   
2    [0.0703125, 0.19726562, -0.19628906, 0.1708984...   
3    [0.0703125, 0.19726562, -0.19628906, 0.1708984...   
4    [0.07031

In [28]:
# df_capital['predict'] = (df_capital['value1']+df_capital['value2']-df_capital['value3']).equals(df_capital['value4'])
# model.most_similar(positive=['Algeria', 'Angola'], negative=['dinar'])[0]

predict = []

for index, row in df_capital.iterrows():
    predict.append(model.most_similar(positive=[row['value1'], row['value2']], negative=[row['value3']])[0][0])
print(predict)

['dinars', 'dinars', 'dinars', 'dinars', 'riyal', 'dinars', 'dinars', 'riyal', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'Dinar', 'riyal', 'dinars', 'dinars', 'dinars', 'dinars', 'Iraqi_dinar', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'dinars', 'kwanzas', 'kwanzas', 'kwanzas', 'Angolan', 'Angolan', 'Angolan', 'kwanzas', 'Angolan', 'Angolan', 'Angolan', 'kwanzas', 'kwanzas', 'Angolan', 'Angolan', 'Angolan', 'Angolan', 'Angolan', 'Angolan', 'kwanzas', 'kwanzas', 'Angolan', 'Angolan', 'Talatona_Convention_Centre', 'Angolan', 'Angolan', 'Angolan', 'Angolan', 'Angolan', 'kwanzas', 'pesos', 'pesos', 'pesos', 'Argentine', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'pesos', 'Colombian_peso', 'pesos', 'pesos', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams', 'drams

In [49]:
df_result = pd.DataFrame({'predict': predict}, columns=['predict'])
df_result['test'] = df_capital['value4'].equals(df_result['predict'])

In [50]:
df_result[df_result['test'] == True]

Unnamed: 0,predict,test


# Start actual testing

In [8]:
df_semantic = pd.read_csv('semantic.txt', sep=" ", header=None)
df_semantic.columns = ['value1', 'value2', 'value3', 'value4']

df_syntactic = pd.read_csv('syntactic.txt', sep=" ", header=None)
df_syntactic.columns = ['value1', 'value2', 'value3', 'value4']

In [11]:
# add embedding to df_shared
df_semantic_vector = df_semantic.applymap(lambda x: model[x])

df_syntactic_vector = df_syntactic.applymap(lambda x: model[x])

In [14]:
df_semantic_vector['prediction'] = df_semantic_vector['value1'] + df_semantic_vector['value2'] - df_semantic_vector['value3']

df_syntactic_vector['prediction'] = df_syntactic_vector['value1'] + df_syntactic_vector['value2'] - df_syntactic_vector['value3']

In [23]:
df_semantic_vector['similarity'] = df_semantic_vector.apply(lambda row: 1 - sp.distance.cosine(row['value4'], row['prediction']), axis=1)

df_syntactic_vector['similarity'] = df_syntactic_vector.apply(lambda row: 1 - sp.distance.cosine(row['value4'], row['prediction']), axis=1)

In [29]:
semantic_pos = df_semantic_vector[df_semantic_vector['similarity'] > 0].shape[0]
syntactic_pos = df_syntactic_vector[df_syntactic_vector['similarity'] > 0].shape[0]

print('semantic has ' + str(semantic_pos/df_semantic_vector.shape[0])+ ' positive cosine similarity rate')
print('syntactic has ' + str(syntactic_pos/df_syntactic_vector.shape[0])+ ' positive cosine similarity rate')

semantic has 0.42843477221092907 positive cosine similarity rate
syntactic has 0.17938568714376188 positive cosine similarity rate
