In [1]:
import spacy
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
import pandas as pd

# Read the tab-separated file into a dataframe. header=None means the first
# row is treated as data, and the two columns are named 'col1' and 'col2'.
df = pd.read_csv('generalized_swaps.txt', sep='\t', header=None, names=['col1', 'col2'])
# Disabled: tokenising every entry of each dataframe column with gensim's
# simple_preprocess.
# sentence1 = [simple_preprocess(str(sentence)) for sentence in df['col1']]
# sentence2 = [simple_preprocess(str(sentence)) for sentence in df['col2']]

# Hand-written word lists used in place of the dataframe columns; entries at
# the same position form a pair (man/woman, boy/girl, ...).
sentence1 = ['man', 'boy', 'father', 'son', 'king']
sentence2 = ['woman', 'girl', 'mother', 'daughter', 'queen']

In [59]:
# Display the word-pair dataframe.
df

Unnamed: 0,col1,col2
0,actor,actress
1,actors,actresses
2,actress,actor
3,actresses,actors
4,airman,airwoman
...,...,...
99,uncles,aunts
100,wife,husband
101,wives,husbands
102,woman,man


In [3]:
# Load the spaCy pipeline 'en_core_web_sm'; preprocess() below runs text through it.
nlp = spacy.load('en_core_web_sm')
# NLTK's English stop-word list, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Return the non-punctuation, non-stop-word tokens of ``text`` joined by spaces.

    ``text`` is run through the module-level spaCy pipeline ``nlp``. Tokens that
    spaCy flags as punctuation are skipped; the text of every remaining token is
    split into ``\\w+`` runs, and runs whose lowercase form is in the
    module-level ``stop_words`` set are dropped. Surviving pieces keep their
    original casing.

    Args:
        text: the string to clean.

    Returns:
        A single string with the kept pieces separated by one space (empty
        string when nothing survives).
    """
    # The previous version also called nltk.word_tokenize(text) and discarded
    # the result; that call was pure overhead and needs NLTK tokenizer data
    # that this notebook never downloads, so it has been removed.
    word_tokenizer = RegexpTokenizer(r'\w+')
    kept_words = []
    for token in nlp(text):
        if token.is_punct:
            continue
        kept_words.extend(
            piece
            for piece in word_tokenizer.tokenize(token.text)
            if piece.lower() not in stop_words
        )
    return ' '.join(kept_words)

In [4]:
from gensim.models import Word2Vec

In [5]:
# from nltk import word_tokenize, sent_tokenize
# sentence = [word_tokenize(word) for word in sent_tokenize(sentence1)]
# sentence

In [76]:
import gensim.downloader as api

# Concatenate the two word lists. In the cells shown here this is only
# referenced by the disabled Word2Vec line below.
sentences = sentence1 + sentence2
# Load the 'glove-twitter-25' vectors through gensim's downloader API.
model = api.load('glove-twitter-25')
# Disabled alternative: train a Word2Vec model on `sentences` instead.
# model = Word2Vec(sentences, min_count=1, vector_size=100, workers=4)



In [77]:
# Sanity check: list the model's nearest neighbours of 'actor'.
model.most_similar('actor')

[('director', 0.8420683145523071),
 ('actress', 0.8414556384086609),
 ('oscar', 0.8299604654312134),
 ('singer', 0.8291831612586975),
 ('comedian', 0.819585919380188),
 ('paul', 0.8014986515045166),
 ('nelson', 0.7945849895477295),
 ('ted', 0.7901637554168701),
 ('martin', 0.7899827361106873),
 ('legendary', 0.7890724539756775)]

In [79]:
# Look up the model's vector for every word in each hand-written list.
word_embed1 = [model[word] for word in sentence1]
word_embed2 = [model[word] for word in sentence2]

In [80]:
# Average each list's vectors across words (axis=0), giving one mean vector per list.
mean_col1 = np.mean(word_embed1, axis=0)
mean_col2 = np.mean(word_embed2, axis=0)

In [81]:
# Mean of the first list (man, boy, ...) minus mean of the second list
# (woman, girl, ...).
gender_direction = mean_col1 - mean_col2

In [82]:
# Rescale the direction to unit length (in place).
gender_direction /= np.linalg.norm(gender_direction)

In [83]:
# Score every vocabulary word by the dot product of its embedding with
# `gender_direction`. A single matrix-vector product replaces the per-word
# Python loop: row i of `model.vectors` is the embedding of
# `model.index_to_key[i]`, so zipping the two pairs each word with its score.
word_embed_gender = dict(zip(model.index_to_key, model.vectors @ gender_direction))

In [84]:
# NOTE(review): this displays the whole dict, which has one entry per
# vocabulary word; consider showing only a small sample instead.
word_embed_gender

{'<user>': 0.93096,
 '.': 0.94705486,
 ':': 0.84761405,
 'rt': 0.6281977,
 ',': 1.3393447,
 '<repeat>': 0.9120504,
 '<hashtag>': 0.68251884,
 '<number>': 1.6073643,
 '<url>': 0.2238944,
 '!': 0.8705773,
 'i': -1.1956378,
 'a': 1.0930836,
 '"': 0.78504926,
 'the': -1.2697396,
 '?': 1.1218,
 'you': -1.9794163,
 'to': -1.1840047,
 '(': 0.33157992,
 '<allcaps>': 0.2991098,
 '<elong>': 0.9582511,
 ')': 0.33570963,
 'me': 1.2845273,
 'de': 4.0600057,
 '<smile>': 1.0108607,
 '！': -0.5278327,
 'que': 3.676423,
 'and': -1.9745519,
 '。': -0.07235642,
 '-': 0.71010506,
 'my': -1.9746952,
 'no': 1.6259232,
 '、': 0.0027926732,
 'is': -0.7156598,
 'it': -1.2389519,
 '…': 0.8331952,
 'in': -0.29757908,
 'n': 0.11416853,
 'for': -1.26837,
 '/': 0.41427958,
 'of': -0.78626096,
 'la': 3.068149,
 "'s": -1.0584074,
 '*': -0.00033681604,
 'do': -0.5852808,
 "n't": -1.6960962,
 'that': -1.5266854,
 'on': -0.28837308,
 'y': 3.1027443,
 "'": 0.7178288,
 'e': 0.97904724,
 'o': 1.475819,
 'u': 0.24980561,
 'en'

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

def analogy(word_a, word_b, word_c, model):
    """Solve the analogy "word_a is to word_b as word_c is to ?".

    The target vector is ``model[word_c] + (model[word_b] - model[word_a])``.
    The vocabulary word whose embedding has the highest cosine similarity to
    that target is returned; the three query words themselves are never
    returned.

    Args:
        word_a: first word of the reference pair.
        word_b: second word of the reference pair.
        word_c: word for which the counterpart is sought.
        model: object supporting ``model[word]`` (a 1-D vector) and exposing
            ``key_to_index``, an iterable of the vocabulary words.

    Returns:
        The best-matching word, or ``None`` when no candidate is left or the
        target vector has zero length (its direction is undefined).
    """
    query_words = {word_a, word_b, word_c}

    # Apply the a -> b offset to c. (Subtracting the offset, as the previous
    # version did, walks in the opposite direction.)
    offset = np.asarray(model[word_b], dtype=float) - np.asarray(model[word_a], dtype=float)
    target = np.asarray(model[word_c], dtype=float) + offset
    target_norm = np.linalg.norm(target)
    if target_norm == 0:
        return None

    # Cosine similarity is "higher is closer", so track the maximum.
    best_word, best_similarity = None, -np.inf
    for word in model.key_to_index:
        if word in query_words:
            continue
        embedding = np.asarray(model[word], dtype=float)
        embedding_norm = np.linalg.norm(embedding)
        if embedding_norm == 0:
            # A zero vector has no direction; it cannot be a meaningful match.
            continue
        similarity = float(np.dot(embedding, target)) / (embedding_norm * target_norm)
        if similarity > best_similarity:
            best_word, best_similarity = word, similarity
    return best_word

In [None]:
# Analogy query: word_a is to word_b as word_c is to <result>.
word_a, word_b, word_c = 'man', 'king', 'woman'

closest_word = analogy(word_a, word_b, word_c, model)
print(f"{word_a} is to {word_b} as {word_c} is to {closest_word}")