# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2


Skip-grams

In [1]:
import pandas as pd

data_df = pd.read_csv('mbti_1.csv')
data_df.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
data_df.dropna(inplace=True)
data_df.reset_index(inplace=True,drop=True)
posts = data_df['posts'].drop_duplicates()

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words and len(t) > 2]
        words += tokens

    return words


In [4]:
corpus = []

for raw_text in posts:
    words = tokenize(raw_text)
    corpus.append(words)

In [5]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4, sg=1)

classgensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True)

Ver documentación en https://radimrehurek.com/gensim/models/word2vec.html

In [6]:
model.wv.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('lion', 0.7882232666015625),
 ('socratic', 0.787378191947937),
 ('honorable', 0.7861039042472839),
 ('celebrity', 0.7845058441162109),
 ('regarded', 0.7832763195037842),
 ('xavier', 0.7831804156303406),
 ('smartest', 0.7830394506454468),
 ('luther', 0.7829859852790833),
 ('ceo', 0.7815907001495361),
 ('mononoke', 0.7766878008842468)]

In [7]:
model.wv.similar_by_word('king')

[('fisher', 0.7423710227012634),
 ('lion', 0.7375896573066711),
 ('stephen', 0.722663164138794),
 ('luther', 0.7147964239120483),
 ('alysaria', 0.7123627662658691),
 ('requiem', 0.6916431784629822),
 ('mononoke', 0.6896265149116516),
 ('finaille', 0.6820234060287476),
 ('leon', 0.6773303747177124),
 ('musicbird', 0.6769998669624329)]

In [8]:
model.wv.doesnt_match(['king', 'george', 'stephen', 'truck'])

'truck'

In [9]:
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

In [10]:
import numpy as np

vectors = np.asarray(model.wv.vectors)
labels = np.asarray(model.wv.index_to_key)

In [11]:
np.where(labels == 'king')

(array([1419]),)

In [12]:
vectors[1419]

array([ 0.07721531,  0.2530185 ,  0.3485773 ,  0.26570758,  0.2859869 ,
       -0.62155116, -0.15303221,  0.47236037, -0.40450406, -0.11892632,
        0.1200139 , -0.03759855, -0.09823629, -0.09281109, -0.09003191,
       -0.06645074,  0.61305594,  0.06378203, -0.00243269, -0.31246012,
       -0.2253394 , -0.03397318,  0.03545991,  0.13685219,  0.03085326,
        0.07861459, -0.27228394,  0.17512956, -0.06398501,  0.35049304,
        0.2309741 , -0.5586009 ,  0.6253171 , -0.50488853, -0.3432208 ,
        0.24192709,  0.15189749,  0.4331391 , -0.01036197, -0.4929012 ,
       -0.3580691 , -0.8313242 , -0.10620343, -0.04225687,  0.2635933 ,
       -0.47258556,  0.06311401, -0.57744884, -0.07666557,  0.1484837 ,
        0.17664823, -0.11870961,  0.6473127 ,  0.0667647 , -0.27785128,
        0.41333365,  0.07608798, -0.23419629, -0.36564538, -0.13073632,
        0.5992661 , -0.31244844,  0.16024995,  0.24952951, -0.6796528 ,
        0.3500746 ,  0.17183138,  0.33167225, -0.1363585 ,  0.15

In [13]:
score, results=model.wv.evaluate_word_analogies('questions-words.txt')

In [14]:
score

0.1314676504994151

Pretrained models

In [15]:
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [16]:
w2v_google = gensim.downloader.load('word2vec-google-news-300')

In [17]:
w2v_google.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('queen', 0.9314123392105103),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687),
 ('Queen_Consort', 0.8150269985198975),
 ('queens', 0.8099815249443054),
 ('crown_prince', 0.8089976906776428),
 ('royal_palace', 0.8027306795120239),
 ('monarchy', 0.8019613027572632),
 ('prince', 0.800979733467102),
 ('empress', 0.7958389520645142)]

In [18]:
w2v_google.similar_by_word('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

In [19]:
w2v_google.doesnt_match(['king', 'george', 'stephen', 'truck'])

'truck'

In [20]:
score, results = w2v_google.evaluate_word_analogies('questions-words.txt')

In [21]:
score

0.7401448525607863