# gensim

In [None]:
!pip install gensim

In [None]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA

from gensim.test.utils import datapath
from gensim.models import KeyedVectors


### load in pretrained word2vec model

In [None]:
# Pretrained vectors stored in word2vec text format. The absolute path is
# passed directly: gensim's datapath() is a helper for files shipped in
# gensim's own test-data directory, not for arbitrary user files.
word2vec = '/Users/mwasserman/programming/projects/spacy_lecture/word2vec_100d.txt'
model = KeyedVectors.load_word2vec_format(word2vec)

##### word similarity 

In [None]:
model.most_similar('obama')

In [None]:
model.most_similar('banana')

In [None]:
# Pass the word in a list: gensim 3.x iterates a bare string given as
# `negative` character by character, which would query 'b', 'a', 'n', ...
model.most_similar(negative=['banana'])

<img src="male_female.png">

In [None]:
# Vector arithmetic query: 'woman' and 'king' contribute positively,
# 'man' contributes negatively.
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
# Print only the first entry of the result, with its second field formatted
# to four decimal places.
print("{}: {:.4f}".format(*result[0]))

### analogy

In [None]:
def analogy(x1, x2, y1):
    """Answer the analogy "x1 is to x2 as y1 is to ?".

    Queries the notebook-level ``model`` with ``y1`` and ``x2`` as positive
    words and ``x1`` as the negative word, and returns the result of
    ``model.most_similar`` unchanged.
    """
    return model.most_similar(positive=[y1, x2], negative=[x1])

In [None]:
analogy('japan', 'japanese', 'australia')

In [None]:
analogy('australia', 'beer', 'france')

In [None]:
analogy('obama', 'clinton', 'reagan')

In [None]:
analogy('tall', 'tallest', 'long')

In [None]:
analogy('good', 'fantastic', 'bad')

### which word doesn't belong

In [None]:
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

### pca of vector space

In [None]:
def display_pca_scatterplot(model, words=None, sample=0):
    """Plot word vectors projected onto their first two principal components.

    Args:
        model: word-vector container that can be indexed by word
            (``model[word]``) and exposes its vocabulary as either
            ``key_to_index`` (gensim >= 4.0) or ``vocab`` (gensim 3.x).
        words: sequence of words to plot. When ``None``, words are taken
            from the model's vocabulary instead.
        sample: only used when ``words`` is ``None``. If greater than zero,
            that many vocabulary words are drawn with ``np.random.choice``;
            otherwise the entire vocabulary is plotted.

    Draws a 6x6 matplotlib figure with one labelled point per word; returns
    nothing.
    """
    # `is None` rather than `== None`: comparing an ndarray of words with
    # `==` yields an array, whose truth value is ambiguous.
    if words is None:
        # gensim 4.0 removed `vocab` in favour of `key_to_index`; support both.
        vocabulary = getattr(model, 'key_to_index', None)
        if vocabulary is None:
            vocabulary = model.vocab
        vocabulary = list(vocabulary)

        if sample > 0:
            words = np.random.choice(vocabulary, sample)
        else:
            words = vocabulary

    word_vectors = np.array([model[w] for w in words])

    # Fit a full PCA and keep only the first two components for plotting.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
# Each word is listed once: a repeated word would be counted twice in the PCA
# fit and drawn twice on the plot.
display_pca_scatterplot(model,
                        ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water',
                         'spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs',
                         'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
                         'frog', 'toad', 'ape', 'kangaroo', 'wombat', 'wolf',
                         'france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china',
                         'homework', 'assignment', 'problem', 'exam', 'test', 'class',
                         'school', 'college', 'university', 'institute'])

# spaCy

In [None]:
!pip install spacy

In [None]:
!pip install tqdm

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
tqdm.pandas()

In [None]:
df = pd.read_csv("yelp.csv")

In [None]:
df.head()

In [None]:
df.shape

## spaCy Pipeline

In [None]:
nlp = spacy.load('en_core_web_md')

In [None]:
# Run every entry of the `text` column through the loaded spaCy pipeline,
# showing a tqdm progress bar, and store the result per row.
df['spacy'] = df['text'].progress_apply(nlp)

#### each document in the corpus is a doc object containing individual token objects

In [None]:
doc = df.spacy[0]
type(doc)

In [None]:
type(doc[0])

In [None]:
doc[0:22]

#### check if stop word

In [None]:
# the word here is "me"
doc[3].is_stop

#### part-of-speech tagging

In [None]:
doc[3].pos_

#### vector from loaded language model

In [None]:
#stop words HAVE a vector representation
doc[3].vector

#### vector representation of document (avg of all word vectors in doc)

In [None]:
doc.vector

#### sentence splitting in document

In [None]:
# Materialise the document's sentence iterator so the notebook displays it.
list(doc.sents)

#### document similarity

In [None]:
doc_2 = df.spacy[1]
doc.similarity(doc_2)

### see documentation for all you can do
* https://spacy.io/api/token
* https://spacy.io/api/doc