# Lesson 3

## Different word embeddings

In [4]:
import pandas as pd
df = pd.read_csv('amazon_cellphones.csv')
df

Unnamed: 0,reviewText
0,Looks even better in person. Be careful to not...
1,When you don't want to spend a whole lot of ca...
2,"so the case came on time, i love the design. I..."
3,DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY...
4,"I liked it because it was cute, but the studs ..."
...,...
99994,Bought this to use while running and it has do...
99995,"i wanted to run with my RAZR MAXX, so i bought..."
99996,"My phone slides in perfectly, and it fits my a..."
99997,"This armband /will/ fit a Razr Maxx, in an Ott..."


In [5]:
df.isna().sum()

reviewText    41
dtype: int64

### clean the document

In [6]:
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

corpus = df['reviewText'][~df['reviewText'].isna()]

corpus = corpus.str.lower().str.replace('[^\w\s]','')
corpus = corpus.str.split()
corpus = corpus.apply(lambda x: [lemmatizer.lemmatize(item) for item in x if item not in stop])
corpus = corpus.apply(lambda x: [item for item in x if not item.isnumeric()])
# corpus = [' '.join(i) for i in corpus]
corpus

  corpus = corpus.str.lower().str.replace('[^\w\s]','')


0        [look, even, better, person, careful, drop, ph...
1        [dont, want, spend, whole, lot, cash, want, gr...
2        [case, came, time, love, design, im, actually,...
3                 [dont, care, gave, gift, okay, expected]
4        [liked, cute, stud, fall, easily, protect, pho...
                               ...                        
99994    [bought, use, running, done, job, right, dont,...
99995    [wanted, run, razr, maxx, bought, fit, great, ...
99996    [phone, slide, perfectly, fit, arm, perfectly,...
99997    [armband, fit, razr, maxx, otterbox, commuter,...
99998    [look, great, picture, poorly, made, lot, flaw...
Name: reviewText, Length: 99958, dtype: object

### create ngrams

In [7]:
from gensim.models.phrases import Phrases

bigram = Phrases(corpus, min_count=5, threshold=0.2)
bigrams = [bigram[item] for item in corpus]
corpus = [bigram[item] for item in bigrams]

In [12]:
from gensim.models import Word2Vec

#sg: training algorithm. 1 for skipgram, 0 for cbow
%time w2v_model = Word2Vec(sg=0, sentences=corpus, vector_size=100, window=5, min_count=5, epochs=5)

Wall time: 9.51 s


In [9]:
help(Word2Vec)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.utils.SaveLoad)
 |  Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None)
 |  
 |  Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.
 |  
 |  --------
 |  This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
 |  such as lambda functions etc.
 |  
 |  Method resolution order:
 |      Word2Vec
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5

In [14]:
print(w2v_model)

Word2Vec(vocab=50212, vector_size=100, alpha=0.025)


### create embeddings with fastText

In [18]:
from gensim.models import FastText

%time ft_model = FastText(sg=1, sentences=corpus, vector_size=100, window=5, min_count=5, epochs=5, min_n=4, max_n=6)
# ft_model.build_vocab(sentences=corpus)

Wall time: 1min 4s


In [16]:
help(FastText)

Help on class FastText in module gensim.models.fasttext:

class FastText(gensim.models.word2vec.Word2Vec)
 |  FastText(sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=0.001, seed=1, workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=10000, callbacks=(), max_final_vocab=None)
 |  
 |  Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.
 |  
 |  --------
 |  This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
 |  such as lambda functions etc.
 |  
 |  Method resolution order:
 |      FastText
 |      gensim.models.word2vec.Word2Vec
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sentenc

In [19]:
print(ft_model)

FastText(vocab=50212, vector_size=100, alpha=0.025)


In [20]:
from gensim.models import KeyedVectors 
w2v_model.wv.save_word2vec_format('w2v_model.bin', binary=True)
ft_model.wv.save_word2vec_format('ft_model.bin', binary=True)

In [21]:
# the end