In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load the 'train' subset of the 20 Newsgroups corpus through scikit-learn.
# The returned object's `.data` and `.target` attributes feed the DataFrame built in the next cell.
newsgroups_train = fetch_20newsgroups(subset='train')
import pandas as pd

In [None]:
# One row per post: the raw message text next to its target label.
df = pd.DataFrame(
    dict(
        text=newsgroups_train.data,
        target=newsgroups_train.target,
    )
)

In [None]:
# Peek at the first post; the raw text still carries the message headers (From:, Subject:, ...).
df.loc[0,'text']

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [None]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import multiprocessing
# CPU count of this machine; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

### Word2vec Example 

![](https://github.com/sergeyfitts/nlp_hse/blob/master/week7/params.png?raw=1)

In [None]:
# Untrained Word2Vec model; the vocabulary is built and training is run in the next cell.
# NOTE: `size` is the gensim 3.x keyword for the embedding dimension (gensim >= 4.0
# renamed it to `vector_size`) -- adjust if the installed gensim is upgraded.
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context words considered on each side
                     size=300,           # embedding dimensionality
                     sample=1e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate reached at the end of training
                     negative=20,        # number of negative samples per positive example
                     # Leave one CPU free, but never drop to zero workers: on a
                     # single-CPU host `cores - 1` would be 0 and no training thread would run.
                     workers=max(1, cores - 1))

In [None]:
# Tokenise the corpus once and reuse it for both the vocabulary scan and training
# (the same whitespace split was previously recomputed for each call).
# str.split() keeps punctuation attached to words, e.g. "machine," stays one token.
sentences = [text.split() for text in df.text]

w2v_model.build_vocab(sentences, progress_per=10000)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

# Per the gensim docs, replace=True keeps only the L2-normalised vectors to save memory;
# the model should be treated as read-only afterwards.
# NOTE(review): init_sims is deprecated in gensim >= 4.0 -- confirm the installed version.
w2v_model.init_sims(replace=True)

In [None]:
# Nearest neighbours of "machine" with their similarity scores.
# Tokens come from a plain whitespace split, so variants such as "machine," show up as separate words.
w2v_model.wv.most_similar(positive=["machine"])

[('running', 0.6741992831230164),
 ('locks', 0.6659230589866638),
 ('installed.', 0.651772141456604),
 ('unix', 0.6476942300796509),
 ('installed,', 0.6316647529602051),
 ('machine,', 0.6204047799110413),
 ('keyseach', 0.6066868305206299),
 ('running,', 0.5999594926834106),
 ('store.', 0.5720880031585693),
 ('hangs', 0.5705453157424927)]

In [None]:
# Sanity check: the vector length should equal the `size` passed to Word2Vec (300).
len(w2v_model.wv['machine'])

300

In [None]:
# Classic analogy query: king - man + woman, whose textbook answer is "queen".
# Words in `positive` contribute positively and words in `negative` negatively to the query.
w2v_model.wv.most_similar(positive=["king","woman"], negative=["man"])

[('raped', 0.7218968272209167),
 ('father,', 0.7074660062789917),
 ('Lyuda', 0.7062985897064209),
 ('brothers', 0.6849087476730347),
 ('woman,', 0.6822104454040527),
 ('Karina', 0.679212749004364),
 ('apartment.', 0.6756207942962646),
 ('apartment,', 0.6754775643348694),
 ('her.', 0.6744227409362793),
 ('promptly', 0.6734017133712769)]

[interesting source](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial)

### Doc2vec Example

In [None]:
import nltk
# Fetch the NLTK 'punkt' tokenizer data.
# Presumably required by word_tokenize used in the next cell -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tiny toy corpus for the Doc2Vec demo.
data = [
    "I love machine learning. Its awesome.",
    "I love coding in python",
    "I love building chatbots",
    "they chat amagingly well",
]

# Wrap every sentence as a TaggedDocument: lower-cased NLTK tokens as the words,
# and the sentence's position in `data` (as a string) as its single tag.
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()), tags=[str(doc_id)])
    for doc_id, sentence in enumerate(data)
]

`dm` defines the training algorithm: `dm=1` means ‘distributed memory’ (PV-DM) and `dm=0` means ‘distributed bag of words’ (PV-DBOW). The Distributed Memory model preserves the word order in a document, whereas the Distributed Bag of Words model just uses the bag-of-words approach, which doesn’t preserve any word order.

In [None]:
max_epochs = 100   # total number of passes over the corpus
vec_size = 20      # dimensionality of the document vectors
alpha = 0.025      # initial learning rate

# `vector_size` and `epochs` are the current keyword names; the older `size`
# keyword and `model.iter` attribute were removed in gensim 4.0.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1,               # 1 = PV-DM, 0 = PV-DBOW
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Train with ONE call. gensim itself lowers the learning rate from `alpha`
# to `min_alpha` over the requested epochs. Calling train() repeatedly in a loop
# while editing model.alpha / model.min_alpha by hand is discouraged by the
# gensim documentation and made the real number of passes max_epochs * model.iter.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [None]:
# Infer a vector for an unseen sentence. Use the same preprocessing as the
# training documents (lower-case + NLTK word_tokenize) so that the tokens match the
# model's vocabulary; a plain str.split() would leave punctuation glued to words.
model.infer_vector(word_tokenize("I love chatbots".lower()))

array([ 0.00141212,  0.01647103, -0.00027063, -0.02029978,  0.00365317,
        0.0046502 ,  0.03020738,  0.02369142, -0.0048711 ,  0.00185603,
        0.00405063, -0.01471705,  0.01511542,  0.01226789,  0.00571584,
        0.01378375,  0.01134647, -0.02424124,  0.02237709,  0.00783969],
      dtype=float32)

[worth visiting](https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5)

### Problem

---
1) векторизовать тексты с помощью tf-idf <br>
2) векторизовать тексты с помощью word2vec <br>
3) обучить классификатор (любой, например knn) <br>
4) сравнить качество и ответить на вопрос, улучшает ли векторизация качество модели