In [90]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt

In [91]:
md=pd.read_excel("Milliyetdata.xlsx")
zd=pd.read_excel("Zaytungdata.xlsx")
X=pd.concat([md,zd])

In [92]:
X.head()
y = X['label'].values

In [93]:
from gensim.models import doc2vec

def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the complaint narrative.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(v.split(), [label]))
    return labeled

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X['Basliklar'], y, test_size=0.33,random_state=0)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [95]:
all_data

[TaggedDocument(words=["Türkiye'nin", 'En', 'Güzel', 'Dayak', 'Yenilebilecek', '10', 'Yöresi'], tags=['Train_0']),
 TaggedDocument(words=['Son', 'dakika...', 'Kılıçdaroğlu', 've', 'Karamollaoğlu', 'görüşmesi', '1', 'saat', 'sürdü'], tags=['Train_1']),
 TaggedDocument(words=['Konuşulmayan', 'Bir', 'Yara:', 'Erkeklerde', 'Regl', 'Dönemi', 've', 'Sancıları'], tags=['Train_2']),
 TaggedDocument(words=['Milliyetçi', "Zühtü'den,", 'Hastayız', 'Sana', 'Cem', "Uzan'a...", 'Türk', 'Siyasi', 'Tarihinde', 'Öyle', 'ya', 'da', 'Böyle', 'İz', 'Bırakmış', 'Seçim', 'Şarkıları'], tags=['Train_3']),
 TaggedDocument(words=['Bakan', 'Soylu:', '8', 'bin', '526', "YTS'nin", "Türkiye'ye", 'girişi', 'engellendi'], tags=['Train_4']),
 TaggedDocument(words=['Son', 'dakika:', "Sivas'ta", 'heyecanlandıran', 'görüntü!', "3'ü", 'aynı', 'anda', 'görüntülendi'], tags=['Train_5']),
 TaggedDocument(words=['Eski', 'Başbakan', 'Tansu', "Çiller'den", 'yalanlama', 'geldi'], tags=['Train_6']),
 TaggedDocument(words=['Talep'

In [96]:
len(all_data)

571

In [97]:
all_data[:2]

[TaggedDocument(words=["Türkiye'nin", 'En', 'Güzel', 'Dayak', 'Yenilebilecek', '10', 'Yöresi'], tags=['Train_0']),
 TaggedDocument(words=['Son', 'dakika...', 'Kılıçdaroğlu', 've', 'Karamollaoğlu', 'görüşmesi', '1', 'saat', 'sürdü'], tags=['Train_1'])]

In [98]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 571/571 [00:00<00:00, 635939.35it/s]


In [99]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 571/571 [00:00<00:00, 688005.63it/s]
100%|██████████| 571/571 [00:00<00:00, 680576.18it/s]
100%|██████████| 571/571 [00:00<00:00, 550056.86it/s]
100%|██████████| 571/571 [00:00<00:00, 692982.52it/s]
100%|██████████| 571/571 [00:00<00:00, 1083686.69it/s]
100%|██████████| 571/571 [00:00<00:00, 1189740.48it/s]
100%|██████████| 571/571 [00:00<00:00, 1038122.06it/s]
100%|██████████| 571/571 [00:00<00:00, 1098096.10it/s]
100%|██████████| 571/571 [00:00<00:00, 1160342.82it/s]
100%|██████████| 571/571 [00:00<00:00, 1076864.92it/s]
100%|██████████| 571/571 [00:00<00:00, 465399.84it/s]
100%|██████████| 571/571 [00:00<00:00, 867734.63it/s]
100%|██████████| 571/571 [00:00<00:00, 1108772.03it/s]
100%|██████████| 571/571 [00:00<00:00, 510758.71it/s]
100%|██████████| 571/571 [00:00<00:00, 705849.57it/s]
100%|██████████| 571/571 [00:00<00:00, 945125.33it/s]
100%|██████████| 571/571 [00:00<00:00, 257937.27it/s]
100%|██████████| 571/571 [00:00<00:00, 733296.87it/s]
100%|██████████| 571/

CPU times: user 1.28 s, sys: 104 ms, total: 1.38 s
Wall time: 1.36 s


In [100]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

In [101]:
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [102]:
train_vectors_dbow

array([[ 0.018843  , -0.01812262, -0.07639009, ...,  0.02901048,
         0.17719154,  0.0171526 ],
       [ 0.01546618, -0.0122502 , -0.06078761, ...,  0.03007164,
         0.1415481 ,  0.01028639],
       [ 0.01882833, -0.02040868, -0.0823302 , ...,  0.03604029,
         0.1875869 ,  0.02652048],
       ...,
       [ 0.02414415, -0.02694131, -0.10801335, ...,  0.04572523,
         0.2490195 ,  0.02713842],
       [ 0.01469331, -0.02067408, -0.08500143, ...,  0.03447942,
         0.19175951,  0.0191939 ],
       [ 0.03875469, -0.03060996, -0.12518758, ...,  0.0475631 ,
         0.28686318,  0.03196498]])

In [103]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(multi_class='multinomial', solver = 'lbfgs')
logreg.fit(train_vectors_dbow, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [104]:
logreg.score(test_vectors_dbow, y_test)

0.783068783068783

In [105]:
model_dbow.save('d2v_model_dbow.doc2vec')

Distributed Memory

Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document.
We again instantiate a Doc2Vec model with a vector size with 100 words and iterating over the training corpus 30 times.

In [106]:
model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model_dm.build_vocab([x for x in tqdm(all_data)])

100%|██████████| 571/571 [00:00<00:00, 685445.79it/s]


In [107]:
%%time
for epoch in range(30):
    model_dm.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dm.alpha -= 0.002
    model_dm.min_alpha = model_dm.alpha

100%|██████████| 571/571 [00:00<00:00, 532565.62it/s]
100%|██████████| 571/571 [00:00<00:00, 722022.18it/s]
100%|██████████| 571/571 [00:00<00:00, 1064894.43it/s]
100%|██████████| 571/571 [00:00<00:00, 1086144.03it/s]
100%|██████████| 571/571 [00:00<00:00, 574327.96it/s]
100%|██████████| 571/571 [00:00<00:00, 615984.46it/s]
100%|██████████| 571/571 [00:00<00:00, 1337212.50it/s]
100%|██████████| 571/571 [00:00<00:00, 725302.12it/s]
100%|██████████| 571/571 [00:00<00:00, 1023919.45it/s]
100%|██████████| 571/571 [00:00<00:00, 1050876.52it/s]
100%|██████████| 571/571 [00:00<00:00, 923976.69it/s]
100%|██████████| 571/571 [00:00<00:00, 1079777.99it/s]
100%|██████████| 571/571 [00:00<00:00, 921133.69it/s]
100%|██████████| 571/571 [00:00<00:00, 874387.58it/s]
100%|██████████| 571/571 [00:00<00:00, 498739.61it/s]
100%|██████████| 571/571 [00:00<00:00, 1236421.06it/s]
100%|██████████| 571/571 [00:00<00:00, 481009.76it/s]
100%|██████████| 571/571 [00:00<00:00, 1274586.26it/s]
100%|██████████| 571

CPU times: user 1.53 s, sys: 107 ms, total: 1.64 s
Wall time: 1.64 s





In [108]:
train_vectors_dm = get_vectors(model_dm, len(X_train), 300, 'Train')
test_vectors_dm = get_vectors(model_dm, len(X_test), 300, 'Test')

In [109]:
logreg.fit(train_vectors_dm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [110]:
logreg.score(test_vectors_dm, y_test)

0.6613756613756614

In [111]:
model_dm.save('d2v_model_dm.doc2vec')

In [112]:
# model_dbow = Doc2Vec.load('d2v_model_dbow.doc2vec')
# model_dm = Doc2Vec.load('d2v_model_dm.doc2vec')
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [113]:
def get_concat_vectors(model1,model2, corpus_size, vectors_size, vectors_type):
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
    return vectors

In [114]:
train_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_train), 600, 'Train')
test_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_test), 600, 'Test')

In [115]:
%%time
logreg = LogisticRegression()
logreg.fit(train_vecs_dbow_dm, y_train)

CPU times: user 18.2 ms, sys: 2.92 ms, total: 21.2 ms
Wall time: 20.3 ms




In [116]:
logreg.score(test_vecs_dbow_dm, y_test)

0.7936507936507936

In [123]:
model_dbow.docvecs.similarity("Train_115",'Kiminle')

  if np.issubdtype(vec.dtype, np.int):


KeyError: "tag 'Kiminle' not seen in training corpus/invalid"