# Data Load

In [1]:
import config
from pshmodule.utils import filemanager as fm

In [2]:
# Load the working dataset from the path stored in config.temp_data
# (presumably a pandas DataFrame, given the .head()/.pos usage further down).
# NOTE(review): the cell output reports a .pickle source; unpickling can run
# arbitrary code, so only load files from a trusted location.
df = fm.load(config.temp_data)

extension : .pickle
Loaded 9150 records from /Volumes/GoogleDrive/내 드라이브/MemeProject/data/temp_for_doc2vec.pickle


In [3]:
# Preview the first rows. The rendered output shows two columns:
# `content` (the raw sentence) and `pos` (a list of tokens for that sentence).
df.head(20)

1,content,pos
0,나 중간고사 반에서 1등했어,"[나, 중간고사, 반]"
1,중간고사 점수 내가 반에서 제일 잘 받음,"[중간고사, 점수, 내, 반, 제일, 받음]"
2,나 반에서 중간 성적 제일 좋아,"[나, 반, 중간, 성적, 제일, 좋아]"
3,우리 반에서 내가 시험 제일 잘 봤다,"[우리, 반, 내, 시험, 제일, 봤다]"
4,중간고사 반 1등 먹음,"[중간고사, 반, 먹음]"
5,중간시험 내가 반 1등이야,"[중간, 시험, 내, 반]"
6,너 왜 이렇게 멍청하냐?,"[너, 왜, 이렇게, 멍청하냐]"
7,이 새끼 빡대가리잖아?,"[새끼, 빡, 대가리, 잖아]"
8,너 말귀 못 알아듣냐고,"[너, 말귀, 못, 알아듣냐고]"
9,병신 이해력 딸리네,"[병신, 이해력, 딸리네]"


# 

# Doc2Vec

In [4]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import common_texts

### Loss Logger

In [5]:
class LossLogger(CallbackAny2Vec):
    '''Training callback that prints and records the model's reported loss.

    At the start of every epoch the 1-based epoch number is printed; at the
    end, the value returned by ``model.get_latest_training_loss()`` is printed
    on the same line and appended to ``self.losses``.

    NOTE(review): the value is stored exactly as the model reports it. Whether
    that number is a per-epoch figure or a running total depends on the gensim
    model being trained -- confirm before interpreting ``losses``.
    '''

    def __init__(self):
        # `epoch` is the counter shown to the user (starts at 1);
        # `losses` collects one reported value per finished epoch.
        self.epoch, self.losses = 1, []

    def on_epoch_begin(self, model):
        '''Print the epoch label, ending with a tab so the loss follows on the same line.'''
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        '''Print and store the latest reported loss, then advance the epoch counter.'''
        latest_loss = model.get_latest_training_loss()
        print(f'  Loss: {latest_loss}')
        self.losses.append(latest_loss)
        self.epoch += 1

In [6]:
# Wrap every token list in a TaggedDocument whose single tag is the row's
# positional index rendered as a string.
print('\tTagging data .. ')
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(doc_index)])
    for doc_index, tokens in enumerate(df.pos)
]

# Echo the main hyper-parameters before building the model.
print(
    '\tPreparing model with the following parameters: epochs = {}, vector_size = {}, alpha = {} .. '.format(
        config.max_epochs, config.vec_size, config.alpha
    )
)

# All hyper-parameters come from the project's `config` module.
model = Doc2Vec(
    vector_size=config.vec_size,
    window=config.window,
    min_count=config.min_count,  # ignore words whose total frequency is below this
    alpha=config.alpha,          # initial learning rate
    epochs=config.max_epochs,    # number of training epochs
    workers=config.workers,
)

# Build the vocabulary from the tagged corpus before training.
model.build_vocab(tagged_data)

	Tagging data .. 
	Preparing model with the following parameters: epochs = 200, vector_size = 1024, alpha = 0.01 .. 


In [7]:
# Display the configured embedding dimensionality (the value passed as
# `vector_size` when the Doc2Vec model was constructed).
config.vec_size

1024

In [8]:
# Keep a reference to the callback so the values it collects remain available
# after training (`loss_logger.losses`). Creating the instance inline inside
# the `callbacks` list would leave no name through which to read them.
loss_logger = LossLogger()

# NOTE(review): in the recorded run every epoch logs a loss of 0.0. gensim's
# Doc2Vec is reported not to populate the running training loss even with
# compute_loss=True, so treat the logged values as uninformative -- TODO
# confirm against the installed gensim version.
model.train(
    tagged_data,
    total_examples=len(tagged_data),
    epochs=model.epochs,
    compute_loss=True,
    callbacks=[loss_logger],
)

Epoch: 1	  Loss: 0.0
Epoch: 2	  Loss: 0.0
Epoch: 3	  Loss: 0.0
Epoch: 4	  Loss: 0.0
Epoch: 5	  Loss: 0.0
Epoch: 6	  Loss: 0.0
Epoch: 7	  Loss: 0.0
Epoch: 8	  Loss: 0.0
Epoch: 9	  Loss: 0.0
Epoch: 10	  Loss: 0.0
Epoch: 11	  Loss: 0.0
Epoch: 12	  Loss: 0.0
Epoch: 13	  Loss: 0.0
Epoch: 14	  Loss: 0.0
Epoch: 15	  Loss: 0.0
Epoch: 16	  Loss: 0.0
Epoch: 17	  Loss: 0.0
Epoch: 18	  Loss: 0.0
Epoch: 19	  Loss: 0.0
Epoch: 20	  Loss: 0.0
Epoch: 21	  Loss: 0.0
Epoch: 22	  Loss: 0.0
Epoch: 23	  Loss: 0.0
Epoch: 24	  Loss: 0.0
Epoch: 25	  Loss: 0.0
Epoch: 26	  Loss: 0.0
Epoch: 27	  Loss: 0.0
Epoch: 28	  Loss: 0.0
Epoch: 29	  Loss: 0.0
Epoch: 30	  Loss: 0.0
Epoch: 31	  Loss: 0.0
Epoch: 32	  Loss: 0.0
Epoch: 33	  Loss: 0.0
Epoch: 34	  Loss: 0.0
Epoch: 35	  Loss: 0.0
Epoch: 36	  Loss: 0.0
Epoch: 37	  Loss: 0.0
Epoch: 38	  Loss: 0.0
Epoch: 39	  Loss: 0.0
Epoch: 40	  Loss: 0.0
Epoch: 41	  Loss: 0.0
Epoch: 42	  Loss: 0.0
Epoch: 43	  Loss: 0.0
Epoch: 44	  Loss: 0.0
Epoch: 45	  Loss: 0.0
Epoch: 46	  Loss: 0

### Model Save

In [9]:
# Persist the trained model to the path stored in config.doc2vec, then print
# a short confirmation message.
model.save(config.doc2vec)
print("Model Saved")

Model Saved


# 