# Data Load

In [1]:
import config
from pshmodule.utils import filemanager as fm

In [2]:
df = fm.load(config.temp_data)

extension : .pickle
Loaded 9150 records from /Volumes/GoogleDrive/내 드라이브/MemeProject/data/temp_for_doc2vec.pickle


In [3]:
df.head(20)

1,content,pos
0,나 중간고사 반에서 1등했어,"[나, 중간고사, 반]"
1,중간고사 점수 내가 반에서 제일 잘 받음,"[중간고사, 점수, 내, 반, 제일, 받음]"
2,나 반에서 중간 성적 제일 좋아,"[나, 반, 중간, 성적, 제일, 좋아]"
3,우리 반에서 내가 시험 제일 잘 봤다,"[우리, 반, 내, 시험, 제일, 봤다]"
4,중간고사 반 1등 먹음,"[중간고사, 반, 먹음]"
5,중간시험 내가 반 1등이야,"[중간, 시험, 내, 반]"
6,너 왜 이렇게 멍청하냐?,"[너, 왜, 이렇게, 멍청하냐]"
7,이 새끼 빡대가리잖아?,"[새끼, 빡, 대가리, 잖아]"
8,너 말귀 못 알아듣냐고,"[너, 말귀, 못, 알아듣냐고]"
9,병신 이해력 딸리네,"[병신, 이해력, 딸리네]"


# 

# word2vec

In [4]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

### loss logger

In [5]:
class LossLogger(CallbackAny2Vec):
    '''Output loss at each epoch'''
    def __init__(self):
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        # self.losses.append(loss)
        print(f'  Loss: {loss}')
        self.epoch += 1

### train

In [6]:
print('\tPreparing model with the following parameters: epochs = {}, vector_size = {}, alpha = {} .. '.
      format(config.max_epochs, config.vec_size, config.alpha))

model = Word2Vec(sentences=df.pos,
                 vector_size=config.vec_size,
                 window=config.window,
                 min_count=config.min_count,
                 workers=config.workers,
                 epochs=config.max_epochs,
                 sg = 1) # 0: CBOW, 1: skip-gram

model.build_vocab(df.pos)

	Preparing model with the following parameters: epochs = 100, vector_size = 1024, alpha = 0.01 .. 


In [7]:
model.train(df.pos,
            epochs=model.epochs,
            total_examples=len(df.pos),
            compute_loss=True,
            callbacks=[LossLogger()]
           )

Epoch: 1	  Loss: 129132.78125
Epoch: 2	  Loss: 239714.171875
Epoch: 3	  Loss: 341254.0
Epoch: 4	  Loss: 435742.34375
Epoch: 5	  Loss: 521429.34375
Epoch: 6	  Loss: 602579.9375
Epoch: 7	  Loss: 677772.1875
Epoch: 8	  Loss: 745428.0625
Epoch: 9	  Loss: 811287.0625
Epoch: 10	  Loss: 872780.5625
Epoch: 11	  Loss: 930065.375
Epoch: 12	  Loss: 984124.8125
Epoch: 13	  Loss: 1035121.75
Epoch: 14	  Loss: 1082433.0
Epoch: 15	  Loss: 1126924.375
Epoch: 16	  Loss: 1169576.5
Epoch: 17	  Loss: 1210505.0
Epoch: 18	  Loss: 1249918.125
Epoch: 19	  Loss: 1287432.5
Epoch: 20	  Loss: 1323972.25
Epoch: 21	  Loss: 1359360.125
Epoch: 22	  Loss: 1393396.875
Epoch: 23	  Loss: 1426436.875
Epoch: 24	  Loss: 1459153.625
Epoch: 25	  Loss: 1490963.75
Epoch: 26	  Loss: 1522588.25
Epoch: 27	  Loss: 1553817.375
Epoch: 28	  Loss: 1583895.25
Epoch: 29	  Loss: 1613316.125
Epoch: 30	  Loss: 1642466.375
Epoch: 31	  Loss: 1671546.25
Epoch: 32	  Loss: 1700002.875
Epoch: 33	  Loss: 1728190.0
Epoch: 34	  Loss: 1755660.875
Epoc

(4196574, 4601800)

# 

# model save

In [8]:
model.save(config.word2vec)
print("Model Saved")

Model Saved


# 