In [17]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import os
import jieba
import matplotlib.pyplot as plt
from glob import glob


from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot, text_to_word_sequence, Tokenizer
from keras.models import Sequential,Model,load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from keras.callbacks import EarlyStopping,ModelCheckpoint
import keras.backend as K


In [2]:
# myQuery = '''
# SELECT * FROM pushs
# '''

# db_name = 'ptt_nba.db'
# path=os.path.join('..','crawler',db_name)
# url = 'sqlite:///%s'%(path)
# print(path)

# engine = create_engine(url)

# pushs = pd.read_sql_query(myQuery, engine)

def get_table(db_name='ptt_gossiping.db',table_name='pushs'):
    myQuery = '''
    SELECT * FROM %s
    '''%(table_name)
    
    path=os.path.join('..','crawler',db_name)
    url = 'sqlite:///%s'%(path)

    engine = create_engine(url)

    pushs = pd.read_sql_query(myQuery, engine)
    return pushs

pushs=get_table(db_name='ptt_gossiping.db',table_name='pushs')
pushs.head()

Unnamed: 0,id,article_id,push,user,content,ip,datetime
0,1,1561970056,→,Sougetu,這智商真的很高,36.227.69.25,2019-07-01 16:34:00.000000
1,2,1561970056,→,Rrrxddd,隱+30,126.35.31.169,2019-07-01 16:34:00.000000
2,3,1561970056,→,bonfferoni,八卦-1,27.52.2.223,2019-07-01 16:34:00.000000
3,4,1561970056,推,ebod221,北七,111.249.251.194,2019-07-01 16:35:00.000000
4,5,1561970056,推,yaritai,+1,42.73.134.151,2019-07-01 16:35:00.000000


In [3]:
print(pushs.shape)

(834960, 7)


In [4]:
def train_or_test(x):
    if x in ['推','噓']:
        return 'train'
    else:
        return 'test'

def get_push_value(x):
    if x == '推': return 1
    elif x == '噓': return 0
    else: return 'x'
    
    
# pushs['train_or_test'] = pushs['push'].apply(lambda x: train_or_test(x))
# pushs['push_value'] = pushs['push'].apply(lambda x: get_push_value(x))
# pushs = pushs[['train_or_test','content','push_value']]
# pushs

def process_pushs(pushs):
    pushs['train_or_test'] = pushs['push'].apply(lambda x: train_or_test(x))
    pushs['push_value'] = pushs['push'].apply(lambda x: get_push_value(x))
    columns=['id', 'article_id', 'push', 'user','ip', 'datetime', 'train_or_test', 'push_value', 'content']
    pushs = pushs[columns]
    return pushs

pushs_process=process_pushs(pushs)
pushs_process.head()


Unnamed: 0,id,article_id,push,user,ip,datetime,train_or_test,push_value,content
0,1,1561970056,→,Sougetu,36.227.69.25,2019-07-01 16:34:00.000000,test,x,這智商真的很高
1,2,1561970056,→,Rrrxddd,126.35.31.169,2019-07-01 16:34:00.000000,test,x,隱+30
2,3,1561970056,→,bonfferoni,27.52.2.223,2019-07-01 16:34:00.000000,test,x,八卦-1
3,4,1561970056,推,ebod221,111.249.251.194,2019-07-01 16:35:00.000000,train,1,北七
4,5,1561970056,推,yaritai,42.73.134.151,2019-07-01 16:35:00.000000,train,1,+1


In [5]:
# jieba_dit_path = os.path.join('.','dataset','dict.txt.big')
# jieba.set_dictionary(jieba_dit_path)

def set_jieba_dictionary():
    jieba_dit_path = os.path.join('.','dataset','dict.txt.big')
    jieba.set_dictionary(jieba_dit_path) 

set_jieba_dictionary()

In [6]:
# pushs['content_cut'] = pushs['content'].apply(lambda x: jieba.lcut(x))
# pushs['content_cut_join'] = pushs['content_cut'].apply(lambda x: ' '.join(x))
# pushs

def cut_content_by_jieba(pushs):
    pushs = pushs.copy()
    pushs['content_cut'] = pushs['content'].apply(lambda x: jieba.lcut(x))
    pushs['content_cut_join'] = pushs['content_cut'].apply(lambda x: ' '.join(x))
    return pushs

pushs_cut = cut_content_by_jieba(pushs_process)
pushs_cut.head()

Building prefix dict from /media/disk3/feynman52/See26/model/dataset/dict.txt.big ...
Loading model from cache /tmp/jieba.uc8f590617510cc546ef7e6a3a5db2cd4.cache
Loading model cost 1.543 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,id,article_id,push,user,ip,datetime,train_or_test,push_value,content,content_cut,content_cut_join
0,1,1561970056,→,Sougetu,36.227.69.25,2019-07-01 16:34:00.000000,test,x,這智商真的很高,"[這, 智商, 真的, 很, 高]",這 智商 真的 很 高
1,2,1561970056,→,Rrrxddd,126.35.31.169,2019-07-01 16:34:00.000000,test,x,隱+30,"[隱, +, 30]",隱 + 30
2,3,1561970056,→,bonfferoni,27.52.2.223,2019-07-01 16:34:00.000000,test,x,八卦-1,"[八卦, -, 1]",八卦 - 1
3,4,1561970056,推,ebod221,111.249.251.194,2019-07-01 16:35:00.000000,train,1,北七,[北七],北七
4,5,1561970056,推,yaritai,42.73.134.151,2019-07-01 16:35:00.000000,train,1,+1,"[+, 1]",+ 1


In [7]:
def get_pushs_all_train_test(pushs_cut):
    pushs_all=pushs_cut
    pushs_train=pushs_cut[pushs_cut.train_or_test=='train'].copy().reset_index(drop=True)
    pushs_test=pushs_cut[pushs_cut.train_or_test=='test'].copy().reset_index(drop=True)
    return pushs_all,pushs_train,pushs_test

pushs_all,pushs_train,pushs_test=get_pushs_all_train_test(pushs_cut)

In [8]:
# num_words = 20000
# tok = Tokenizer(num_words=num_words,
#                 filters='"!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
#                 lower=True,
#                 split=' ', 
#                 char_level=False)

# tok.fit_on_texts(pushs_cut.content_cut_join)

def train_tokenizer(pushs_cut,num_words=20000):
    tok = Tokenizer(num_words=num_words,
                    filters='"!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    lower=True,
                    split=' ', 
                    char_level=False)

    tok.fit_on_texts(pushs_cut.content_cut_join)
    return tok


num_words=50000
tok = train_tokenizer(pushs_all,num_words = num_words)

In [9]:
# tk = Tokenizer()
# texts = ["a b", "a c d a"]
# tk.fit_on_texts(texts)
# tk.num_words = 5

# print(tk.word_counts)
# print(tk.word_index)
# print(tk.texts_to_sequences(texts))


In [10]:
word_freq = pd.DataFrame({'word':list(tok.word_index.keys()),'freq':list(tok.word_index.values())})
word_freq = word_freq.sort_values(by='freq',ascending=True)
word_freq.tail()


Unnamed: 0,word,freq
155208,vu,155209
155209,這不把,155210
155210,簫查,155211
155211,衰死,155212
155212,女酸宅,155213


In [11]:
# x_train = tok.texts_to_sequences(pushs_cut.content_cut_join)
# maxlen = 50
# x_train = pad_sequences(x_train, maxlen=maxlen, padding='pre')
# x_train = np.array(x_train)
# x_train.shape
# x_train[0][-10:]

def get_xtrain_ytrain_xtest(pushs_train,pushs_test,maxlen= 50):    
    x_train = tok.texts_to_sequences(pushs_train.content_cut_join)
    x_train = pad_sequences(x_train, maxlen=maxlen, padding='pre')
    x_train = np.array(x_train)
    
    y_train = np.array(pushs_train.push_value)
    
    x_test = tok.texts_to_sequences(pushs_test.content_cut_join)
    x_test = pad_sequences(x_test, maxlen=maxlen, padding='pre')
    x_test = np.array(x_test)
    
    return x_train,y_train,x_test

maxlen= 50
x_train,y_train,x_test = get_xtrain_ytrain_xtest(pushs_train,pushs_test,maxlen= maxlen)
x_train.shape,y_train.shape,x_test.shape

((590211, 50), (590211,), (244749, 50))

In [12]:
# maxlen = 50
# sentences=['欠 噓','推']
# sentences=tok.texts_to_sequences(sentences)
# sentences=pad_sequences(sentences, maxlen=maxlen, padding='pre')
# sentences = np.array(sentences)
# sentences.shape

def get_sentences(sentences=['欠 噓','推']):
    maxlen = 50
    sentences=tok.texts_to_sequences(sentences)
    sentences=pad_sequences(sentences, maxlen=maxlen, padding='pre')
    sentences = np.array(sentences)
    return sentences

sentences=get_sentences(sentences=['欠 噓','推'])
sentences.shape

(2, 50)

In [13]:
i=20
print(pushs_train.content_cut_join[i])
print(x_train[i])
word_freq[word_freq.word.isin(pushs_train.content_cut_join[i].split(' '))] # G5 => g5


一邊 造神   一邊 造魔   草包 行徑
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  931 2045  931  326 9790]


Unnamed: 0,word,freq
325,草包,326
930,一邊,931
2044,造神,2045
9789,行徑,9790
72390,造魔,72391


In [14]:
num_words,maxlen

(50000, 50)

In [15]:
# # https://github.com/keras-team/keras/blob/master/keras/layers/embeddings.py

# # (3353, 50) => (3353, 50, 200000) => (3353, 50, 128) => (3353, 1)
# # (embedding                                        )            
# model = Sequential()
# model.add(Embedding(input_dim=num_words, output_dim=128))
# model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.2))
# model.add(Dense(1, activation='sigmoid'))

# model.summary()

def get_model(num_words,maxlen):    
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=100, input_length=maxlen))
    
    
#     model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
#     model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
#     model.add(Dense(100,activation='relu'))
#     model.add(Dropout(0.5))
    
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

model=get_model(num_words,maxlen)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           5000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 5,085,501
Trainable params: 5,085,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
# early_stopping = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=10, min_delta=0.001)

# batch_size=50
# epochs=100

# history = model.fit(x_train, y_train,
#                   batch_size=batch_size,
#                   epochs=epochs,
#                   validation_split=0.25,
#                     callbacks=[early_stopping])

def get_history_and_train_model():
    early_stopping = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=10, min_delta=0.001)
    
    file_name = 'sentiment'
    file_name += '-'+"epoch_{epoch:02d}-val_acc_{val_acc:.3f}.hdf5"
    path=os.path.join('.','dataset','weights',file_name)
    checkpointer = ModelCheckpoint(filepath=path, verbose=1, monitor='val_acc', save_best_only=True, mode='max', period=1)
    
    batch_size=1000
    epochs=100

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        shuffle=True,
                        validation_split=0.1,
                        callbacks=[early_stopping,checkpointer])
    
    return history

history=get_history_and_train_model()

In [None]:
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('LSTM', fontsize=30)
# plt.ylabel('acc', fontsize=20)
# plt.xlabel('epoch', fontsize=20)
# plt.legend(['train', 'val'], loc='upper right')
# plt.show()

def plot_history(history):
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('LSTM', fontsize=30)
    plt.ylabel('acc', fontsize=30)
    plt.xlabel('epoch', fontsize=20)
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()
    
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('LSTM', fontsize=30)
    plt.ylabel('cross entropy', fontsize=30)
    plt.xlabel('epoch', fontsize=20)
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()
    
plot_history(history)

- load best model

In [18]:
def get_acc(path):
    left=path.index('acc_')+len('acc_')
    right=path.index('.hdf5')
    acc=path[left:right]
    return acc

def load_best_model(file_name = 'sentiment'):
    file_name+='*'
    path=os.path.join('.','dataset','weights',file_name)
    path_list=glob(path)
    
    path_list=[[float(get_acc(path)),path] for path in path_list]
    path_list.sort()
    path=path_list[0][1]
    
    best_model=load_model(path)
    return best_model

best_model=load_best_model(file_name = 'sentiment')
best_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 100)           5000000   
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_15 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 51        
Total params: 5,085,501
Trainable params: 5,085,501
Non-trainable params: 0
_________________________________________________________________


In [25]:
def get_predict_value(x_test,best_model):
    y_test=best_model.predict(x_test)
    return y_test

y_test=get_predict_value(x_test[:],best_model)
y_test.shape

(244749, 1)

In [31]:
# pushs_test['push_value2']=y_test
# pushs_test['push_value3']=pushs_test['push_value2'].apply(lambda x: int(round(x)))
# pushs_test

# pushs_train['push_value2']=pushs_train['push_value']
# pushs_train['push_value3']=pushs_train['push_value']
# pushs_train

# pushs_all_after_process = pushs_train.append(pushs_test)
# pushs_all_after_process = pushs_all_after_process.reset_index(drop=True)
# pushs_all_after_process

def get_pushs_all_after_process(pushs_train,pushs_test,y_test):
    pushs_test['push_value2']=y_test
    pushs_test['push_value3']=pushs_test['push_value2'].apply(lambda x: int(round(x)))
    pushs_train['push_value2']=pushs_train['push_value']
    pushs_train['push_value3']=pushs_train['push_value']
    pushs_all_after_process = pushs_train.append(pushs_test)
    pushs_all_after_process = pushs_all_after_process.reset_index(drop=True)
    return pushs_all_after_process
    
pushs_all_after_process=get_pushs_all_after_process(pushs_train,pushs_test,y_test)
pushs_all_after_process.tail()

Unnamed: 0,id,article_id,push,user,ip,datetime,train_or_test,push_value,content,content_cut,content_cut_join,push_value2,push_value3
834955,834925,1445086702,→,pilitiger,,2015-10-18 07:09:00.000000,test,x,下一步：親友團開始上網反80嘍～～～,"[下, 一步, ：, 親友團, 開始, 上網, 反, 80, 嘍, ～, ～, ～]",下 一步 ： 親友團 開始 上網 反 80 嘍 ～ ～ ～,0.857851,1
834956,834927,1445086702,→,mojingri,,2015-10-18 07:16:00.000000,test,x,干我屁事,"[干, 我, 屁事]",干 我 屁事,0.138016,0
834957,834949,1445086702,→,bearweb,,2015-10-18 12:27:00.000000,test,x,客就不是人？ 爛貨,"[客, 就, 不是, 人, ？, , 爛貨]",客 就 不是 人 ？ 爛貨,0.625004,1
834958,834953,1445086702,→,icespeech,,2015-10-18 13:56:00.000000,test,x,真的好感人 > <　八卦仇女酸宅不懂啦,"[真的, 好感, 人, , >, , <, , 八卦, 仇, 女酸宅, 不, 懂, 啦]",真的 好感 人 > < 八卦 仇 女酸宅 不 懂 啦,0.850088,1
834959,834957,1445086702,→,omolando,,2015-10-18 16:36:00.000000,test,x,為了綠卡敢這樣玩,"[為, 了, 綠卡, 敢, 這樣, 玩]",為 了 綠卡 敢 這樣 玩,0.850335,1


In [42]:
# check performance
c1=pushs_all_after_process.push_value3==0
c2=pushs_all_after_process.train_or_test=='test'
pushs_all_after_process[c1&c2][['user','content','push_value','push_value3']].head()

Unnamed: 0,user,content,push_value,push_value3
590222,white930,台灣人 真好騙,x,0
590226,c780412,洨咬腦,x,0
590230,SiaoHan,那種咖洨才會關心這個問題,x,0
590236,r13974682,禁政治臉書哪能轉,x,0
590240,haofutw,丟臉,x,0


In [29]:
articles=get_table(db_name='ptt_gossiping.db',table_name='articles')
articles.head(2)

Unnamed: 0,id,push,title,href,author,board,ip,date
0,1374407029,爆,[爆卦] 九把刀臉書:尋找五年前蔡學良死亡現場的,www.ptt.cc/bbs/Gossiping/M.1374407029.A.D7A.html,clothg51804,Gossiping,,2019-07-21
1,1377010051,X4,[爆卦] 公布宜蘭縣長臉書,www.ptt.cc/bbs/Gossiping/M.1377010051.A.B0B.html,suewen,Gossiping,,2019-08-20


In [30]:
# pushs_and_articles=pd.merge(left=pushs_all_after_process,right=articles,how='left',left_on='article_id',right_on='id')
# pushs_and_articles=pushs_and_articles.rename(columns={'id_x':'push_id',
#                                                       'push_x':'is_push',
#                                                       'push_y':'push_num',
#                                                       'date':'article_dt',
#                                                       'datetime':'push_dt',
#                                                       'push_value3':'label',
#                                                       'author':'article_author',
#                                                       'user':'push_author',})

# columns=['push_id', 'article_id', 'is_push', 'push_author', 'push_dt',
#        'train_or_test', 'content', 'label', 'push_num',
#        'title', 'href', 'article_author', 'article_dt']

# pushs_and_articles=pushs_and_articles[columns]

# pushs_and_articles.head(2)

def get_pushs_and_articles(pushs_all_after_process,articles):
    pushs_and_articles=pd.merge(left=pushs_all_after_process,right=articles,how='left',left_on='article_id',right_on='id')
    pushs_and_articles=pushs_and_articles.rename(columns={'id_x':'push_id',
                                                          'push_x':'is_push',
                                                          'push_y':'push_num',
                                                          'date':'article_dt',
                                                          'datetime':'push_dt',
                                                          'push_value3':'label',
                                                          'author':'article_author',
                                                          'user':'push_author',})

    columns=['push_id', 'is_push', 'push_author', 'push_dt',
           'train_or_test', 'content', 'label', 'article_id', 'push_num',
           'title', 'href', 'article_author', 'article_dt']

    pushs_and_articles=pushs_and_articles[columns]
    
    return pushs_and_articles

pushs_and_articles=get_pushs_and_articles(pushs_all_after_process,articles)
pushs_and_articles.head(2)

Unnamed: 0,push_id,is_push,push_author,push_dt,train_or_test,content,label,article_id,push_num,title,href,article_author,article_dt
0,4,推,ebod221,2019-07-01 16:35:00.000000,train,北七,1,1561970056,42,[新聞] 臉書見「國中生外流」爽喊+1 人夫手機遭,www.ptt.cc/bbs/Gossiping/M.1561970056.A.18A.html,peterlin495,2019-07-01
1,5,推,yaritai,2019-07-01 16:35:00.000000,train,+1,1,1561970056,42,[新聞] 臉書見「國中生外流」爽喊+1 人夫手機遭,www.ptt.cc/bbs/Gossiping/M.1561970056.A.18A.html,peterlin495,2019-07-01


In [43]:
pushs_and_articles.shape

(834960, 13)

In [39]:
path=os.path.join('.','dataset','gossiping.pickle')
pushs_and_articles.to_pickle(path)