In [1]:
# coding: utf-8

# # 将class作为文档 进行LDA

# In[ ]:


if __name__ == '__main__':
    from tqdm import tqdm
    import json

    # In[ ]:

    import re
    import numpy as np
    import pandas as pd
    from pprint import pprint

    # Gensim
    import gensim
    import gensim.corpora as corpora
    from gensim.utils import simple_preprocess
    from lda_model_modify import modify_lda_inference

    modify_lda_inference()
    from gensim.models import CoherenceModel

    # spacy for lemmatization
    import spacy

    # Plotting tools
    import pyLDAvis
    import pyLDAvis.gensim  # don't skip this
    import matplotlib.pyplot as plt
    import matplotlib

    matplotlib.use('TkAgg')

    #     plt.switch_backend('agg')

    # get_ipython().run_line_magic('matplotlib', 'inline')

    # Enable logging for gensim - optional
    import logging

    logging.basicConfig(
        level=logging.DEBUG,
        filename='topic.log',
        filemode='a'
    )

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

    import warnings

    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # # 代码相关预处理
    #
    # 根据官方文档扩展代码关键词
    # https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
    #
    # ```json
    # ["abstract","continue","for","new","switch","assert","default","goto","package","synchronized","boolean","do","if","private","this","break","double","implements","protected","throw","byte","else","import","public","throws","case","enum","instanceof","return","transient","catch","extends","int","short","try","char","final","interface","static","void","class","finally","long","strictfp","volatile","const","float","native","super","while"]
    # ```

    # In[ ]:

    stop_words = []
    code_keywords = ["abstract", "continue", "for", "new", "switch", "assert", "default", "goto", "package",
                     "synchronized", "boolean", "do", "if", "private", "this", "break", "double", "implements",
                     "protected", "throw", "byte", "else", "import", "public", "throws", "case", "enum", "instanceof",
                     "return", "transient", "catch", "extends", "int", "short", "try", "char", "final", "interface",
                     "static", "void", "class", "finally", "long", "strictfp", "volatile", "const", "float", "native",
                     "super", "while"]
    stop_words.extend(code_keywords)

    # 加载类数据
    # path_to_token_train= "./total_data/processed_data/tokenized_classes_train.token"
    # path_to_token_train = "rawcode_comment.token"
    path_to_token_train = "C://Users/Thinkpad/Desktop/tokenresult.txt"


    def load_token_data(path_to_token_train):
        data = []
        with open(path_to_token_train, "r") as file:
            lines = file.readlines()
            for line in tqdm(lines):
                # item = json.loads(line)
                # vocabinfo = line.split("+")
                # print(vocabinfo[0])

                data.append(line)
        return data


    data = load_token_data(path_to_token_train)


    # 这里用到了gensim.utils.simple_preprocess,会自动将太长和太短的词过滤掉, 比如x,y, 比如0,1

    # In[ ]:

    def sent_to_words(sentences):
        for sentence in tqdm(sentences):
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


    data_words = list(sent_to_words(data))
    print(data_words[:1])


    # In[ ]:
    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in tqdm(texts)]


    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # In[ ]:

    # Create Dictionary
    id2word = corpora.Dictionary(data_words_nostops)

    # Create Corpus
    texts = data_words_nostops

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # View
    print(corpus[:1])
    print(len(corpus))

    # serialize the corpus
    corpora.MmCorpus.serialize('lda_classes_token_train.mm', corpus)

    # and reload it!
    corpus = gensim.corpora.MmCorpus("lda_classes_token_train.mm")


    # In[ ]:

    def train_lda_model(dictionary, corpus, texts, limit, start=2, step=3, epoch=5):
        """
        Compute c_v coherence for various number of topics
        Parameters:
        ----------
        dictionary : Gensim dictionary
        corpus : Gensim corpus
        texts : List of input texts
        limit : Max num of topics

        Returns:
        -------
        best_model : Best of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        coherence_values = []
        best_value = 0.0
        for num_topics in tqdm(range(start, limit, step)):
            # - emm, do not use multicore...
            #         model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,id2word=id2word,num_topics=num_topics,
            #                                                         random_state=100,chunksize=100,passes=2,
            #                                                         per_word_topics=True,workers=2)
            model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=1000,
                                                    passes=epoch,
                                                    iterations=2,
                                                    alpha='auto',
                                                    per_word_topics=True)
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
            coherence_value = coherencemodel.get_coherence()
            print("k=" + str(num_topics) + ", coherence:" + str(coherence_value))
            if coherence_value > best_value:
                print("best model: k=" + str(num_topics))
                best_value = coherence_value
                best_model = model
            model.save('lda_models_' + str(num_topics) + '_epoch=' + str(
                epoch) + '_chunk=' + str(1000) + '.model')
            coherence_values.append(coherence_value)
        return best_model, coherence_values


    # In[ ]:

    limit = 13
    start = 12
    step = 1
    epoch = 5

    # 训练并获得最优评估值模型
    lda_model, coherence_values = train_lda_model(dictionary=id2word, corpus=corpus, texts=data_words_nostops,
                                                  start=start, limit=limit, step=step, epoch=epoch)

    print(coherence_values)

    # # # In[ ]:

    # # Show graph
    # x = range(start, limit, step)
    # plt.plot(x, coherence_values)
    # plt.xlabel("Num Topics")
    # plt.ylabel("Coherence score")
    # plt.legend(("coherence_values"), loc='best')
    # plt.show()
    # plt.savefig('topicnum_coherence.png', bbox_inches='tight')
    # Visualize the topics
    # pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, 'lda.html')
    # pyLDAvis.show(vis)


100%|████████████████████████████████████████████████████████████████████████| 21832/21832 [00:00<00:00, 949571.67it/s]
100%|██████████████████████████████████████████████████████████████████████████| 21832/21832 [00:02<00:00, 9160.59it/s]
  3%|██▎                                                                         | 678/21832 [00:00<00:03, 6581.43it/s]

[['encode', 'data', 'capacity', 'length', 'string', 'builder', 'bit', 'index', 'bits', 'extract', 'append']]


100%|█████████████████████████████████████████████████████████████████████████| 21832/21832 [00:02<00:00, 10685.53it/s]


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]]
21832


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.66s/it]

k=12, coherence:0.5851921175620755
best model: k=12
[0.5851921175620755]



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [5]:
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [6]:
    pyLDAvis.show(vis)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [05/Dec/2019 10:51:06] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Dec/2019 10:51:06] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [05/Dec/2019 10:51:06] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [05/Dec/2019 10:51:06] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...


In [7]:
for top in lda_model.print_topics(10):
  print (top)

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(top)? (<ipython-input-7-76d9e2622195>, line 2)