In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
import matplotlib.pyplot as plt
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
import json
import nltk
import csv
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
from nltk import pos_tag

nltk.download('wordnet')
%matplotlib inline

[nltk_data] Downloading package wordnet to /Users/dingyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def tokenize(text):
    token_list = nltk.word_tokenize(text)
    # tagged_words = pos_tag(token_list)
    return token_list


def stemmer_words(word):
    stemmer = PorterStemmer()
    # 对每个词进行词干还原
    stemmed_word = stemmer.stem(word)
    return stemmed_word


def lemmatizer_word(word_list):
    tagged_words = pos_tag(word_list)
    tagged_words = clean_by_pos(tagged_words)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for tag in tagged_words:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmatized_words.append(lemmatizer.lemmatize(tag[0], pos=wordnet_pos))
    return lemmatized_words


# 根据词性清理词汇
def clean_by_pos(tagged_words):
    cleaned_tagged_word = []
    include_tag = ["NN", "NNS", "NNP", "NNPS", "NP"]
    for tagged_word in tagged_words:
        if tagged_word[1] in include_tag:
            cleaned_tagged_word.append(tagged_word)
    return cleaned_tagged_word


# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def load_security_words():
    nltk.download('stopwords')
    stopwords_security_list = stopwords.words('english')
    with open("other_file/glasgow_stop_words.txt", "r") as f:
        stopwords_list = f.read().split('\n')
        stopwords_security_list.extend(stopwords_list)
    with open("other_file/info_security_stopwords.txt", "r") as s:
        info_security_list = s.read().split('\n')
        stopwords_security_list.extend(info_security_list)
    stopwords_security = set(stopwords_security_list)
    return stopwords_security


def remove_stopwords(stopwords_security, word_list):
    cleaned_list = []
    # 利用词干分析过滤掉停用词
    stem_stopwords = [stemmer_words(word) for word in stopwords_security]
    # 词形还原
    word_list = lemmatizer_word(word_list)
    for word in word_list:
        if stemmer_words(word.lower()) not in stem_stopwords:
            cleaned_list.append(word)
    return cleaned_list


# load stopwords
def load_data(year, start_month, end_month):
    count = start_month
    news_data = []
    while count <= end_month:
        with open('output/cleaned_data/cleaned_data_' + year + '_' + format_month(count) + '.json', 'r', encoding='utf-8') as file:
            news_list = json.load(file)
            for news_json in news_list:
                text = news_json['title'] + '.' + news_json['text']
                news_data.append(text.lower())
        count = count + 1
    print("Total amount of data is " + str(len(news_data)))
    return news_data


def format_month(num):
    formatted_str = str(num).zfill(2)
    return formatted_str


def write_file(keyword_counter, year, start_month, end_month):
    word_dict = dict(keyword_counter)
    sorted_data = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    # 只输出前120个高频词
    with open('output/nltk_output/output_nltk_' + year + '_' + format_month(start_month)
              + '_' + format_month(end_month) + '_' + 'nation' + '.json', 'w', encoding='utf-8') as f:
        json.dump(sorted_data, f, indent=4, ensure_ascii=False)


In [13]:
year = '2023'
start_month = 4
end_month = 6

data = load_data(year, start_month, end_month)
keyword_list = []
stopword_list = load_security_words()
sent=[]

print("start nlkt process")
for i, content in enumerate(data):
    word_list = tokenize(content)
    word_list_cleaned = remove_stopwords(stopword_list, word_list)
    sent.append(word_list_cleaned)
    keyword_list.extend(word_list_cleaned)

# year = '2023'
# start_month = 1
# end_month = 11
# data = load_data(year, start_month, end_month)
# for i, content in enumerate(data):
#     word_list = tokenize(content)
#     word_list_cleaned = remove_stopwords(stopword_list, word_list)
#     sent.append(word_list_cleaned)
#     keyword_list.extend(word_list_cleaned)
    
keyword_counter = Counter(keyword_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dingyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Total amount of data is 646
start nlkt process


In [14]:
# 构建词典
dictionary = corpora.Dictionary(sent)

# 将文本转换为词袋表示
bow_corpus = [dictionary.doc2bow(doc) for doc in sent]

# 训练LDA模型
num_topics = 5  # 设置主题数
lda_model = models.LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=10)

# 输出每个主题的关键词
for topic_id in range(num_topics):
    print(f"Topic #{topic_id + 1}:")
    topic_keywords = lda_model.show_topic(topic_id, topn=2)
    keywords = [word for word, _ in topic_keywords]
    print(keywords)

Topic #1:
['ransomware', 'malware']
Topic #2:
['malware', 'ransomware']
Topic #3:
['email', 'malware']
Topic #4:
['malware', 'website']
Topic #5:
['snake', 'ransomware']


In [15]:
# from gensim.models import Word2Vec
# import matplotlib.pyplot as plt

# # 准备文本数据
# texts = [
#     "The COVID-19 pandemic has caused global disruptions.",
#     "Efforts are being made to develop effective vaccines.",
#     "Lockdown measures have been implemented to control the spread.",
#     "Economic recovery is a major concern amidst the crisis.",
#     "New variants of the virus are being closely monitored.",
#     "Vaccination campaigns are being rolled out worldwide."
# ]

# # 数据预处理（分词等）
# corpus = [text.lower().split() for text in texts]

# # 训练Word2Vec模型
# model = Word2Vec(corpus, vector_size=100, window=5, min_count=1)

# # 获取与给定词语相关的热点词汇
# target_word = "pandemic"
# similar_words = model.wv.most_similar(target_word)

# # 打印热点词汇
# print(f"Words related to '{target_word}':")
# for word, similarity in similar_words:
#     print(f"{word}: {similarity:.4f}")

# # 绘制词向量变化图
# words_to_plot = [target_word] + [word for word, _ in similar_words]
# vectors = [model.wv[word] for word in words_to_plot]

# plt.figure(figsize=(10, 6))
# for i, word in enumerate(words_to_plot):
#     plt.scatter(vectors[i][0], vectors[i][1])
#     plt.annotate(word, (vectors[i][0], vectors[i][1]))

# plt.xlabel("Dimension 1")
# plt.ylabel("Dimension 2")
# plt.title("Word Embeddings")
# plt.grid(True)
# plt.show()