In [31]:
import matplotlib.pyplot as plt 
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
import json
import nltk
import csv
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
from nltk import pos_tag

nltk.download('wordnet')
%matplotlib inline

[nltk_data] Downloading package wordnet to /Users/dingyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
news_data = []
with open('output/cleaned_data/cleaned_data_2022_07.json', 'r', encoding='utf-8') as file:
    news_list = json.load(file)
    for news_json in news_list:
        text = news_json['title'] + '.' + news_json['text']
        news_data.append(text)

In [25]:
words=[word_tokenize(data) for data in news_data] #对每个句子进行分词
corpus=TextCollection(words) 
words

[['France',
  'Closes',
  "'Cookies",
  "'",
  'Case',
  'Against',
  'Facebook',
  '-',
  'SecurityWeek.Hi',
  'what',
  'are',
  'you',
  'looking',
  'forFrench',
  'privacy',
  'regulators',
  'on',
  'Thursday',
  'closed',
  'a',
  'case',
  'against',
  'Facebook',
  'after',
  'determining',
  'the',
  'US',
  'tech',
  'giant',
  'had',
  'changed',
  'the',
  'way',
  'it',
  'collected',
  'user',
  'data',
  'to',
  'comply',
  'with',
  'the',
  'lawByFlipboardRedditWhatsappWhatsappEmailAFP',
  '2023Subscribe',
  'to',
  'the',
  'SecurityWeek',
  'Email',
  'Briefing',
  'to',
  'stay',
  'informed',
  'on',
  'the',
  'latest',
  'threats',
  'trends',
  'and',
  'technology',
  'along',
  'with',
  'insightful',
  'columns',
  'from',
  'industry',
  'expertsJoin',
  'us',
  'as',
  'we',
  'delve',
  'into',
  'the',
  'transformative',
  'potential',
  'of',
  'AI',
  'predictive',
  'ChatGPT-like',
  'tools',
  'and',
  'automation',
  'to',
  'detect',
  'and',
  'd

In [27]:
def tokenize(text):
    token_list = nltk.word_tokenize(text)
    # tagged_words = pos_tag(token_list)
    return token_list


def stemmer_words(word):
    stemmer = PorterStemmer()
    # 对每个词进行词干还原
    stemmed_word = stemmer.stem(word)
    return stemmed_word


def lemmatizer_word(word_list):
    tagged_words = pos_tag(word_list)
    tagged_words = clean_by_pos(tagged_words)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for tag in tagged_words:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmatized_words.append(lemmatizer.lemmatize(tag[0], pos=wordnet_pos))
    return lemmatized_words


# 根据词性清理词汇
def clean_by_pos(tagged_words):
    cleaned_tagged_word = []
    include_tag = ["EX", "JJ", "JJR", "JJS", "NN", "NNS", "NNP", "NNPS", "PDT", "RB", "RBR", "RBS", "UH", "VB", "VBD",
                   "VBN", "VBP", "VBZ", "NP", "PP", "VP", "ADJP", "ADVP", "PNP", "-SBJ", "-OBJ"]
    for tagged_word in tagged_words:
        if tagged_word[1] in include_tag:
            cleaned_tagged_word.append(tagged_word)
    return cleaned_tagged_word


# 获取单词的词性
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


def load_security_words():
    nltk.download('stopwords')
    stopwords_security_list = stopwords.words('english')
    with open("other_file/glasgow_stop_words.txt", "r") as f:
        stopwords_list = f.read().split('\n')
        stopwords_security_list.extend(stopwords_list)
    with open("other_file/info_security_stopwords.txt", "r") as s:
        info_security_list = s.read().split(', ')
        stopwords_security_list.extend(info_security_list)
    stopwords_security = set(stopwords_security_list)
    return stopwords_security


def remove_stopwords(stopwords_security, word_list):
    cleaned_list = []
    # 利用词干分析过滤掉停用词
    stem_stopwords = [stemmer_words(word) for word in stopwords_security]
    # 词形还原
    word_list = lemmatizer_word(word_list)
    for word in word_list:
        if stemmer_words(word.lower()) not in stem_stopwords:
            cleaned_list.append(word)
    return cleaned_list


# load stopwords
def load_data(year, start_month, end_month):
    count = start_month
    news_data = []
    while count <= end_month:
        with open('output/cleaned_data/cleaned_data_' + year + '_' + format_month(count) + '.json', 'r', encoding='utf-8') as file:
            news_list = json.load(file)
            for news_json in news_list:
                text = news_json['title'] + '.' + news_json['text']
                news_data.append(text)
        count = count + 1
    print("Total amount of data is " + str(len(news_data)))
    return news_data


def format_month(num):
    formatted_str = str(num).zfill(2)
    return formatted_str


def write_file(keyword_counter, year, start_month, end_month):
    word_dict = dict(keyword_counter)
    sorted_data = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    # 只输出前120个高频词
    with open('output/output_nltk_' + year + '_' + format_month(start_month)
              + '_' + format_month(end_month) + '.json', 'w', encoding='utf-8') as f:
        json.dump(sorted_data[:200], f, indent=4, ensure_ascii=False)


if __name__ == "__main__":

    year = '2023'
    start_month = 1
    end_month = 2

    data = load_data(year, start_month, end_month)
    keyword_list = []
    stopword_list = load_security_words()
    sent = []

    print("start nlkt process")
    for i, content in enumerate(data):
        word_list = tokenize(content)
        word_list_cleaned = remove_stopwords(stopword_list, word_list)
        sent.append(word_list_cleaned)
        keyword_list.extend(word_list_cleaned)
    keyword_counter = Counter(keyword_list)

Total amount of data is 448
start nlkt process


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dingyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
corpus=TextCollection(sent)
corpus

<Text: Porsche halt NFT launch site void.Porsche cut minting...>

In [36]:
word_dict = dict(keyword_counter)
sorted_data = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
# 创建CSV文件并写入数据
with open('output/output_nltk1.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)

    # 写入CSV文件的表头
    writer.writerow(['word', 'freq'])

    # 写入JSON数据
    for item in sorted_data:
        writer.writerow([item[0], corpus.tf_idf(item[0],corpus)])