In [1]:
import pandas as pd
import nltk
from nltk.util import bigrams, trigrams
from nltk.corpus import stopwords
import unicodedata
import re

In [2]:
# Domain-specific stop words added on top of NLTK's English list: outlet
# boilerplate ('guardian', 'theguardian', 'com', ...) and generic words that
# would otherwise dominate the n-gram counts.
_EXTRA_STOP_WORDS = frozenset([
    'guardian', 'article', 'theguardian', 'com', 'says', 'said', 'just', 'like', 'can', 'one', 'also',
    'year', 'years', 'time', 'times', 'world', 'make', 'makes', 'made', 'know', 'known', 'go', 'south',
    'going', 'get', 'getting', 'got', 'see', 'seeing', 'seen', 'may', 'might', 'week', 'weeks', 'month', 'months',
    'use', 'used', 'using', 'think', 'thinks', 'thought', 'take', 'takes', 'took', 'come', 'comes', 'came',
    'way', 'ways', 'many', 'much', 'news', 'report', 'including', 'good', 'bad', 'look', 'looks',
    'looking', 'help', 'want', 'wants', 'wanted', 'need', 'needs', 'needed', 'important', 'lot', 'lots', 'tell',
    'tells', 'told', 'work', 'works', 'worked', 'place', 'places', 'point', 'points', 'number', 'numbers',
    'group', 'groups', 'man', 'men', 'woman', 'women', 'child', 'children', 'company', 'companies', 'zealand',
    'york',
])

# Anything that is neither a word character nor whitespace is deleted (not
# replaced by a space), so "guardian.com" becomes the single token "guardiancom".
# NOTE(review): 'theguardian' and 'com' in the stop list suggest splitting on
# punctuation may have been intended -- confirm before changing.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Lazily built (lemmatizer, stop_words) pair shared by every call to clean().
_CLEAN_RESOURCES = None


def _get_clean_resources():
    """Return the shared (lemmatizer, stop_words) pair, building it on first use."""
    global _CLEAN_RESOURCES
    if _CLEAN_RESOURCES is None:
        lemmatizer = nltk.stem.WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english')) | _EXTRA_STOP_WORDS
        # Assign only after both pieces were built, so a failure while loading
        # the corpus does not leave a half-initialised cache behind.
        _CLEAN_RESOURCES = (lemmatizer, stop_words)
    return _CLEAN_RESOURCES


def clean(text):
    """Normalise a piece of text into a list of lemmatized, non-stop-word tokens.

    Steps:
      1. NFKD-normalise and drop every character that cannot be encoded as
         ASCII (accented letters lose their accents, other symbols vanish).
      2. Lower-case, delete punctuation, split on whitespace.
      3. Drop stop words, lemmatize the remaining tokens with WordNet, and
         drop any token whose *lemma* is a stop word as well.

    Step 3 filters twice on purpose: filtering only before lemmatization lets
    inflected forms slip through (e.g. "articles" is not a stop word, but its
    lemma "article" is).

    Args:
        text: The input string.

    Returns:
        list[str]: Cleaned tokens in their original order.
    """
    # The lemmatizer and stop-word set do not depend on `text`; building them
    # on every call was the dominant cost when cleaning a whole column.
    lemmatizer, stop_words = _get_clean_resources()

    ascii_text = (unicodedata.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    tokens = _PUNCTUATION_RE.sub('', ascii_text).split()

    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [5]:
# Adapted from filter_and_save_data: returns the filtered DataFrame instead of saving it.
def filter_data(df, start_date, end_date, section_names):
    """Return the rows of `df` published within a date window and in given sections.

    The input frame is left untouched; the returned frame is a new object whose
    'webPublicationDate' column holds the parsed datetimes.

    Args:
        df: DataFrame with 'webPublicationDate' and 'sectionName' columns.
        start_date: Inclusive lower bound, anything comparable with the parsed
            dates (e.g. a 'YYYY-MM-DD' string or a Timestamp).
        end_date: Inclusive upper bound, same kinds of value as `start_date`.
        section_names: Collection of 'sectionName' values to keep.

    Returns:
        pandas.DataFrame: The matching rows, in their original order.
    """
    # Parse into a separate Series rather than assigning back onto `df`:
    # overwriting the caller's column was a hidden side effect that made the
    # calling cell non-idempotent.
    published = pd.to_datetime(df['webPublicationDate'])

    # NOTE(review): a date-only end_date such as '2022-03-31' is compared as
    # midnight, so rows stamped later on that day are excluded -- confirm this
    # is the intended window.
    in_window = published.between(start_date, end_date)
    in_section = df['sectionName'].isin(section_names)
    keep = in_window & in_section

    return df.loc[keep].assign(webPublicationDate=published.loc[keep])


In [7]:
# Analysis scope: which sections to keep and the inclusive publication window.
section_names = ['Australia news']
start_date, end_date = '2020-01-01', '2022-03-31'

In [8]:
# Load the full CSV, then narrow it to the configured sections and date window.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is made configurable.
CSV_PATH = '/Users/neo/Documents/bert/bert_final2.csv'
data = pd.read_csv(CSV_PATH)
filtered_data = filter_data(
    df=data,
    start_date=start_date,
    end_date=end_date,
    section_names=section_names,
)

In [10]:
# Collect every bigram and trigram across the filtered rows. N-grams are
# built per row, so they never span two different texts.
all_bigrams, all_trigrams = [], []
for text in filtered_data['Tweet']:
    # Coerce to str so non-string cells (e.g. missing values) do not break clean().
    cleaned_text = clean(str(text))
    # bigrams()/trigrams() yield tuples lazily; the lists consume them directly.
    all_bigrams += bigrams(cleaned_text)
    all_trigrams += trigrams(cleaned_text)

# Show the first 10 of each as a quick sanity check.
print("Bigrams:", all_bigrams[:10])
print("Trigrams:", all_trigrams[:10])

Bigrams: [('nope', 'banking'), ('banking', 'royal'), ('royal', 'commissioner'), ('commissioner', 'kenneth'), ('kenneth', 'haynes'), ('haynes', 'response'), ('response', 'asked'), ('asked', 'shake'), ('shake', 'hand'), ('hand', 'treasurer')]
Trigrams: [('nope', 'banking', 'royal'), ('banking', 'royal', 'commissioner'), ('royal', 'commissioner', 'kenneth'), ('commissioner', 'kenneth', 'haynes'), ('kenneth', 'haynes', 'response'), ('haynes', 'response', 'asked'), ('response', 'asked', 'shake'), ('asked', 'shake', 'hand'), ('shake', 'hand', 'treasurer'), ('hand', 'treasurer', 'josh')]
