In [3]:
import time


def get_numbers_list(n):
    """Return the integers 0 .. n-1 as a fully materialized list.

    All ``n`` values are created up front, so the complete list exists in
    memory as soon as the function returns.
    """
    # list(range(n)) builds the same list as the identity comprehension
    # [x for x in range(n)], without a per-item Python-level loop.
    return list(range(n))


def get_numbers_generator(n):
    """Lazily produce the integers 0 .. n-1, one value per iteration step.

    Calling this function only creates a generator object; no value is
    produced until the generator is iterated.
    """
    for value in range(n):
        # Execution pauses here until the consumer asks for the next value.
        yield value


# Usage comparison: time building the list, creating the generator, and
# pulling the first value out of the generator.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock intended for measuring short intervals.
start_time = time.perf_counter()
numbers_list = get_numbers_list(1000000)      # all values are materialized immediately
end_time = time.perf_counter()
print(f"列表生成时间: {end_time - start_time} 秒")

start_time = time.perf_counter()
numbers_gen = get_numbers_generator(1000000)   # only the generator object is created; no values yet
end_time = time.perf_counter()
print(f"生成器生成时间: {end_time - start_time} 秒")

start_time = time.perf_counter()
# Only the first value is pulled before breaking, so this measures the cost
# of starting iteration, not of consuming all 1,000,000 values.
for num in numbers_gen:
    break
end_time = time.perf_counter()
print(f"生成器使用时间: {end_time - start_time} 秒")

列表生成时间: 0.020906925201416016 秒
生成器生成时间: 2.7894973754882812e-05 秒
生成器使用时间: 3.314018249511719e-05 秒


In [3]:
from collections import UserDict


class MyDict(UserDict):
    """A UserDict whose lookups of absent keys yield 0 instead of raising KeyError."""

    # Override __missing__ (not __setitem__): this is the hook that
    # UserDict's item lookup calls when the requested key is not present.
    def __missing__(self, key):
        # Return the default value 0 for an absent key. The key is NOT
        # inserted here; it only gets stored if the caller assigns it.
        return 0


a = MyDict()
# `+=` first reads a['count'] (absent, so __missing__ supplies 0) and then
# stores 0 + 1 back under the same key.
a['count'] += 1

a

{'count': 1}

In [11]:
from collections import Counter

# Count the initial items: a=2, b=1, c=1.
counter = Counter(['a', 'b', 'c', 'a'])
# update() adds to the existing counts rather than replacing them.
counter.update(['a', 'b', 'c', 'a', 'c', 'c', 'c'])
# (Calling counter.clear() at this point would empty the counter.)
print(counter)
# With no argument, most_common() returns every (element, count) pair,
# ordered from the highest count to the lowest.
counter.most_common()

Counter({'c': 5, 'a': 4, 'b': 2})


[('c', 5), ('a', 4), ('b', 2)]

In [2]:
a = [1, 2, 3]
b = [4, 5, 6]

# extend() appends every element of b to a in place and returns None;
# b itself is left unchanged.
a.extend(b)
a

[1, 2, 3, 4, 5, 6]

In [6]:
import os
import pandas as pd

file_path = 'test.txt'
# Each line is expected to look like "<word> <freq>". The word part may itself
# contain (or be) a space, so the split is done on the LAST space only.
# NOTE(review): utf-8 is assumed — adjust if test.txt uses another encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Strip only the trailing newline so `freq` prints without an extra
        # blank line; other whitespace is kept because it can be part of the word.
        line = line.rstrip('\n')
        if not line:
            # A blank line has no "<word> <freq>" pair and would fail to unpack.
            continue
        word, freq = line.rsplit(' ', 1)
        print('word:', word, 'freq:', freq)

word: first freq: 1

word: second freq: 2

word:   freq: 3


In [3]:
from typing import TypedDict


class NewsItem(TypedDict):
    """Typed dictionary describing one news item.

    All three keys are required (TypedDict is total by default). The
    annotations are for static type checkers; at runtime an instance is a
    plain dict.
    """
    index: int    # integer index of the item
    title: str    # title text
    content: str  # content text


# A TypedDict instance is a plain dict at runtime, so an annotated literal
# with the same keys builds the same object as calling NewsItem(...).
a: NewsItem = {'index': 1, 'title': 'title', 'content': 'content'}

In [6]:
import numpy as np
import math


def word_idf(word_in_doc_count: int, total_doc_count: int):
    """Compute a smoothed inverse document frequency (IDF) value.

    Both counts are incremented by 1 before the ratio is taken (add-one
    smoothing), and 1 is added to the logarithm. The result is therefore at
    least 1 whenever ``word_in_doc_count <= total_doc_count``.

    Args:
        word_in_doc_count: number of documents that contain the word.
        total_doc_count: total number of documents.

    Returns:
        ``log((total_doc_count + 1) / (word_in_doc_count + 1)) + 1`` using the
        natural logarithm.
    """
    smoothed_total = total_doc_count + 1
    smoothed_hits = word_in_doc_count + 1
    return math.log(smoothed_total / smoothed_hits) + 1


# Show the smoothed IDF for a word found in 1 of 2 documents.
print(word_idf(1, 2))

1.4054651081081644
