Connected to base (Python 3.12.7)

In [None]:
import os
from PIL import Image
import jieba

import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude

from wordcloud import WordCloud, ImageColorGenerator

def jieba_processing_txt(text, stopwords_path=None, userdict_list=None):
    """Segment *text* with jieba and return the kept tokens joined by spaces.

    Args:
        text: Text to segment.
        stopwords_path: Optional path to a UTF-8 file holding one stopword
            per line. Tokens equal to any of those lines are dropped.
        userdict_list: Optional collection of words passed to
            ``jieba.add_word`` before segmentation.

    Returns:
        The surviving tokens as one string, separated by single spaces.
    """
    for custom_word in userdict_list or ():
        jieba.add_word(custom_word)

    # Strip every segment, then keep only tokens longer than one character.
    stripped = (segment.strip() for segment in jieba.cut(text, cut_all=False))
    tokens = [token for token in stripped if len(token) > 1]

    if stopwords_path:
        with open(stopwords_path, encoding='utf-8') as stop_file:
            blocked = set(stop_file.read().splitlines())
        tokens = [token for token in tokens if token not in blocked]

    return ' '.join(tokens)

# Resolve the data directory: the folder containing this script when __file__
# is defined, otherwise the current working directory (needed when the example
# runs inside a generated IPython notebook / interactive session).
d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Input resources, all located relative to the data directory.
font_path = os.path.join(d, "fonts/SourceHanSerif/SourceHanSerifK-Light.otf")
text_path = os.path.join(d, "wc_cn/CalltoArms.txt")
stopwords_path = os.path.join(d, "wc_cn/stopwords_cn_en.txt")
# Extra words registered with jieba before segmentation.
userdict_list = ['阿Ｑ', '孔乙己', '言子书院']

# Read the corpus inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(text_path, encoding="utf-8") as text_file:
    text = text_file.read()
processed_text = jieba_processing_txt(text, stopwords_path=stopwords_path, userdict_list=userdict_list)

# Load the reference image as a NumPy array. The image has been modified in
# gimp to be brighter and have more saturation.
image_path = os.path.join(d, "alice_color.png")
parrot_color = np.array(Image.open(image_path))
# Optional speed-up, currently disabled: subsample by a factor of 3. Very lossy,
# but for a wordcloud we don't really care.
# parrot_color = parrot_color[::3, ::3]

# Build the mask from a copy of the image; white (255) is "masked out".
# Pixels whose channels sum to zero are turned white.
parrot_mask = parrot_color.copy()
zero_sum_pixels = parrot_mask.sum(axis=2) == 0
parrot_mask[zero_sum_pixels] = 255

# Some finesse: enforce boundaries between colors so they get less washed out.
# Edge strength is the mean Gaussian gradient magnitude (sigma=2) over the
# first three channels, each scaled by 1/255; strong edges are masked out too.
per_channel_edges = [
    gaussian_gradient_magnitude(parrot_color[:, :, channel] / 255., 2)
    for channel in range(3)
]
edges = np.mean(per_channel_edges, axis=0)
parrot_mask[edges > .08] = 255

# Configure the word cloud. Rendering is a bit sluggish; subsampling the mask
# image more strongly gives quicker rendering.
# relative_scaling is 0.4 here. Per the wordcloud documentation, 0 sizes words
# by rank only (frequencies are reflected less accurately, but the picture
# tends to look better), while 1 makes font size follow frequency directly.
wc = WordCloud(
    # canvas and appearance
    background_color="white",
    font_path=font_path,
    mask=parrot_mask,
    scale=2,
    # word selection and font sizing
    max_words=5000,
    min_font_size=2,
    max_font_size=50,
    font_step=1,
    relative_scaling=0.4,
    # layout
    prefer_horizontal=1.0,
    random_state=42,
)

# Lay out the words, then draw the cloud (before recoloring) on pyplot's
# current figure.
wc.generate(processed_text)
plt.imshow(wc)

# Recolor the words from the reference image, show the result on a new
# figure, and save it as "parrot_new.png" (path relative to the working
# directory).
image_colors = ImageColorGenerator(parrot_color)
wc.recolor(color_func=image_colors)
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation="bilinear")
wc.to_file("parrot_new.png")

# Show the reference image and the edge map, each on its own titled figure.
for figure_title, figure_data in (("Original Image", parrot_color), ("Edge map", edges)):
    plt.figure(figsize=(10, 10))
    plt.title(figure_title)
    plt.imshow(figure_data)
plt.show()