**11주차 과제 - 워드 클라우드**

In [25]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import movie_reviews

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

import warnings
# Globally ignore every warning category.
# NOTE(review): this also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK movie_reviews corpus.
nltk.download("movie_reviews")

# Collect the raw text of every positive and every negative review.
positive_reviews = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('pos')]
negative_reviews = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids('neg')]

# Stack both groups into one DataFrame with a single 'review' column.
# ignore_index=True gives every row a unique 0..N-1 label; without it the
# negative reviews reuse the labels of the positive ones, so label-based
# lookups such as df.loc[0] would return two rows.
df = pd.concat(
    [
        pd.DataFrame({"review": positive_reviews}),
        pd.DataFrame({"review": negative_reviews}),
    ],
    ignore_index=True,
)

In [None]:
# Show the combined review DataFrame as the cell output.
df


In [28]:
def get_top_ngrams(corpus, ngram_range, stop_words=None, n=None):
    """Return the most frequent n-grams in *corpus* as a DataFrame.

    Args:
        corpus: Iterable of text documents handed to ``CountVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        stop_words: Stop-word setting forwarded to ``CountVectorizer``.
        n: Number of top entries to keep; ``None`` keeps every entry.

    Returns:
        A DataFrame with columns ``'Word'`` and ``'Freq'``, ordered by
        descending frequency.
    """
    vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass, so the corpus is tokenized once and a one-shot iterable
    # (e.g. a generator) is not exhausted before the counts are computed.
    bag_of_words = vectorizer.fit_transform(corpus)

    # Column sums: total occurrences of each vocabulary entry over all documents.
    totals = bag_of_words.sum(axis=0)

    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Slicing with n=None keeps the full ranking.
    return pd.DataFrame(ranked[:n], columns=['Word', 'Freq'])

In [29]:
# Top 20 unigrams over all reviews, using the 'english' stop-word setting
stop_words = 'english'
n = 20
unigrams_st = get_top_ngrams(
    df['review'],
    ngram_range=(1, 1),
    stop_words=stop_words,
    n=n,
)

In [30]:
from wordcloud import WordCloud

# The word cloud is generated from a mapping, so pair each word with its frequency in a dict
unigrams_st_dict = dict(zip(unigrams_st['Word'], unigrams_st['Freq']))

In [None]:
# Show the word -> frequency mapping as the cell output.
unigrams_st_dict

In [None]:
# background_color: canvas color, colormap: palette used to color the words
cloud_options = {
    "background_color": "white",
    "colormap": "twilight_shifted",
}
wordcloud_uni = WordCloud(**cloud_options).generate_from_frequencies(unigrams_st_dict)

# Render the cloud as an image with a title and no axis decorations.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_uni, interpolation='bilinear')
plt.title("Word Cloud Example")
plt.axis("off")
plt.show()