### 라이브러리

In [1]:
import sys
import nltk
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import gensim
import pyLDAvis.gensim_models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

  from imp import reload


### 트윗 텍스트 데이터 추가 전처리 함수

In [2]:
def extra_preprocess_text(df):
    """Clean the ``tweet`` column and return it in tokenized and joined form.

    Each tweet has every non-ASCII-letter character replaced by a space, is
    lower-cased, and then loses every word shorter than three characters and
    every NLTK English stop word.

    Args:
        df: DataFrame with a ``tweet`` column. The column is overwritten in
            place with the cleaned text.

    Returns:
        A tuple ``(df, tokenized_tweet, detokenized_tweet)``:
        ``df`` is the same (mutated) DataFrame, ``tokenized_tweet`` is a
        Series of token lists sharing ``df``'s index, and
        ``detokenized_tweet`` is a list of the space-joined tokens.
    """
    # A set gives O(1) membership tests; the stop-word list is scanned per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    cleaned = (
        df['tweet']
        .fillna('')            # missing tweets become empty documents instead of crashing
        .astype(str)
        # regex=True must be explicit: pandas >= 2.0 treats the pattern literally otherwise.
        .str.replace(r'[^a-zA-Z]', ' ', regex=True)
        .str.lower()
    )

    # Tokenize, dropping short words and stop words in a single pass.
    tokenized_tweet = cleaned.apply(
        lambda text: [w for w in text.split() if len(w) >= 3 and w not in stop_words]
    )

    # Iterate the Series itself (not range(len(df))) so a non-default index is safe.
    detokenized_tweet = [' '.join(tokens) for tokens in tokenized_tweet]
    df['tweet'] = detokenized_tweet

    return df, tokenized_tweet, detokenized_tweet

### 토픽 모델링 클래스

In [3]:
# Topic modelling with LSA (TF-IDF followed by truncated SVD).
class LSATopicMaker():
    """Fit an LSA topic model on the ``tweet`` column of a DataFrame.

    Attributes:
        df: The preprocessed DataFrame returned by ``extra_preprocess_text``.
        tokenized_tweet: Token lists returned by ``extra_preprocess_text``.
        detokenized_tweet: Joined tokens returned by ``extra_preprocess_text``.
        svd: The fitted ``TruncatedSVD`` (10 components).
        vectorizer: The fitted ``TfidfVectorizer``.
    """

    def __init__(self, df):
        """Preprocess ``df`` and fit the TF-IDF + SVD pipeline."""
        self.df, self.tokenized_tweet, self.detokenized_tweet = extra_preprocess_text(df)
        self.svd, self.vectorizer = self.utilize()

    def utilize(self):
        """Fit and return ``(svd, vectorizer)`` on ``self.df['tweet']``."""
        # Build the TF-IDF matrix.
        vectorizer = TfidfVectorizer(stop_words='english',
                                     max_features=1000,
                                     max_df=0.5,
                                     smooth_idf=True)
        X = vectorizer.fit_transform(self.df['tweet'])

        # Decompose it into 10 latent components.
        svd = TruncatedSVD(n_components=10,
                           algorithm='randomized',
                           n_iter=100,
                           random_state=0)
        svd.fit(X)
        return svd, vectorizer

    def _feature_names(self):
        """Return the vocabulary, whichever scikit-learn API is available."""
        # get_feature_names() was removed in scikit-learn 1.2; older releases
        # (< 1.0) only have the legacy name, so fall back to it.
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return getter()

    def make_topics(self, filename, printoption=False):
        """Write the top 10 terms of every topic to a text file.

        Args:
            filename: Suffix of the output file; the result is written to
                ``./topic_modeling/LSA_queen_{filename}.txt`` (the directory
                is created if it does not exist).
            printoption: When true, also print the topics to stdout.
        """
        import os

        n = 10
        feature_names = self._feature_names()

        lines = []
        for idx, topic in enumerate(self.svd.components_):
            # Cast to built-in str/float so the list repr stays
            # "('word', 0.1234)" regardless of the installed NumPy version.
            top_terms = [(str(feature_names[i]), float(topic[i].round(5)))
                         for i in topic.argsort()[:-n - 1:-1]]
            lines.append(f'Topic {idx+1}: {top_terms}')

        # Preview on stdout.
        if printoption:
            for line in lines:
                print(line)
                print('\n')

        # Save to a text file. Writing to the handle directly (instead of
        # rebinding sys.stdout) guarantees stdout is never left pointing at a
        # closed file if something raises.
        out_dir = './topic_modeling'
        os.makedirs(out_dir, exist_ok=True)
        with open(f'{out_dir}/LSA_queen_{filename}.txt', 'w', encoding='utf-8') as fh:
            for line in lines:
                print(line, file=fh)
                print('\n', file=fh)

In [4]:
# Topic modelling with LDA (gensim).
class LDATopicMaker():
    """Fit a gensim LDA topic model on the ``tweet`` column of a DataFrame.

    Attributes:
        df: The preprocessed DataFrame returned by ``extra_preprocess_text``.
        tokenized_tweet: Token lists returned by ``extra_preprocess_text``.
        detokenized_tweet: Joined tokens returned by ``extra_preprocess_text``.
        lda: The trained ``LdaModel`` (10 topics).
        corpus: Bag-of-words representation of every tweet.
        dic: The gensim ``Dictionary`` built from the tokens.
    """

    def __init__(self, df):
        """Preprocess ``df`` and train the LDA model."""
        self.df, self.tokenized_tweet, self.detokenized_tweet = extra_preprocess_text(df)
        self.lda, self.corpus, self.dic = self.utilize()

    def utilize(self):
        """Build the dictionary and corpus, train LDA, return ``(lda, corpus, dic)``."""
        # Build the vocabulary and the bag-of-words corpus.
        dic = gensim.corpora.Dictionary(self.tokenized_tweet)
        corpus = [dic.doc2bow(text) for text in self.tokenized_tweet]

        # Train LDA. The seed is fixed (as in LSATopicMaker) so that the saved
        # topic files are reproducible between runs.
        lda = gensim.models.ldamodel.LdaModel(corpus,
                                              num_topics=10,
                                              id2word=dic,
                                              passes=15,
                                              random_state=0)
        return lda, corpus, dic

    def make_topics(self, filename, printoption=False):
        """Write the top 10 words of every topic to a text file.

        Args:
            filename: Suffix of the output file; the result is written to
                ``./topic_modeling/LDA_queen_{filename}.txt`` (the directory
                is created if it does not exist).
            printoption: When true, also print the topics to stdout.
        """
        import os

        topics = self.lda.print_topics(num_words=10)
        # Each entry is indexed as (topic_id, formatted_string); only the
        # string is reported, under a 1-based label.
        lines = [f'Topic {idx+1}: {topic[1]}' for idx, topic in enumerate(topics)]

        # Preview on stdout.
        if printoption:
            for line in lines:
                print(line)
                print('\n')

        # Save to a text file. Writing to the handle directly (instead of
        # rebinding sys.stdout) guarantees stdout is never left pointing at a
        # closed file if something raises.
        out_dir = './topic_modeling'
        os.makedirs(out_dir, exist_ok=True)
        with open(f'{out_dir}/LDA_queen_{filename}.txt', 'w', encoding='utf-8') as fh:
            for line in lines:
                print(line, file=fh)
                print('\n', file=fh)

    def visualize(self):
        """Return the pyLDAvis display object for the trained model.

        The object is returned (not discarded) so that a notebook renders it
        when this call is the last expression of a cell.
        """
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim_models.prepare(self.lda, self.corpus, self.dic)
        return pyLDAvis.display(vis)

### 전체 데이터로 토픽 모델링

In [5]:
# Load the tweet CSV used for the whole-dataset topic models below.
# lineterminator='\n' makes '\n' the only row separator; low_memory=False
# asks pandas not to parse the file in internal chunks.
df = pd.read_csv('./queen_notshort.csv', lineterminator='\n', low_memory=False)

In [6]:
# Build the LSA topic model, print the topics as a preview, and save them.
# NOTE: the constructor runs extra_preprocess_text, which overwrites
# df['tweet'] in place.
lsa_df = LSATopicMaker(df)
lsa_df.make_topics(filename='all', printoption=True)

Topic 1: [('majesty', 0.3736), ('family', 0.29566), ('royal', 0.27677), ('rest', 0.27481), ('peace', 0.26322), ('condolences', 0.23908), ('passing', 0.238), ('saddened', 0.20551), ('death', 0.16546), ('deeply', 0.16001)]


Topic 2: [('monarch', 0.40364), ('longest', 0.37705), ('died', 0.29978), ('reigning', 0.2957), ('britain', 0.25135), ('dies', 0.19693), ('aged', 0.16384), ('serving', 0.15996), ('years', 0.1522), ('age', 0.13069)]


Topic 3: [('rest', 0.61089), ('peace', 0.60311), ('soul', 0.06902), ('thank', 0.056), ('god', 0.05014), ('majesty', 0.03321), ('bridge', 0.03032), ('eternal', 0.0263), ('london', 0.02574), ('forever', 0.02401)]


Topic 4: [('death', 0.58678), ('king', 0.38648), ('charles', 0.3555), ('iii', 0.21007), ('following', 0.14055), ('prince', 0.10081), ('god', 0.09741), ('live', 0.08593), ('new', 0.07931), ('statement', 0.07728)]


Topic 5: [('died', 0.59863), ('palace', 0.30989), ('buckingham', 0.30564), ('announces', 0.21326), ('aged', 0.19682), ('rip', 0.19518)

In [7]:
# Build the LDA topic model, print the topics as a preview, and save them.
# NOTE: the constructor runs extra_preprocess_text on the same df again.
lda_df = LDATopicMaker(df)
lda_df.make_topics(filename='all', printoption=True)

Topic 1: 0.065*"world" + 0.063*"united" + 0.056*"kingdom" + 0.034*"people" + 0.028*"around" + 0.024*"commonwealth" + 0.015*"canada" + 0.015*"across" + 0.013*"passing" + 0.010*"head"


Topic 2: 0.133*"rest" + 0.121*"peace" + 0.097*"queen" + 0.081*"elizabeth" + 0.065*"may" + 0.030*"majesty" + 0.013*"soul" + 0.009*"state" + 0.009*"half" + 0.009*"god"


Topic 3: 0.109*"queen" + 0.099*"elizabeth" + 0.044*"away" + 0.041*"passed" + 0.028*"dies" + 0.028*"death" + 0.022*"news" + 0.018*"balmoral" + 0.015*"statement" + 0.014*"age"


Topic 4: 0.090*"queen" + 0.071*"elizabeth" + 0.014*"one" + 0.013*"rip" + 0.012*"london" + 0.012*"dead" + 0.011*"like" + 0.010*"bridge" + 0.010*"breaking" + 0.009*"prime"


Topic 5: 0.021*"die" + 0.014*"est" + 0.010*"der" + 0.009*"del" + 0.009*"trump" + 0.009*"und" + 0.008*"man" + 0.008*"por" + 0.008*"von" + 0.006*"john"


Topic 6: 0.081*"queen" + 0.071*"king" + 0.067*"elizabeth" + 0.052*"charles" + 0.031*"death" + 0.020*"prince" + 0.019*"iii" + 0.019*"new" + 0.018*"li

In [8]:
# Visualise the LDA topic model with pyLDAvis.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_df.lda, lda_df.corpus, lda_df.dic)
# Final expression of the cell: the notebook renders its value.
pyLDAvis.display(vis)

### 트윗 시간순 토픽 모델링

In [9]:
hour_path = './condition/hour/queen_'

# Load the four hour_* CSV files found under hour_path, in order.
df_hour_0, df_hour_1to2, df_hour_3to5, df_hour_6to9 = (
    pd.read_csv(f'{hour_path}{bucket}.csv', lineterminator='\n', encoding='utf-8-sig')
    for bucket in ('hour_0', 'hour_1to2', 'hour_3to5', 'hour_6to9')
)

In [10]:
# Train one LDA topic model per hour_* DataFrame, in order.
lda_df_hour_0, lda_df_hour_1to2, lda_df_hour_3to5, lda_df_hour_6to9 = map(
    LDATopicMaker,
    (df_hour_0, df_hour_1to2, df_hour_3to5, df_hour_6to9),
)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [11]:
# Save the topic modelling results (one text file per hour_* model, no console preview).
lda_df_hour_0.make_topics(filename='hour_0', printoption=False)
lda_df_hour_1to2.make_topics(filename='hour_1to2', printoption=False)
lda_df_hour_3to5.make_topics(filename='hour_3to5', printoption=False)
lda_df_hour_6to9.make_topics(filename='hour_6to9', printoption=False)

  from imp import reload


### 트윗 좋아요순 토픽 모델링

In [12]:
like_path = './condition/likes_count/queen_'

# Load the five like_* CSV files found under like_path, in order.
df_like_0, df_like_1to10, df_like_11to50, df_like_51to100, df_like_over100 = (
    pd.read_csv(f'{like_path}{bucket}.csv', lineterminator='\n', encoding='utf-8-sig')
    for bucket in ('like_0', 'like_1to10', 'like_11to50', 'like_51to100', 'like_over100')
)

In [13]:
# Train one LDA topic model per like_* DataFrame, in order.
(lda_df_like_0,
 lda_df_like_1to10,
 lda_df_like_11to50,
 lda_df_like_51to100,
 lda_df_like_over100) = map(
    LDATopicMaker,
    (df_like_0, df_like_1to10, df_like_11to50, df_like_51to100, df_like_over100),
)

In [14]:
# Save the topic modelling results (one text file per like_* model, no console preview).
lda_df_like_0.make_topics(filename='like_0', printoption=False)
lda_df_like_1to10.make_topics(filename='like_1to10', printoption=False)
lda_df_like_11to50.make_topics(filename='like_11to50', printoption=False)
lda_df_like_51to100.make_topics(filename='like_51to100', printoption=False)
lda_df_like_over100.make_topics(filename='like_over100', printoption=False)

### 트윗 리트윗순 토픽 모델링

In [15]:
rt_path = './condition/retweets_count/queen_'

# Load the five rt_* CSV files found under rt_path, in order.
df_rt_0, df_rt_1to10, df_rt_11to50, df_rt_51to100, df_rt_over100 = (
    pd.read_csv(f'{rt_path}{bucket}.csv', lineterminator='\n', encoding='utf-8-sig')
    for bucket in ('rt_0', 'rt_1to10', 'rt_11to50', 'rt_51to100', 'rt_over100')
)

In [16]:
# Train one LDA topic model per rt_* DataFrame, in order.
(lda_df_rt_0,
 lda_df_rt_1to10,
 lda_df_rt_11to50,
 lda_df_rt_51to100,
 lda_df_rt_over100) = map(
    LDATopicMaker,
    (df_rt_0, df_rt_1to10, df_rt_11to50, df_rt_51to100, df_rt_over100),
)

In [17]:
# Save the topic modelling results (one text file per rt_* model, no console preview).
lda_df_rt_0.make_topics(filename='rt_0', printoption=False)
lda_df_rt_1to10.make_topics(filename='rt_1to10', printoption=False)
lda_df_rt_11to50.make_topics(filename='rt_11to50', printoption=False)
lda_df_rt_51to100.make_topics(filename='rt_51to100', printoption=False)
lda_df_rt_over100.make_topics(filename='rt_over100', printoption=False)

  from imp import reload
