# 1. 실습
> **1) 뉴스 기사 제목 데이터**

In [1]:
import pandas as pd
import urllib.request
data = pd.read_csv('./abcnews-date-text.csv', error_bad_lines=False)

In [2]:
print(len(data))

1226258


In [3]:
print(data.head(5))

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [4]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


> **2) 텍스트 전처리**   

In [5]:
# 토큰화
import nltk
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


In [6]:
print(text.head(5))

                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


In [7]:
# 불용어 제거
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])


In [8]:
print(text.head(5))

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [9]:
# 표제어 추출
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


In [10]:
print(text.head(5))

                                       headline_text
0       [aba, decide, community, broadcast, licence]
1      [act, fire, witness, must, aware, defamation]
2      [g, call, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [11]:
# 짧은 길이 단어 제거
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])

In [12]:
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


> **3) TF-IDF 생성**

In [13]:
# 역토큰화
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
    
text['headline_text'] = detokenized_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc


In [14]:
text['headline_text'][:5]

0       decide community broadcast licence
1       fire witness must aware defamation
2    call infrastructure protection summit
3                   staff aust strike rise
4      strike affect australian travellers
Name: headline_text, dtype: object

In [15]:
# TF-IDF 구하기
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text['headline_text'])
X.shape

(1226258, 89614)

> **4) Topic Modeling**

In [16]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=777, max_iter=1)
lda_top = lda_model.fit_transform(X)

In [17]:
print(lda_model.components_)
print(lda_model.components_.shape)

[[0.1        0.10000473 0.66784552 ... 0.1        0.10000159 0.10000116]
 [0.1        0.85468032 0.19223332 ... 0.1        0.10000148 0.10823208]
 [0.1        0.10000452 0.10003003 ... 0.1        0.11247695 0.1275751 ]
 ...
 [0.1        0.10000465 0.10003123 ... 0.1        0.11953043 0.10000114]
 [0.10002624 0.10000407 0.23830039 ... 0.1        0.10000137 0.10000099]
 [0.1        0.10000479 0.10003114 ... 0.1        0.10000159 0.10000113]]
(10, 89614)


In [18]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_, terms)

Topic 1: [('donald', 6243.16), ('open', 3636.58), ('attack', 3605.43), ('leave', 3098.97), ('monday', 2315.22)]
Topic 2: [('government', 6324.02), ('test', 4755.13), ('school', 4254.24), ('people', 3689.83), ('pandemic', 3209.08)]
Topic 3: [('coronavirus', 9662.23), ('melbourne', 5937.04), ('charge', 5254.82), ('south', 4942.83), ('police', 4826.19)]
Topic 4: [('house', 4134.07), ('woman', 3860.19), ('year', 3854.38), ('court', 3704.02), ('years', 3579.31)]
Topic 5: [('world', 4077.18), ('canberra', 4038.05), ('morrison', 3526.69), ('north', 3331.43), ('2020', 3132.1)]
Topic 6: [('news', 6263.89), ('drum', 3874.53), ('help', 3553.28), ('change', 3247.24), ('health', 3172.34)]
Topic 7: [('trump', 10386.73), ('victoria', 8066.23), ('record', 4420.69), ('kill', 4209.45), ('scott', 3208.31)]
Topic 8: [('coronavirus', 19533.15), ('covid', 9025.51), ('queensland', 8995.88), ('election', 6552.43), ('home', 4747.1)]
Topic 9: [('australia', 9756.85), ('australian', 7936.14), ('case', 7397.71), 