## 資料檢索與文字探勘 R12725049-pa1

In [1]:
from platform import python_version

# Print the version of the Python interpreter running this notebook.
print(python_version())

3.11.4


### Read Text Data

In [2]:
import requests

# URL of the text document to process.
url = "https://ceiba.ntu.edu.tw/course/35d27d/content/28.txt"

try:
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server and the cell never finishes.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx HTTP status codes
except requests.exceptions.RequestException as e:
    # Best-effort: report the failure instead of raising. Note that
    # raw_data is not defined in this case.
    print(f"發生錯誤: {e}")
else:
    # Decoded body of the response
    raw_data = response.text

    print(raw_data)

And Yugoslav authorities are planning the arrest of eleven coal miners 
and two opposition politicians on suspicion of sabotage, that's in 
connection with strike action against President Slobodan Milosevic. 
You are listening to BBC news for The World.


### Lowercasing everything

In [3]:
# Convert the fetched text to lowercase
lower_data = raw_data.lower()
print(lower_data)

and yugoslav authorities are planning the arrest of eleven coal miners 
and two opposition politicians on suspicion of sabotage, that's in 
connection with strike action against president slobodan milosevic. 
you are listening to bbc news for the world.


### Tokenization

In [4]:
def tokenize_text(text, delimiters=('.', ',', '!', '?', ';', ':')):
    """Split *text* into tokens on whitespace and delimiter characters.

    Args:
        text: The string to tokenize.
        delimiters: Iterable of single characters that separate tokens in
            addition to whitespace. Defaults to the punctuation marks
            ``. , ! ? ; :``. Any character not listed (for example an
            apostrophe) stays inside its token.

    Returns:
        A list of the non-empty tokens in their original order. Whitespace
        and delimiter characters are dropped.
    """
    # Map every delimiter to a space in one pass, then let str.split()
    # break on runs of whitespace and discard the empty pieces. This avoids
    # building each token one character at a time.
    to_space = str.maketrans(dict.fromkeys(delimiters, " "))
    return text.translate(to_space).split()

# Tokenize the lowercased document and print the resulting token list.
text = lower_data
tokens = tokenize_text(text)
print(tokens)

['and', 'yugoslav', 'authorities', 'are', 'planning', 'the', 'arrest', 'of', 'eleven', 'coal', 'miners', 'and', 'two', 'opposition', 'politicians', 'on', 'suspicion', 'of', 'sabotage', "that's", 'in', 'connection', 'with', 'strike', 'action', 'against', 'president', 'slobodan', 'milosevic', 'you', 'are', 'listening', 'to', 'bbc', 'news', 'for', 'the', 'world']


### Stopword removal

In [5]:
# Read the stopword file. The context manager closes the file handle
# once reading is done, even if read() raises.
with open("NLTK's list of english stopwords.txt", "r") as stopwords_file:
    stopwords = stopwords_file.read()
print(stopwords)

i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now



In [6]:
# Split the stopword file contents into a list with one entry per line.
stopwords_list = stopwords.splitlines()
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']

In [7]:
# Filter out the stopwords. Membership is tested against a set built once,
# so each lookup is O(1) instead of a scan of the whole stopword list.
stopword_set = set(stopwords_list)
filtered_tokens = [token for token in tokens if token not in stopword_set]

# Print the filtered token list
print(filtered_tokens)

['yugoslav', 'authorities', 'planning', 'arrest', 'eleven', 'coal', 'miners', 'two', 'opposition', 'politicians', 'suspicion', 'sabotage', "that's", 'connection', 'strike', 'action', 'president', 'slobodan', 'milosevic', 'listening', 'bbc', 'news', 'world']


### Stemming using Porter’s algorithm

In [8]:
# Import the Porter stemmer implementation from NLTK
from nltk.stem.porter import PorterStemmer
import nltk

# Create the stemmer instance
ps=PorterStemmer()

In [9]:
# Stem every remaining token, printing each "token : stem" pair as it is
# produced and collecting the stems in order.
result = []
for t in filtered_tokens:
    # Stem once and reuse the value for both the printout and the result,
    # rather than running the stemmer twice per token.
    stemmed = ps.stem(t)
    print(t, " : ", stemmed)
    result.append(stemmed)

yugoslav  :  yugoslav
authorities  :  author
planning  :  plan
arrest  :  arrest
eleven  :  eleven
coal  :  coal
miners  :  miner
two  :  two
opposition  :  opposit
politicians  :  politician
suspicion  :  suspicion
sabotage  :  sabotag
that's  :  that'
connection  :  connect
strike  :  strike
action  :  action
president  :  presid
slobodan  :  slobodan
milosevic  :  milosev
listening  :  listen
bbc  :  bbc
news  :  news
world  :  world


In [10]:
# Print the list of stemmed tokens
print(result)

['yugoslav', 'author', 'plan', 'arrest', 'eleven', 'coal', 'miner', 'two', 'opposit', 'politician', 'suspicion', 'sabotag', "that'", 'connect', 'strike', 'action', 'presid', 'slobodan', 'milosev', 'listen', 'bbc', 'news', 'world']


In [11]:
# Name of the output file
file_name = "R12725049-result.txt"

# One stemmed token per line
result_text = '\n'.join(result)

# Write with an explicit encoding so the output file does not depend on the
# platform's default locale encoding.
with open(file_name, "w", encoding="utf-8") as file:
    file.write(result_text)

print(f"處理後的字串已保存到 {file_name}")

處理後的字串已保存到 R12725049-result.txt
