# 1. Mount Drive & Import Library

In [1]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import nltk
import re
import networkx as nx

Mounted at /content/drive


# 2. Download Resource NLTK

In [2]:
# Fetch the NLTK resources used below: the Punkt tokenizer data
# (`punkt`, `punkt_tab`) and the stop-word corpus (`stopwords`).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 3. Load Dataset

In [3]:
# Location of the articles CSV on the mounted Drive; adjust this path when the
# notebook is run from a different Drive layout.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/SistemTemuKembaliInformasi/Minggu11/data/tennis_articles_v4.csv"

# Read the CSV into a DataFrame; later cells use its `article_text` column.
df = pd.read_csv(DATA_PATH)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


# A. Sentence Ranking Summarization (Graph-Based)

# 4. Gabungkan & Tokenisasi Kalimat

In [4]:
from nltk.tokenize import sent_tokenize

# Merge all articles into one document before splitting it into sentences.
# pandas reads empty CSV cells as NaN (a float), and str.join raises TypeError
# on non-string items, so missing rows are dropped and the rest coerced to str.
article_texts = df['article_text'].dropna().astype(str)
text = " ".join(article_texts)

# Original (uncleaned) sentences, in document order.
sentences = sent_tokenize(text)

# 5. Cleaning Kalimat

In [5]:
# Letters-only, lower-cased copy of every sentence: each character outside
# a-z / A-Z is replaced by a single space. The list is index-aligned with
# `sentences`.
clean_sentences = [
    re.sub("[^a-zA-Z]", " ", original).lower() for original in sentences
]

# Reverse lookup from cleaned text to the original sentence. When several
# sentences share the same cleaned text, the last one wins.
sentence_map = dict(zip(clean_sentences, sentences))

# 6. Fungsi Cosine Similarity Antar Kalimat

In [8]:
from collections import Counter

from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# Hashed copy of the stop-word list: the similarity function is called for
# every sentence pair, so membership tests must not scan a list each time.
stop_word_set = frozenset(stop_words)


def sentence_similarity(sent1, sent2):
    """Return the cosine similarity between two cleaned sentences.

    Each sentence is split on whitespace, stop words are discarded, and the
    remaining words are counted. The two count vectors are compared over the
    union of their vocabularies.

    Args:
        sent1: First sentence as a whitespace-separated string.
        sent2: Second sentence as a whitespace-separated string.

    Returns:
        ``1 - cosine_distance(v1, v2)`` for the two count vectors, or ``0``
        when either sentence has no non-stop-word tokens (the cosine is
        undefined for a zero vector).
    """
    counts1 = Counter(w for w in sent1.split() if w not in stop_word_set)
    counts2 = Counter(w for w in sent2.split() if w not in stop_word_set)

    # An empty counter corresponds to an all-zero vector.
    if not counts1 or not counts2:
        return 0

    # Stop words would contribute 0 to both vectors, so leaving them out of
    # the vocabulary does not change the dot products.
    vocabulary = list(counts1.keys() | counts2.keys())
    v1 = [counts1[w] for w in vocabulary]
    v2 = [counts2[w] for w in vocabulary]

    return 1 - cosine_distance(v1, v2)

# 7. Similarity Matrix

In [9]:
n_sentences = len(clean_sentences)

# Pairwise sentence similarities; the diagonal stays 0 so a sentence never
# votes for itself.
similarity_matrix = np.zeros((n_sentences, n_sentences))

# Cosine similarity is symmetric, so each unordered pair is scored once and
# the value is mirrored across the diagonal instead of being recomputed.
for i in range(n_sentences):
    for j in range(i + 1, n_sentences):
        pair_score = sentence_similarity(clean_sentences[i], clean_sentences[j])
        similarity_matrix[i, j] = pair_score
        similarity_matrix[j, i] = pair_score

# 8. PageRank & Ringkasan

In [10]:
# Build a weighted graph whose nodes are sentence indices and whose edge
# weights are the pairwise similarities, then score the nodes with PageRank.
graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(graph)

# Each entry is (score, cleaned sentence, sentence index), best first. The
# index is carried along so the original sentence is looked up by position:
# several sentences can share the same cleaned text (for example when they
# differ only in digits or punctuation), and a lookup keyed on cleaned text
# could then return a different sentence than the one that was ranked.
ranked = sorted(
    ((scores[i], s, i) for i, s in enumerate(clean_sentences)),
    reverse=True,
)

print("Sentence Ranking Summary:\n")
# `clean_sentences` was built one-for-one from `sentences`, so the index
# identifies the original sentence.
print(sentences[ranked[0][2]])

Sentence Ranking Summary:

Federer won the Swiss Indoors last week by beating Romanian qualifier Marius Copil in the final.


# B. TF-IDF Based Summarization

# 9. Tokenisasi Kalimat

In [11]:
# Split the combined `text` into sentences for the frequency-based summary.
# This repeats the tokenization call made in step 4 on the same variable.
sentences = sent_tokenize(text)

# 10. Cleaning & Mapping

In [12]:
# Maps cleaned text back to the original sentence (last one wins on duplicates).
sentence_map = {}
cleaned_parts = []

for s in sentences:
    # Replace every non-letter character with a space and lower-case the rest.
    clean = re.sub("[^a-zA-Z]", " ", s).lower()
    sentence_map[clean] = s
    cleaned_parts.append(clean)

# Join with an explicit space so the last word of one sentence can never fuse
# with the first word of the next, even if a cleaned sentence ends in a letter.
# Extra whitespace does not affect the word tokens extracted later.
clean_text = " ".join(cleaned_parts)

# 11. Hitung Word Frequency (TF)

In [13]:
from collections import Counter

from nltk.corpus import stopwords

stop_words = stopwords.words("english")
# Hashed copy so each token is checked in constant time instead of scanning
# the whole stop-word list.
stop_word_set = frozenset(stop_words)

# Raw occurrence count of every non-stop-word token in the cleaned text.
raw_counts = Counter(
    word for word in nltk.word_tokenize(clean_text) if word not in stop_word_set
)

# Normalise by the most frequent word so every weight lies in (0, 1].
# `default=1` keeps an empty vocabulary from raising ValueError; `word_freq`
# is simply empty in that case.
max_freq = max(raw_counts.values(), default=1)
word_freq = {word: count / max_freq for word, count in raw_counts.items()}

# 12. Skor Kalimat

In [14]:
# Sentences with this many whitespace-separated words or more are not scored,
# which keeps very long sentences out of the summary.
MAX_SENTENCE_WORDS = 30

# Maps an original sentence to the sum of the weights of its known words.
sentence_scores = {}

for sent in sentences:
    # The length limit depends only on the sentence, so it is checked once
    # here instead of once per token.
    if len(sent.split()) >= MAX_SENTENCE_WORDS:
        continue

    # A sentence that occurs verbatim more than once is scored only once;
    # otherwise its score would be accumulated for every occurrence.
    if sent in sentence_scores:
        continue

    # NOTE(review): `word_freq` was built from letters-only text, while the
    # tokens here come from the raw lower-cased sentence, so tokens containing
    # digits, hyphens or apostrophes will not match any key. Confirm whether
    # that is intended.
    score = 0
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_freq:
            score += word_freq[word]

    # Weights are strictly positive, so a positive score means at least one
    # token matched; sentences without any match get no entry.
    if score > 0:
        sentence_scores[sent] = score

# 13. Hasil Ringkasan TF-IDF

In [15]:
import heapq

# Select the five (sentence, score) pairs with the highest scores, best first,
# then keep only the sentence text.
best_pairs = heapq.nlargest(5, sentence_scores.items(), key=lambda pair: pair[1])
summary = [sentence for sentence, _score in best_pairs]

print("TF-IDF Summary:\n")
print(" ".join(summary))

TF-IDF Summary:

Federer has been handed a difficult draw where could could come across Kevin Anderson, Novak Djokovic and Rafael Nadal in the latter rounds. He used his first break point to close out the first set before going up 3-0 in the second and wrapping up the win on his first match point. Federer's projected route to the Paris final could also lead to matches against Kevin Anderson and Novak Djokovic. Federer won the Swiss Indoors last week by beating Romanian qualifier Marius Copil in the final. BASEL, Switzerland (AP), Roger Federer advanced to the 14th Swiss Indoors final of his career by beating seventh-seeded Daniil Medvedev 6-1, 6-4 on Saturday.


# 14. Kesimpulan


### Sentence Ranking menghasilkan ringkasan yang sangat singkat dan langsung pada inti informasi utama karena hanya satu kalimat dengan peringkat tertinggi yang ditampilkan, sedangkan TF-IDF menghasilkan ringkasan yang lebih panjang dan kaya konteks karena menampilkan lima kalimat dengan skor tertinggi berdasarkan bobot kata penting, namun kurang ringkas dan cenderung mengandung detail tambahan.