# Text Mining Project

This project performs basic text mining on an online news article. It downloads the article text from a user-provided URL, processes it using simple natural language techniques, summarizes the content, extracts key points, and allows the user to ask questions about the article.

---

## Main Steps

1. **Download article text**
   The program uses the `trafilatura` library to fetch and extract readable text from a webpage.

2. **Tokenize the text**
   Using NLTK, the article is split into sentences and words for further processing.

3. **Generate a summary**
   The first few sentences of the article are returned as a basic extractive summary.

4. **Identify key points**
   The program selects the most informative sentences by scoring them based on the number of unique words they contain.

5. **Answer user questions**
   When the user enters a question, the program compares the question words with each sentence in the article and returns the sentence with the highest overlap.

6. **Interactive mode**
   The user can ask multiple questions until choosing to stop.


### Before trying it out, run this in a separate cell: `!pip install trafilatura nltk`

In [None]:
!pip install trafilatura nltk



In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import trafilatura

# Fetch the "punkt_tab" NLTK data package when this cell runs; the tokenizers
# imported above presumably depend on it (confirm for the installed NLTK version).
nltk.download("punkt_tab")

# Get article
def extract_article_text(url):
    """Download a web page and return its main readable text.

    Args:
        url: Address of the article to fetch.

    Returns:
        The extracted article text as a string, or an empty string when the
        page could not be downloaded or no text could be extracted.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # Nothing was fetched, so there is nothing to extract.
        return ""

    text = trafilatura.extract(downloaded)
    # A failed extraction yields None; normalise it to "" so callers always
    # receive a string.
    return text or ""


# Summary
def summary(sentences, n=5):
    """Return the leading sentences as a naive extractive summary.

    Args:
        sentences: Sequence of sentences in article order.
        n: Upper bound of the slice taken from the start of ``sentences``.

    Returns:
        The slice ``sentences[:n]``; shorter than ``n`` when fewer sentences
        are available.
    """
    leading = sentences[:n]
    return leading

# Keypoints
def extract_keypoints(sentences, top_k=3):
    """Pick the sentences that contain the most distinct alphabetic words.

    Each sentence is tokenized, tokens are lower-cased, and only purely
    alphabetic tokens are kept; the sentence's score is the number of
    distinct words that remain.

    Args:
        sentences: Iterable of sentence strings.
        top_k: Upper bound of the slice taken from the ranked sentences.

    Returns:
        A list of sentences ordered by descending score. Sentences with
        equal scores keep their original relative order.
    """
    ranked = sorted(
        (
            (
                len(
                    {
                        token.lower()
                        for token in word_tokenize(sentence)
                        if token.lower().isalpha()
                    }
                ),
                sentence,
            )
            for sentence in sentences
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [sentence for _, sentence in ranked[:top_k]]

# Question answering
def _alpha_words(text):
    """Return the lower-cased, purely alphabetic tokens of ``text``, in order."""
    lowered = (token.lower() for token in word_tokenize(text))
    return [word for word in lowered if word.isalpha()]


def answer_question(question, sentences, top_k=1):
    """Return the sentences that share the most words with the question.

    A sentence's score is the number of question words (lower-cased,
    alphabetic tokens only) that also occur in the sentence. Sentences with
    a score of zero are never returned.

    Args:
        question: The user's question as free text.
        sentences: Sequence of article sentences to search.
        top_k: Maximum number of sentences to return. Defaults to 1, i.e.
            only the best match.

    Returns:
        A list of at most ``top_k`` sentences ordered by descending score;
        ties keep article order. Empty when nothing matches.
    """
    if top_k <= 0 or not sentences:
        return []

    q_words = _alpha_words(question)
    if not q_words:
        # No usable words in the question, so no sentence can score above 0.
        return []

    scored = []
    for sentence in sentences:
        # A set makes each membership test O(1) instead of scanning a list.
        vocabulary = set(_alpha_words(sentence))
        score = sum(1 for word in q_words if word in vocabulary)
        if score > 0:
            scored.append((score, sentence))

    # list.sort is stable (also with reverse=True), so equally scored
    # sentences stay in article order.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [sentence for _, sentence in scored[:top_k]]

def main():
    """Run the interactive text-mining demo on a user-supplied article URL.

    Prompts for a URL, extracts the article text, prints the first sentences
    and tokens, a short summary and the key points, then answers questions
    in a loop until the user submits an empty line.
    """
    print("Text mining from articles\n")

    # Stray whitespace around a pasted URL would make the download fail.
    url = input("Enter the article URL: ").strip()

    article = extract_article_text(url)

    # Test truthiness rather than equality with "": a failed extraction may
    # yield None, which must not reach the tokenizers below.
    if not article:
        print("Could not get article text. Try another URL.")
        return

    # Sentence tokenization
    print("\nSentence Tokenization")
    sentences = sent_tokenize(article)
    for s in sentences[:10]:  # show first 10 sentences
        print("-", s)

    # Word tokenization
    print("\nWord Tokenization")
    words = word_tokenize(article)
    print(words[:30])  # show first 30 words

    # Summary
    print("\nSummary")
    summary_text = summary(sentences, n=5)
    for s in summary_text:
        print("-", s)

    # Key points
    print("\nKey points")
    keypoints = extract_keypoints(sentences, top_k=3)
    for kp in keypoints:
        print("-", kp)

    # Question loop
    print("\nAsk questions about the article.")
    print("Press Enter on an empty line to stop.\n")

    while True:
        question = input("Your question: ").strip()
        if not question:
            break

        answers = answer_question(question, sentences)

        if answers:
            print("\nPossible answer:")
            for a in answers:
                print("-", a)
        else:
            print("No relevant answer found.")

# Start the interactive demo only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Text mining from articles

