<a href="https://colab.research.google.com/github/syamil1/Tugas-Besar-2-Natural-Language-Processing/blob/main/Program.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources used further down: tokenizer models for
# word_tokenize and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
# The resource id is 'punkt_tab'. The previous id 'punktab' does not exist in
# the NLTK index, so that download always failed with "Package not found".
nltk.download('punkt_tab')

# Step 1: Load Dataset
# The file is resolved relative to the current working directory. Later steps
# read the 'title', 'description' and 'link' columns from it.
data = pd.read_csv('bbc_news.csv')

# Display basic information about the dataset
print("Dataset Summary:")
print(data.head())

# Step 2: Assign Specific Genres
# Ordered (genre, keywords) rules. Order matters: the first rule with a
# matching keyword decides the genre, exactly like an if/elif chain.
_GENRE_RULES = (
    # Politics
    ('Politics - Domestic', ('election', 'policy', 'parliament', 'government')),
    ('Politics - International Relations', ('diplomacy', 'foreign', 'relations', 'war')),
    # Sports
    ('Sports - Football', ('football', 'soccer', 'goal', 'match')),
    ('Sports - Basketball', ('basketball', 'nba', 'dunk', 'three-pointer')),
    ('Sports - Tennis', ('tennis', 'grand slam', 'serve')),
    # Technology
    ('Technology - Artificial Intelligence', ('ai', 'machine learning', 'neural network')),
    ('Technology - Gadgets', ('smartphone', 'gadget', 'software')),
    # Entertainment
    ('Entertainment - Movies', ('movie', 'cinema', 'box office')),
    ('Entertainment - Music', ('album', 'concert', 'music')),
    ('Entertainment - TV Shows', ('netflix', 'tv series', 'streaming')),
    # Business
    ('Business - Stock Market', ('stock', 'market', 'investment')),
    ('Business - Startups', ('startup', 'entrepreneur', 'funding')),
)


def _keyword_regex(keyword):
    """Return a regex fragment matching *keyword* or its simple plural."""
    if keyword.endswith('y'):
        # policy -> policy / policies
        return re.escape(keyword[:-1]) + '(?:y|ies)'
    # goal -> goal / goals, match -> match / matches
    return re.escape(keyword) + '(?:s|es)?'


# One pre-compiled pattern per rule. The \b anchors force whole-word matches,
# so 'ai' no longer fires on "said" and 'war' no longer fires on "award".
_GENRE_PATTERNS = tuple(
    (
        genre,
        re.compile(r'\b(?:' + '|'.join(_keyword_regex(k) for k in keywords) + r')\b'),
    )
    for genre, keywords in _GENRE_RULES
)


def assign_genre(description):
    """Return the genre label for a news description using keyword rules.

    The description is lower-cased and tested against each rule in
    ``_GENRE_RULES`` in order; the first rule containing a keyword that
    appears as a whole word (optionally in a simple plural form) wins.
    Derived words such as "footballer" or "musician" deliberately do not
    match, since whole-word matching is what prevents false hits.

    Args:
        description: Text of the news item. Non-string values (for example
            the NaN pandas uses for a missing cell) are accepted.

    Returns:
        The matching genre label, or ``'Other'`` when no keyword matches or
        the description is not a string.
    """
    if not isinstance(description, str):
        # Missing cells would otherwise crash on .lower().
        return 'Other'

    text = description.lower()  # Keywords are stored in lowercase
    for genre, pattern in _GENRE_PATTERNS:
        if pattern.search(text):
            return genre

    # Default to Other
    return 'Other'

# Label every row by running its description through the keyword rules.
data['genre'] = data['description'].map(assign_genre)

# Step 3: Preprocessing Data
_PUNCTUATION = frozenset(string.punctuation)
_STOP_WORDS = None  # English stop-word set, filled in on first use


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Tokenize, lowercase and filter *text*, returning a cleaned string.

    Tokens made up entirely of punctuation characters (including
    multi-character ones such as "..." or "``") and English stop words are
    dropped; the remaining tokens are joined with single spaces.

    Args:
        text: Raw text to clean. Non-string values (for example the NaN
            pandas uses for a missing cell) are accepted.

    Returns:
        The cleaned text, or an empty string when *text* is not a string.
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; treat missing cells as empty.
        return ''

    # Loaded once and reused, rather than rebuilt for every row.
    stop_words = _english_stop_words()

    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        # Check per character so multi-character punctuation is removed too.
        if all(char in _PUNCTUATION for char in token):
            continue
        if token in stop_words:
            continue
        kept.append(token)

    # Joining tokens back to string
    return ' '.join(kept)

# Clean every description before vectorization.
data['cleaned_text'] = data['description'].map(preprocess_text)

# Step 4: Splitting Data
# Features are the cleaned descriptions; targets are the assigned genres.
X, y = data['cleaned_text'], data['genre']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Feature Extraction
# Each vectorizer is fitted on the training split only and then reused to
# transform the test split.

# Bag-of-Words Representation
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 6: Model Training and Prediction (Using Naive Bayes as Example)
# A single estimator is refitted for each representation; predictions are
# taken right after each fit, before the next fit overwrites the model.
model = MultinomialNB()

# Training with Bag-of-Words
y_pred_bow = model.fit(X_train_bow, y_train).predict(X_test_bow)

# Training with TF-IDF
y_pred_tfidf = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Step 7: Evaluation
def evaluate_model(y_test, y_pred):
    """Print accuracy, weighted precision/recall/F1 and a per-class report.

    Args:
        y_test: True labels of the evaluation samples.
        y_pred: Labels predicted for the same samples.

    Precision is undefined for a class that is never predicted. Passing
    ``zero_division=0`` reports such scores as 0 (the same value the default
    produces) without emitting an UndefinedMetricWarning for each call.
    """
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='weighted', zero_division=0))
    print("Recall:", recall_score(y_test, y_pred, average='weighted', zero_division=0))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted', zero_division=0))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Report the metrics for both feature representations against the same
# held-out labels.
for heading, predictions in (
    ("\nEvaluation with Bag-of-Words:", y_pred_bow),
    ("\nEvaluation with TF-IDF:", y_pred_tfidf),
):
    print(heading)
    evaluate_model(y_test, predictions)

# Persist the frame, including the added genre and cleaned-text columns,
# without writing the row index.
data.to_csv('processed_news_data.csv', index=False)
print("Processed dataset saved as 'processed_news_data.csv'.")

# Step 8: View Data by Genre
def view_data_by_genre():
    """Prompt for a genre and print the rows of ``data`` that carry it.

    Lists the distinct values of ``data['genre']`` as a numbered menu, reads
    one selection from standard input and prints the 'title', 'description'
    and 'link' columns of the matching rows. Input that is not a whole number
    or is outside the menu range prints an error message instead of raising.
    """
    unique_genres = data['genre'].unique()
    print("Available Genres:")
    for i, genre in enumerate(unique_genres, start=1):
        print(f"{i}. {genre}")

    raw_choice = input("Select a genre by number: ")
    try:
        choice = int(raw_choice)
    except ValueError:
        # Non-numeric input used to escape as an uncaught ValueError.
        print("Invalid choice. Please try again.")
        return

    if not 1 <= choice <= len(unique_genres):
        print("Invalid choice. Please try again.")
        return

    # The menu is 1-based; the genre array is 0-based.
    selected_genre = unique_genres[choice - 1]
    filtered_data = data[data['genre'] == selected_genre]
    print(f"\nData for Genre: {selected_genre}")
    print(filtered_data[['title', 'description', 'link']])


view_data_by_genre()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading punktab: Package 'punktab' not found in
[nltk_data]     index


Dataset Summary:
                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food'   
3  Manchester Arena bombing: Saffie Roussos's par...   
4  Ukraine conflict: Oil price soars to highest l...   

                         pubDate  \
0  Mon, 07 Mar 2022 08:01:56 GMT   
1  Sun, 06 Mar 2022 22:49:58 GMT   
2  Mon, 07 Mar 2022 00:14:42 GMT   
3  Mon, 07 Mar 2022 00:05:40 GMT   
4  Mon, 07 Mar 2022 08:15:53 GMT   

                                               guid  \
0  https://www.bbc.co.uk/news/world-europe-60638042   
1  https://www.bbc.co.uk/news/world-europe-60641873   
2      https://www.bbc.co.uk/news/business-60623941   
3            https://www.bbc.co.uk/news/uk-60579079   
4      https://www.bbc.co.uk/news/business-60642786   

                                                link  \
0  https://www.bbc.co.uk/news/world-europe

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall: 0.8264276386085717
F1 Score: 0.8145668625876304


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
                                       precision    recall  f1-score   support

                 Business - Startups       0.00      0.00      0.00        16
             Business - Stock Market       1.00      0.02      0.05        42
              Entertainment - Movies       0.00      0.00      0.00        12
               Entertainment - Music       0.86      0.09      0.16        67
            Entertainment - TV Shows       0.00      0.00      0.00        14
                               Other       0.86      0.93      0.89      5122
                 Politics - Domestic       0.81      0.84      0.82       407
  Politics - International Relations       0.86      0.50      0.63       504
                 Sports - Basketball       1.00      0.10      0.18        10
                   Sports - Football       0.57      0.78      0.66       304
                     Sports - Tennis       0.80      0.08      0.15        49
Technology - Artificial Intelligence  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
                                       precision    recall  f1-score   support

                 Business - Startups       0.00      0.00      0.00        16
             Business - Stock Market       0.00      0.00      0.00        42
              Entertainment - Movies       0.00      0.00      0.00        12
               Entertainment - Music       0.00      0.00      0.00        67
            Entertainment - TV Shows       0.00      0.00      0.00        14
                               Other       0.66      1.00      0.79      5122
                 Politics - Domestic       1.00      0.14      0.25       407
  Politics - International Relations       0.83      0.01      0.02       504
                 Sports - Basketball       0.00      0.00      0.00        10
                   Sports - Football       1.00      0.07      0.12       304
                     Sports - Tennis       0.00      0.00      0.00        49
Technology - Artificial Intelligence  