In [2]:
# Install necessary libraries
!pip install feedparser nltk sqlalchemy pandas scikit-learn

# Import libraries
import feedparser
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import string
from nltk.corpus import stopwords
import sqlite3
import os

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return " ".join(tokens)

# Feed parsing and data extraction
def parse_feeds(feeds):
    articles = []
    for feed in feeds:
        try:
            parsed_feed = feedparser.parse(feed)
            for entry in parsed_feed.entries:
                content = entry.summary if 'summary' in entry else entry.description if 'description' in entry else ''
                processed_content = preprocess_text(content)
                articles.append({
                    'title': entry.title,
                    'content': processed_content,
                    'published': entry.published if 'published' in entry else 'N/A',
                    'link': entry.link
                })
        except Exception as e:
            print(f"Error parsing feed {feed}: {e}")
    return articles

# Improved categorization function with expanded keywords
def categorize_article(article):
    terrorism_keywords = ['terrorism', 'protest', 'political unrest', 'riot', 'bomb', 'attack', 'violence', 'terrorist', 'extremist']
    positive_keywords = ['positive', 'uplifting', 'success', 'achievement', 'award', 'celebration', 'milestone', 'good news', 'happy']
    disaster_keywords = ['disaster', 'earthquake', 'flood', 'hurricane', 'natural disaster', 'tsunami', 'wildfire', 'storm', 'storming', 'damage']

    # Check for category based on keywords
    if any(keyword in article for keyword in terrorism_keywords):
        return 'Terrorism / protest / political unrest / riot'
    elif any(keyword in article for keyword in positive_keywords):
        return 'Positive/Uplifting'
    elif any(keyword in article for keyword in disaster_keywords):
        return 'Natural Disasters'
    else:
        return 'Others'

# Define RSS feeds
rss_feeds = [
    'http://rss.cnn.com/rss/cnn_topstories.rss',
    'http://qz.com/feed',
    'http://feeds.foxnews.com/foxnews/politics',
    'http://feeds.reuters.com/reuters/businessNews',
    'http://feeds.feedburner.com/NewshourWorld',
    'https://feeds.bbci.co.uk/news/world/asia/india/rss.xml'
]

# Parse feeds and categorize articles
articles = parse_feeds(rss_feeds)

# Create a DataFrame
df = pd.DataFrame(articles)
df['category'] = df['content'].apply(categorize_article)

# Check category distribution
print("Initial Category Distribution:")
print(df['category'].value_counts())

# Setup SQLite database
db_path = 'news_articles.db'
if os.path.exists(db_path):
    os.remove(db_path)  # Remove existing database for fresh start

conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create table to store news articles
cursor.execute('''
    CREATE TABLE IF NOT EXISTS news_articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT UNIQUE,
        content TEXT,
        published TEXT,
        link TEXT,
        category TEXT
    )
''')
conn.commit()

# Function to insert articles into database, avoiding duplicates
def save_article_to_db(title, content, published, link, category):
    try:
        cursor.execute('''
            INSERT INTO news_articles (title, content, published, link, category)
            VALUES (?, ?, ?, ?, ?)
        ''', (title, content, published, link, category))
        conn.commit()
    except sqlite3.IntegrityError:
        pass  # Duplicate entry, ignore

# Save articles to database
for index, row in df.iterrows():
    save_article_to_db(row['title'], row['content'], row['published'], row['link'], row['category'])

# Fetch data from database
cursor.execute("SELECT title, content, category FROM news_articles")
rows = cursor.fetchall()

# Create a new DataFrame from database
df_db = pd.DataFrame(rows, columns=['title', 'content', 'category'])

# Check category distribution after database fetch
print("\nDatabase Category Distribution:")
print(df_db['category'].value_counts())

# Check if there are enough data points for training
if df_db['category'].nunique() < 2:
    print("Not enough categories for training a classifier.")
else:
    # Check class distribution
    print("\nClass Distribution:")
    print(df_db['category'].value_counts())

    # Handle class imbalance by using class weights in the classifier
    X = df_db['content']
    y = df_db['category']

    # Splitting the data for training and testing with stratification
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Vectorize the text data with bi-grams
    vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train a Logistic Regression classifier with class weights
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X_train_vec, y_train)

    # Predicting categories for the test set
    y_pred = model.predict(X_test_vec)

    # Evaluate the model
    print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Apply the model to all data
    df_db['predicted_category'] = model.predict(vectorizer.transform(df_db['content']))

    # Save the categorized articles to a CSV file
    df_db.to_csv('classified_news_articles.csv', index=False)
    print("\nNews articles categorized and saved to 'classified_news_articles.csv'")

# Display the categorized DataFrame
df_categorized = pd.read_csv('classified_news_articles.csv')
print("\nSample of Categorized Articles:")
print(df_categorized.head())

# Download the CSV file (only works in Google Colab)
from google.colab import files
files.download('classified_news_articles.csv')

# Close the database connection
conn.close()




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Initial Category Distribution:
category
Others                                           175
Natural Disasters                                  9
Positive/Uplifting                                 6
Terrorism / protest / political unrest / riot      5
Name: count, dtype: int64

Database Category Distribution:
category
Others                                           175
Natural Disasters                                  9
Positive/Uplifting                                 6
Terrorism / protest / political unrest / riot      5
Name: count, dtype: int64

Class Distribution:
category
Others                                           175
Natural Disasters                                  9
Positive/Uplifting                                 6
Terrorism / protest / political unrest / riot      5
Name: count, dtype: int64

Accuracy: 0.8813559322033898

Classification Report:
                                               precision    recall  f1-score   support

                            Natu

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>