In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
url = "https://edition.cnn.com/"

# Reset up front: if the request below fails, later cells must not pick up a
# stale `headers` list from an earlier run (or hit a NameError on a fresh kernel).
headers = []

# Without a timeout, requests.get() can block indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Navigation entries are the <a> tags carrying this CSS class.
    header_links = soup.find_all('a', class_='header__nav-item-link')

    # Keep only the visible label of each link, without surrounding whitespace.
    headers = [link.text.strip() for link in header_links]

    for header in headers:
        print(header)
else:
    print("Failed to retrieve the web page")


US
World
Politics
Business
Opinion
Health
Entertainment
Style
Travel
Sports
Video
More


In [3]:
# Drop the "More" entry, which is the last item of the navigation list printed
# above. Filtering by value (instead of slicing with [:-1]) keeps this cell
# idempotent: re-running it does not strip one more real section each time.
headers = [header for header in headers if header != "More"]

In [4]:
# New list holding every element of `headers` except its last one; `headers`
# itself is left untouched, so this cell can be re-run safely.
headers1 = headers[:-1]

In [5]:
# Lower-cased copy of the section names; the crawl cell appends these to the
# site root to build each section URL.
headers2 = list(map(str.lower, headers1))
print(headers2)

['us', 'world', 'politics', 'business', 'opinion', 'health', 'entertainment', 'style', 'travel', 'sports']


In [6]:
slash_url = "https://edition.cnn.com/"
noslash_url = "https://edition.cnn.com"  # site root without the trailing slash; prefixed to site-relative article hrefs

# Seconds to wait on any single request; without a timeout requests.get() can hang forever.
REQUEST_TIMEOUT = 30

# Exact class attribute of the article links on a section page. Adjust as needed
# if the site markup changes.
ARTICLE_LINK_CLASS = "container__link container__link--type-article container_lead-plus-headlines__link"


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with BeautifulSoup.

    Returns None when the request raises ``requests.RequestException``
    (timeout, connection error, ...) or the response status is not 200, so a
    single bad page cannot abort the crawl and discard what was collected.
    """
    try:
        page = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if page.status_code != 200:
        return None
    return BeautifulSoup(page.text, 'html.parser')


# One (section, headline, url, content) tuple per successfully fetched article.
all_headlines = []

for header in headers2:
    section_soup = _fetch_soup(slash_url + header)
    if section_soup is None:
        print(f"Failed to retrieve the web page for {header}")
        continue

    for link in section_soup.find_all('a', class_=ARTICLE_LINK_CLASS):
        article_url = link.get('href')
        if not article_url:
            # Anchor without an href: nothing to fetch. Previously this raised
            # AttributeError on None.startswith and killed the whole crawl.
            continue
        if not article_url.startswith('http'):
            article_url = noslash_url + article_url

        article_soup = _fetch_soup(article_url)
        if article_soup is None:
            print(f"Failed to retrieve content for {article_url}")
            continue

        # Article body = text of every <p> on the page, flattened onto one line.
        content_text = ' '.join(p.text for p in article_soup.find_all('p'))
        content_text = content_text.replace('\n', ' ')

        all_headlines.append((header, link.text.strip(), article_url, content_text))

# NOTE(review): some rows in the recorded output have a Headline that looks like
# a photo credit — possibly the same article linked twice on a page. Consider
# de-duplicating by URL before modelling; confirm against the data first.
df = pd.DataFrame(all_headlines, columns=["Header", "Headline", "URL", "Content"])

print(df)

Failed to retrieve content for https://edition.cnn.com/cnn-underscored/travel/what-is-allowed-in-carry-on-bags
Failed to retrieve content for https://edition.cnn.com/cnn-underscored/travel/tsa-battery-rules
Failed to retrieve content for https://edition.cnn.com/cnn-underscored/health-fitness/tennis-champion-coco-gauff-fitness-essentials
    Header                                           Headline  \
0       us                Alaska Department of Transportation   
1       us  A girl and 2 adults found dead after large Ala...   
2       us  Man who lived frugally leaves unexpected gift ...   
3       us  Shooter wounds 4 at Walmart near Dayton, Ohio,...   
4       us  Suspect in Colorado mass shooting that left 3 ...   
..     ...                                                ...   
212  style  Long-lost $26 million masterpiece found in kit...   
213  style                             Cindy Ord/Getty Images   
214  style  The former host of ‘Reading Rainbow’ used to e...   
215  style 

In [7]:
# Inspect the article text stored in the row at position 1.
df.iloc[1]['Content']

'       Three people have been found dead, including a young girl, and others are still missing after a large landslide covered a highway near a remote community in southeast Alaska on Monday,\xa0according to the state’s\xa0Department\xa0of Public Safety.           The landslide was reported Monday around 9 p.m. local time on the Zimovia Highway near Wrangell, and walloped three homes in its path, department officials said. Alaska’s Department of Transportation said the path grew to an estimated 450 feet wide and had a significant debris field.           A girl was found dead during initial search and rescue efforts Monday night, and the remains of two adults were located by a drone operator Tuesday and recovered, according to the department.           A woman was rescued from the slide area Tuesday morning, but officials believe two juveniles and an adult are still unaccounted for.           State troopers are leading ongoing search and rescue efforts,\xa0although the ground search wa

In [8]:
# Inspect the source URL stored in the row at position 1.
df.iloc[1]['URL']

'https://edition.cnn.com/2023/11/21/us/alaska-wrangell-landslide/index.html'

In [16]:
# Persist the scraped frame as CSV, leaving out the row index.
# NOTE(review): the target is an absolute Google Drive path; presumably Drive is
# mounted at /content/drive by a step not shown here — confirm before a fresh run.
df.to_csv('/content/drive/MyDrive/scraped_data.csv', index=False)

In [10]:
# (rows, columns) of the scraped frame.
df.shape

(217, 4)

In [11]:
# Number of rows per `Header` value, i.e. how many articles each section contributed.
df['Header'].value_counts()

world       75
business    35
us          28
politics    24
health      24
opinion     19
style       12
Name: Header, dtype: int64

In [12]:
SEED = 42  # shared by the train/test split and by every estimator that accepts a seed


def _combine_text(frame):
    """Return article body and headline joined into the one string fed to TF-IDF."""
    return frame['Content'] + ' ' + frame['Headline']


# Target: the site section each article was scraped from.
labels = df['Header']

X = df[['Content', 'Headline']]
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

vectorizer = TfidfVectorizer(max_features=10000)

# Fit the vocabulary on the training split only, then reuse it for the test
# split. The test matrix is built once here; it does not depend on the model.
X_train_tfidf = vectorizer.fit_transform(_combine_text(X_train))
X_test_tfidf = vectorizer.transform(_combine_text(X_test))

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0, 2.0]
}

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

param_grid_lr = {
    'C': [0.1, 1, 10],
    'max_iter': [50, 100, 200],
    'solver': ['liblinear', 'lbfgs']
}


# Seeded where the estimator supports it so a re-run reproduces the same numbers.
classifiers = {
    'RandomForest': (RandomForestClassifier(random_state=SEED), param_grid_rf),
    'NaiveBayes': (MultinomialNB(), param_grid_nb),
    'SVM': (SVC(), param_grid_svc),
    'LogisticRegression': (LogisticRegression(random_state=SEED), param_grid_lr)
}

# Fitted search per model name, kept so the overall winner can be picked afterwards.
searches = {}

for clf_name, (clf, param_grid) in classifiers.items():
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_tfidf, y_train)
    searches[clf_name] = grid_search

    print(f"Best Parameters for {clf_name}: ", grid_search.best_params_)
    print(f"Best Accuracy for {clf_name}: ", grid_search.best_score_)

    best_classifier = grid_search.best_estimator_

    y_pred = best_classifier.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f'Accuracy for {clf_name}: {accuracy}')
    print(classification_rep)
    print("\n")

# Leave `grid_search` / `best_classifier` bound to the model with the highest
# cross-validated accuracy. Previously they simply held the last model in the
# dict, which any later "best model" cell would then report by mistake.
# Selection uses the CV score, not test accuracy, to avoid tuning on the test set.
best_clf_name = max(searches, key=lambda name: searches[name].best_score_)
grid_search = searches[best_clf_name]
best_classifier = grid_search.best_estimator_
print(f"Selected by cross-validated accuracy: {best_clf_name}")

Best Parameters for RandomForest:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best Accuracy for RandomForest:  0.6872268907563026
Accuracy for RandomForest: 0.6136363636363636
              precision    recall  f1-score   support

    business       0.90      1.00      0.95         9
      health       1.00      0.33      0.50         6
     opinion       1.00      1.00      1.00         3
    politics       0.67      0.29      0.40         7
       style       0.50      0.33      0.40         3
          us       0.00      0.00      0.00         5
       world       0.43      0.91      0.59        11

    accuracy                           0.61        44
   macro avg       0.64      0.55      0.55        44
weighted avg       0.64      0.61      0.57        44



Best Parameters for NaiveBayes:  {'alpha': 0.1}
Best Accuracy for NaiveBayes:  0.6065546218487395
Accuracy for NaiveBayes: 0.6136363636363636
              precision    recall  f1-sco

In [13]:
# Score the estimator held by `grid_search` on the held-out split and save the
# per-row predictions.
# NOTE(review): `grid_search` is whatever the tuning cell left bound when it
# finished — confirm that is the intended "best" model and not merely the last
# one fitted.
best_classifier = grid_search.best_estimator_
y_pred_best = best_classifier.predict(X_test_tfidf)

# Overall accuracy plus the per-class precision/recall/F1 table.
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)

# Actual vs. predicted label per test row; the frame keeps y_test's index.
results_df_best = y_test.to_frame(name='Actual').assign(Predicted=y_pred_best)

results_df_best.to_excel('/content/drive/MyDrive/best_result.xlsx', index=False)

print(f'Accuracy for Best Model: {accuracy_best}')
print(classification_rep_best)

Accuracy for Best Model: 0.7045454545454546
              precision    recall  f1-score   support

    business       0.90      1.00      0.95         9
      health       1.00      0.33      0.50         6
     opinion       1.00      1.00      1.00         3
    politics       0.75      0.43      0.55         7
       style       0.50      0.33      0.40         3
          us       0.75      0.60      0.67         5
       world       0.53      0.91      0.67        11

    accuracy                           0.70        44
   macro avg       0.78      0.66      0.68        44
weighted avg       0.76      0.70      0.69        44



In [15]:
# Display the actual-vs-predicted frame.
# NOTE(review): the saved output under this cell reads like `DataFrame.info()`
# output rather than the frame's own repr — possibly stale; re-run to confirm.
results_df_best

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 205 to 93
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Actual     44 non-null     object
 1   Predicted  44 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB
