In [1]:
import pandas as pd
import nltk
import string
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the scraped headlines CSV into `df`.
# NOTE(review): the path is absolute (looks like a Colab /content path) - adjust it when running elsewhere.
df=pd.read_csv('/content/bbc_news_scraping.csv')

In [3]:
# Discard the 'Unnamed: 0' column (presumably an index written out with the CSV).
# Re-assigning with errors='ignore' instead of inplace=True keeps this cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
df.head()

Unnamed: 0,Title,Link,Category
0,Biggest Russian bombardment of war kills 30 in...,/news/world-europe-67843312,['hero1|headline']
1,Russian missile 'flew into Poland then Ukraine',/news/world-europe-67839340,['hero2|headline']
2,Moment baby pulled alive from rubble after air...,/news/world-middle-east-67843832,['hero3|headline']
3,Broad and Lionesses recognised in New Year Hon...,/sport/67835939,['hero4|headline']
4,Nasa mission lines up to 'touch the Sun',/news/science-environment-67837161,['hero5|headline']


In [5]:
# Build the English stopword set. The result is bound to `stopwords`, which shadows the
# corpus reader imported at the top of the notebook; resolving the reader through
# `nltk.corpus` keeps this cell re-runnable (on a second run the bare name is already a set
# and has no `.words`).
stopwords=set(nltk.corpus.stopwords.words('english'))

In [6]:
#Removing stopwords
def clean(text):
  """Drop stopwords from a whitespace-separated string.

  Each token is compared case-insensitively against the module-level
  `stopwords` collection. Surviving tokens keep their original casing
  and are re-joined with single spaces.
  """
  kept = []
  for token in text.split():
    if token.lower() in stopwords:
      continue
    kept.append(token)
  return ' '.join(kept)

In [7]:
df['Title']=df['Title'].apply(clean)

In [8]:
df.head()

Unnamed: 0,Title,Link,Category
0,Biggest Russian bombardment war kills 30 Ukraine,/news/world-europe-67843312,['hero1|headline']
1,Russian missile 'flew Poland Ukraine',/news/world-europe-67839340,['hero2|headline']
2,Moment baby pulled alive rubble air strike Gaza,/news/world-middle-east-67843832,['hero3|headline']
3,Broad Lionesses recognised New Year Honours,/sport/67835939,['hero4|headline']
4,Nasa mission lines 'touch Sun',/news/science-environment-67837161,['hero5|headline']


In [9]:
# Characters removed from titles by clean_punc: the punctuation set from the `string` module.
# Both names are kept bound to the same string.
punctuations_list = english_punctuations = string.punctuation

In [10]:
#Removing punctuation
def clean_punc(text):
  """Return `text` without any character that appears in `punctuations_list`."""
  kept_chars = [ch for ch in text if ch not in punctuations_list]
  return ''.join(kept_chars)

In [11]:
# Strip punctuation from every title; the function is passed directly, no lambda wrapper needed.
df['Title'] = df['Title'].apply(clean_punc)

In [12]:
df.head()

Unnamed: 0,Title,Link,Category
0,Biggest Russian bombardment war kills 30 Ukraine,/news/world-europe-67843312,['hero1|headline']
1,Russian missile flew Poland Ukraine,/news/world-europe-67839340,['hero2|headline']
2,Moment baby pulled alive rubble air strike Gaza,/news/world-middle-east-67843832,['hero3|headline']
3,Broad Lionesses recognised New Year Honours,/sport/67835939,['hero4|headline']
4,Nasa mission lines touch Sun,/news/science-environment-67837161,['hero5|headline']


In [13]:
#Tokenization
tokenizer=RegexpTokenizer(r'\w+')

In [14]:
# Replace each title string with its list of \w+ tokens.
# NOTE(review): the column now holds lists instead of strings, so re-running this cell
# would feed lists back into the tokenizer - confirm before re-executing out of order.
df['Title']=df['Title'].apply(tokenizer.tokenize)

In [15]:
df.head()

Unnamed: 0,Title,Link,Category
0,"[Biggest, Russian, bombardment, war, kills, 30...",/news/world-europe-67843312,['hero1|headline']
1,"[Russian, missile, flew, Poland, Ukraine]",/news/world-europe-67839340,['hero2|headline']
2,"[Moment, baby, pulled, alive, rubble, air, str...",/news/world-middle-east-67843832,['hero3|headline']
3,"[Broad, Lionesses, recognised, New, Year, Hono...",/sport/67835939,['hero4|headline']
4,"[Nasa, mission, lines, touch, Sun]",/news/science-environment-67837161,['hero5|headline']


In [16]:
#Stemming
st=PorterStemmer()

In [17]:
def stemming(text):
  """Stem every token in `text` with the shared stemmer `st` and join the stems with spaces."""
  return ' '.join(st.stem(token) for token in text)

In [18]:
df['Title']=df['Title'].apply(stemming)

In [19]:
df.head()

Unnamed: 0,Title,Link,Category
0,biggest russian bombard war kill 30 ukrain,/news/world-europe-67843312,['hero1|headline']
1,russian missil flew poland ukrain,/news/world-europe-67839340,['hero2|headline']
2,moment babi pull aliv rubbl air strike gaza,/news/world-middle-east-67843832,['hero3|headline']
3,broad lioness recognis new year honour,/sport/67835939,['hero4|headline']
4,nasa mission line touch sun,/news/science-environment-67837161,['hero5|headline']


In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [21]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Title'])

In [22]:
dense_tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [23]:
num_clusters = 4

# Apply K-means clustering.
# n_init is pinned explicitly: its default differs between scikit-learn releases, and the
# integer cluster ids produced here are mapped to hand-written topic names further down,
# so the fit should not silently change with the installed version.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)
cluster_labels



array([3, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1,
       1, 0, 1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1], dtype=int32)

In [24]:
df['Topic']=cluster_labels

In [25]:
# Topic names for the four K-means cluster ids. The ids only have meaning for the particular
# fit that produced `cluster_labels`, so revisit this mapping whenever the clustering changes.
cluster_dictionary={0:'Photos',1:'Others',2:'Watch_Video',3:'Russia'}

In [26]:
df['Topic']=df['Topic'].replace(cluster_dictionary)

In [27]:
df.head()

Unnamed: 0,Title,Link,Category,Topic
0,biggest russian bombard war kill 30 ukrain,/news/world-europe-67843312,['hero1|headline'],Russia
1,russian missil flew poland ukrain,/news/world-europe-67839340,['hero2|headline'],Russia
2,moment babi pull aliv rubbl air strike gaza,/news/world-middle-east-67843832,['hero3|headline'],Photos
3,broad lioness recognis new year honour,/sport/67835939,['hero4|headline'],Others
4,nasa mission line touch sun,/news/science-environment-67837161,['hero5|headline'],Others


In [28]:
# Hold out 20% of the titles. The Topic labels are the K-means cluster names assigned
# earlier in this notebook, so the scores below measure agreement with that clustering
# rather than with human-provided labels.
X_train, X_test, y_train, y_test = train_test_split(df['Title'], df['Topic'], test_size=0.2, random_state=42)

# Fit a fresh vectorizer on the training titles only (this rebinds `tfidf_vectorizer`).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
# zero_division=0 reports 0.0 for classes that are never predicted (the same numbers the
# default produces) without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

      Others       0.90      1.00      0.95         9
      Russia       0.00      0.00      0.00         1

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Persist the fitted classifier and its TF-IDF vectorizer together as a single tuple.
# The vectorizer was fit on titles that had stopwords and punctuation removed and were
# stemmed, so new text needs the same preprocessing before calling transform().
joblib.dump((classifier,tfidf_vectorizer),'news_classifier.joblib')

['news_classifier.joblib']