In [None]:
import pandas as pd
import numpy as np
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm"). The resulting `nlp`
# callable is what preprocess() uses to tokenize text and to read each token's
# stop-word / punctuation flags and lemma.
nlp = spacy.load("en_core_web_sm")


In [None]:
# Read the raw articles and discard the 'published_at' and 'topic' columns in
# one chained expression (no in-place mutation), then preview the first rows.
df = pd.read_csv("news.csv").drop(columns=['published_at', 'topic'])
df.head()

Unnamed: 0,title,content,source
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",1
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,1
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,1


In [None]:
# Report the frame size before and after cleaning. Rows containing any
# missing value are removed first, then exact duplicate rows.
print(df.shape)
df = df.dropna().drop_duplicates()
print(df.shape)

(810, 3)
(806, 3)


In [None]:
# Fill any remaining gaps: numeric columns by their mean, then by their median,
# and finally every column by its most frequent value (first row of mode()).
#
# numeric_only=True is required because the frame also holds text columns
# (title, content): mean/median are undefined for them, older pandas only warned
# and skipped them, newer pandas raises TypeError.
#
# NOTE: the earlier cleaning cell already calls dropna(), so on the current data
# these fills have nothing left to replace; they act purely as a safeguard.
df = df.fillna(df.mean(numeric_only=True))
df = df.fillna(df.median(numeric_only=True))
df = df.fillna(df.mode().iloc[0])

  df = df.fillna(df.mean())
  df = df.fillna(df.median())


In [None]:
# Hold out 10% of the articles for evaluation. Features are the raw article
# text, labels are the source id; stratifying on the label keeps the class
# proportions equal in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["content"],
    df["source"],
    stratify=df["source"],
    random_state=2022,
    test_size=0.1,  # 10% test set
)

In [None]:
# Sanity-check how many samples ended up in each split.
for label, split in (("Shape of X_train: ", X_train), ("Shape of X_test: ", X_test)):
    print(label, split.shape)

Shape of X_train:  (725,)
Shape of X_test:  (81,)


In [None]:
# Baseline 1: TF-IDF features fed into a k-nearest-neighbours classifier,
# both with library-default settings.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.78      0.86      0.82        29
           2       0.85      0.76      0.80        29
           3       0.87      0.87      0.87        23

    accuracy                           0.83        81
   macro avg       0.83      0.83      0.83        81
weighted avg       0.83      0.83      0.83        81



In [None]:
# Baseline 2: TF-IDF features fed into a multinomial naive Bayes classifier,
# both with library-default settings.
clf = Pipeline(steps=[
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        29
           2       0.91      1.00      0.95        29
           3       1.00      0.87      0.93        23

    accuracy                           0.96        81
   macro avg       0.97      0.96      0.96        81
weighted avg       0.97      0.96      0.96        81



In [None]:
# Baseline 3: TF-IDF features fed into a random forest.
# The forest is seeded (same seed as the train/test split) because it is a
# randomized estimator: without random_state the reported scores change on
# every run and the notebook is not reproducible.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Print the classification report (per-class precision / recall / F1).
print(classification_report(y_test, y_pred))

In [None]:
# Text normalisation helper: stop-word / punctuation removal plus lemmatization
def preprocess(text):
    """Strip stop words and punctuation from ``text`` and lemmatize the rest.

    The text is parsed with the module-level ``nlp`` pipeline. Every token
    flagged as a stop word or as punctuation is set aside; every other token
    contributes its lemma.

    Returns:
        A 2-tuple ``(cleaned, dropped)`` where ``cleaned`` is the kept lemmas
        joined by single spaces and ``dropped`` is the list of token objects
        that were filtered out, in document order.
    """
    kept_lemmas = []
    dropped_tokens = []
    for tok in nlp(text):
        keep = not (tok.is_stop or tok.is_punct)
        if keep:
            kept_lemmas.append(tok.lemma_)
        else:
            dropped_tokens.append(tok)
    return " ".join(kept_lemmas), dropped_tokens

In [None]:
# Run the (expensive) spaCy-based preprocess() exactly once per article and
# split its (cleaned_text, removed_tokens) tuple into two columns, instead of
# parsing every article twice just to pick a different tuple element.
processed = df['content'].apply(preprocess)

df['Original_Content'] = df['content']
df['New_Content'] = processed.map(lambda pair: pair[0])    # cleaned, lemmatized text
df['Removed_Words'] = processed.map(lambda pair: pair[1])  # tokens that were filtered out

In [None]:
# Compare raw text, cleaned text and discarded tokens side by side (first rows).
df.head()[['Original_Content', 'New_Content', 'Removed_Words']]

Unnamed: 0,Original_Content,New_Content,Removed_Words
0,"After reaching his hotel in the city, RM revea...",reach hotel city RM reveal stay day add step d...,"[After, his, in, the, ,, that, his, would, be,..."
1,RM aka Kim Namjoon was the first member to joi...,RM aka Kim Namjoon member join BTS group relea...,"[was, the, first, to, ., The, their, on, ,, .,..."
2,"Billie Eilish's concert was held in Seoul, Sou...",Billie Eilish concert hold Seoul South Korea a...,"['s, was, in, ,, and, it, was, by, ', and, -, ..."
3,BTS ARMY y'all would be missing the members a ...,BTS ARMY you miss member lot right BTS member ...,"[all, would, be, the, a, ,, ?, Well, ,, one, o..."
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member Kim Seokjin aka Jin capacity create...,"[has, the, to, ., This, has, through, so, in, ..."


In [None]:
# Raw text of the article whose index label is 0.
df["content"][0]

'After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner. As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries. He said, "I\'m starving right now. I\'m out to grab some food. It\'s much quieter than I expected and feels like a rural town. I like the familiar atmosphere." RM attended Art Basel and explained on camera the details of the art fair. He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. Showing the pattern of a ping pong table, RM said, "The table looks like our (BTS) symbol." He also spoke about the art pieces as he viewed them. After that, RM took a tram to visit the Foundation Beyeler, a museum. He later took a walk through the city. On his third day, RM visited the Kunstmuseum Basel, the Vitra Design Museum and the gallery. As he walked around, RM showed a chair to his fans and said, "I have breaking news for you guys. Coldplay

In [None]:
# Preprocessed text of the same article (index label 0), for comparison.
df["New_Content"][0]

"reach hotel city RM reveal stay day add step dinner sit roadside open air restaurant RM feast beer burger fry say starve right grab food quieter expect feel like rural town like familiar atmosphere RM attend Art Basel explain camera detail art fair give glimpse noodle beer follow soup noodle wrap show pattern ping pong table RM say table look like BTS symbol speak art piece view RM take tram visit Foundation Beyeler museum later take walk city day RM visit Kunstmuseum Basel Vitra Design Museum gallery walk RM show chair fan say break news guy Coldplay Chris Martin chair display Vitra Design Museum Chris amazing RM visit Lucerne hike Mount Rigi recall previous visit Lucerne RM add remember day cross bridge buying souvenir remind Bon Voyage reality feature BTS member RM Jin Suga J Hope Jimin V Jungkook speak camera RM say ride SSB train Lucerne ride boat ride mountain train walk track road ride cable car boat plan ride SSB RM travel Switzerland end visit Museum Tinguely RM fly Paris att

In [None]:
# Build the model with the newly preprocessed text.
# Features are the cleaned text; the target must be the news source label.
# Passing the raw `content` column as the target would make every distinct
# article its own class, which yields a meaningless report and a huge,
# all-zero confusion matrix.
X_train, X_test, y_train, y_test = train_test_split(
    df.New_Content,
    df.source,
    test_size=0.10,  # 10% test set
    random_state=2022,
    stratify=df.source
)

In [None]:
# TF-IDF + random forest, trained on whatever X_train / y_train currently hold
# (the preprocessed-text split when the notebook is run top to bottom).
# The forest is seeded so the reported scores are reproducible across runs;
# an unseeded RandomForestClassifier gives different results each time.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the most recent predictions: rows are the true labels,
# columns are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# NOTE(review): this cell repeats the confusion-matrix computation of the
# preceding cell on the same y_test / y_pred; it recomputes and redisplays `cm`.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])