**LOADING THE DATASET**

In [154]:
import pandas as pd
import numpy as np

In [155]:
data=pd.read_csv('/content/BBC News.csv')

In [156]:
data

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [157]:
data['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [158]:
data.isna().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

**DATA PRE-PROCESSING**

In [159]:
from bs4 import BeautifulSoup
import re

**REMOVING HTML STRIPS, SQUARE BRACKET, NOISY DATA**

In [164]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [165]:
data['Text'] = data['Text'].apply(strip_html)

In [166]:
data['Text'] = data['Text'].apply(remove_between_square_brackets)

In [167]:
data['Text'] = data['Text'].apply(denoise_text)

**REMOVE SPECIAL CHARACTER**

In [168]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    return text

In [169]:
data['Text'] = data['Text'].apply(remove_special_characters)

In [170]:
import nltk

**STEMMING AND LEMMATIZATION**

In [171]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [172]:
data['Text'] = data['Text'].apply(simple_stemmer)

In [173]:
from nltk.stem.wordnet import WordNetLemmatizer

In [174]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [175]:
#Lemmatizing the text
def lematize(text):
    lem=WordNetLemmatizer()
    text= ' '.join([lem.lemmatize(word) for word in text.split()])
    return text

In [176]:
data['Text'] = data['Text'].apply(lematize)

**TOKENIZATION AND STOPWORD REMOVAL**

In [177]:
from nltk.tokenize.toktok import ToktokTokenizer
#Tokenization of text
tokenizer1=ToktokTokenizer()

In [178]:
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [179]:
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [180]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer1.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [181]:
data['Text'] = data['Text'].apply(remove_stopwords)

**TEXT VECTORIZATION**

In [182]:
from sklearn.feature_extraction.text import TfidfVectorizer,ENGLISH_STOP_WORDS

In [183]:
vect = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS).fit(data.Text)

In [184]:
X=vect.transform(data.Text)

In [185]:
Text_new = pd.DataFrame(X.toarray(),columns=vect.get_feature_names_out())

In [186]:
Text_new

Unnamed: 0,00,000,0001,00051,000acr,000ayear,000bn,000m,000seater,000strong,...,zombi,zone,zonealarm,zoom,zooropa,zorro,zuluaga,zurich,zuton,zvonareva
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.024183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.018537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.0,0.033323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1486,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1487,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1488,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**MODEL BUILDING**

In [187]:
x=Text_new
y=data['Category']

In [188]:
from sklearn.model_selection import train_test_split

In [189]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

**LOGISTIC TREGRESSION**

In [190]:
from sklearn.linear_model import LogisticRegression

In [191]:
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)

In [192]:
from sklearn.metrics import accuracy_score
print('ACCURACY IS',accuracy_score(y_test,y_pred))

ACCURACY IS 0.9697986577181208


**DECISION TREE CLASSIFIER**

In [193]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier=DecisionTreeClassifier()
model1=dt_classifier.fit(x_train,y_train)
predictions=model1.predict(x_test)

In [194]:
print('ACCURACY IS',accuracy_score(y_test,predictions))

ACCURACY IS 0.8523489932885906


**RANDOM FOREST CLASSIFIER**

In [197]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier=RandomForestClassifier()
model2=rf_classifier.fit(x_train,y_train)
predictions=model2.predict(x_test)

In [198]:
print('ACCURACY IS',accuracy_score(y_test,predictions))

ACCURACY IS 0.9731543624161074


In [None]:
## Random Forest classifier gives highest accuracy value.