In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
file_path=('/content/drive/MyDrive/Datasets/BBC News Train.csv')
df=pd.read_csv(file_path)
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [5]:
df.isnull().sum()

Unnamed: 0,0
ArticleId,0
Text,0
Category,0


In [27]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
sport,346
business,336
politics,274
entertainment,273
tech,261


# Normalize the text

In [8]:
import string
def normalize_text(text):
  """Return `text` lower-cased with every `string.punctuation` character removed."""
  # Translation table that maps nothing and deletes all ASCII punctuation.
  punctuation_eraser = str.maketrans('', '', string.punctuation)
  lowered = text.lower()
  return lowered.translate(punctuation_eraser)


Tokenization and stop word removal

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
#define a function to remove stop words
def remove_stopwords(text):
  """Tokenize `text` and return the list of tokens that are not English stop words."""
  english_stops = set(stopwords.words('english'))
  kept_tokens = []
  # Walk the tokens in order and keep only those outside the stop-word set.
  for token in word_tokenize(text):
    if token in english_stops:
      continue
    kept_tokens.append(token)
  return kept_tokens

# df['Text']=df['Text'].apply(remove_stopwords)
# df.head()

Apply stemming and lemmatization to the tokenized text data.

In [11]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [12]:
def apply_stemming(words):
  """Return a new list holding the Porter stem of each entry in `words`."""
  porter = PorterStemmer()
  stems = []
  for token in words:
    stems.append(porter.stem(token))
  return stems

def apply_lemmatization(words):
  """Return a new list holding the WordNet lemma of each entry in `words`."""
  wordnet = WordNetLemmatizer()
  # `lemmatize` is called with its default part-of-speech argument for every token.
  return list(map(wordnet.lemmatize, words))







In [13]:
# Clean each article, then tokenize it and drop stop words.
# Note: after this line `Text` holds a list of tokens per row, not the raw string.
df['Text'] = df['Text'].apply(normalize_text).apply(remove_stopwords)

# Stem / lemmatize the token lists, then glue each result back into a single
# space-separated string per row.
for column_name, token_transform in (('text_stemmed', apply_stemming),
                                     ('text_lemmatized', apply_lemmatization)):
  df[column_name] = df['Text'].apply(token_transform).apply(lambda tokens: ' '.join(tokens))

df.head()

Unnamed: 0,ArticleId,Text,Category,text_stemmed,text_lemmatized
0,1833,"[worldcom, exboss, launches, defence, lawyers,...",business,worldcom exboss launch defenc lawyer defend fo...,worldcom exboss launch defence lawyer defendin...
1,154,"[german, business, confidence, slides, german,...",business,german busi confid slide german busi confid fe...,german business confidence slide german busine...
2,1101,"[bbc, poll, indicates, economic, gloom, citize...",business,bbc poll indic econom gloom citizen major nati...,bbc poll indicates economic gloom citizen majo...
3,1976,"[lifestyle, governs, mobile, choice, faster, b...",tech,lifestyl govern mobil choic faster better funk...,lifestyle governs mobile choice faster better ...
4,917,"[enron, bosses, 168m, payout, eighteen, former...",business,enron boss 168m payout eighteen former enron d...,enron boss 168m payout eighteen former enron d...


Transform the processed text data into numerical vectors using
CountVectorizer.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the lemmatized articles and count term occurrences.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['text_lemmatized'])

# Densify the count matrix into a DataFrame with one column per vocabulary term.
X_df = pd.DataFrame(
    data=x.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
X_df.head()


Unnamed: 0,00,000,0001,00051,000acre,000ayear,000bn,000m,000seater,000strong,...,zombie,zone,zonealarm,zoom,zooropa,zorro,zuluaga,zurich,zutons,zvonareva
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
def create_ngrams(text_data,n):
  """Count n-grams of exactly length `n` in `text_data`.

  A fresh CountVectorizer is fitted on `text_data`; the result is a dense
  DataFrame with one row per input document and one column per n-gram.
  """
  ngram_counter = CountVectorizer(ngram_range=(n, n))
  counts = ngram_counter.fit_transform(text_data)
  return pd.DataFrame(counts.toarray(), columns=ngram_counter.get_feature_names_out())




# Build 1-, 2- and 3-gram count tables from the lemmatized text and preview each.
lemmatized_corpus = df['text_lemmatized']

unigram = create_ngrams(lemmatized_corpus, 1)
print(unigram.head())

bigrams = create_ngrams(lemmatized_corpus, 2)
print(bigrams.head())

trigrams = create_ngrams(lemmatized_corpus, 3)
print(trigrams.head())



   00  000  0001  00051  000acre  000ayear  000bn  000m  000seater  000strong  \
0   0    0     0      0        0         0      0     0          0          0   
1   0    0     0      0        0         0      0     0          0          0   
2   0    1     0      0        0         0      0     0          0          0   
3   0    1     0      0        0         0      0     0          0          0   
4   0    0     0      0        0         0      0     0          0          0   

   ...  zombie  zone  zonealarm  zoom  zooropa  zorro  zuluaga  zurich  \
0  ...       0     0          0     0        0      0        0       0   
1  ...       0     0          0     0        0      0        0       0   
2  ...       0     0          0     0        0      0        0       0   
3  ...       0     0          0     0        0      0        0       0   
4  ...       0     0          0     0        0      0        0       0   

   zutons  zvonareva  
0       0          0  
1       0          0  

In [42]:
from sklearn.model_selection import train_test_split
# Labels are the article categories; features are the unigram counts.
target = df['Category']

# Hold out 20% of the rows for evaluation; the seed is fixed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    unigram,
    target,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {x_train.shape}")
print(f"X_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (1192, 24558)
X_test shape: (298, 24558)
y_train shape: (1192,)
y_test shape: (298,)


# Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
# Fit logistic regression on the training counts (iteration cap raised to 1000).
log_reg = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_pred = log_reg.predict(x_test)

# Report accuracy, per-class metrics and the confusion matrix on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9664429530201343
Classification Report:
               precision    recall  f1-score   support

     business       0.96      0.99      0.97        75
entertainment       0.94      1.00      0.97        46
     politics       0.96      0.93      0.95        56
        sport       0.97      1.00      0.98        63
         tech       1.00      0.91      0.95        58

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298

Confusion Matrix:
[[74  0  1  0  0]
 [ 0 46  0  0  0]
 [ 2  1 52  1  0]
 [ 0  0  0 63  0]
 [ 1  2  1  1 53]]


# Naive Bayes

In [44]:
# Fit multinomial Naive Bayes on the same training counts.
nb = MultinomialNB().fit(x_train, y_train)

# Score the held-out split.
y_pred_nb = nb.predict(x_test)

# Report accuracy, per-class metrics and the confusion matrix.
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:", classification_report(y_test, y_pred_nb), sep="\n")
print("Naive Bayes Confusion Matrix:", confusion_matrix(y_test, y_pred_nb), sep="\n")

Naive Bayes Accuracy: 0.9832214765100671
Naive Bayes Classification Report:
               precision    recall  f1-score   support

     business       0.99      0.97      0.98        75
entertainment       1.00      0.98      0.99        46
     politics       0.96      0.98      0.97        56
        sport       1.00      1.00      1.00        63
         tech       0.97      0.98      0.97        58

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298

Naive Bayes Confusion Matrix:
[[73  0  1  0  1]
 [ 0 45  0  0  1]
 [ 1  0 55  0  0]
 [ 0  0  0 63  0]
 [ 0  0  1  0 57]]


# Predictions

In [48]:
def classify_text(text, classifier):
    """Predict the news category of a raw text string.

    The text is run through the same preprocessing that produced the
    `text_lemmatized` column on which the module-level `vectorizer` was
    fitted: normalize -> remove stop words -> lemmatize -> join into one
    space-separated string.

    Args:
        text: Raw article text (str).
        classifier: A fitted estimator exposing `predict`, trained on the
            unigram counts of the lemmatized articles.

    Returns:
        The first (and only) element of the classifier's prediction for
        the single input document.
    """
    # Mirror the training pipeline exactly.
    tokens = remove_stopwords(normalize_text(text))
    lemmas = apply_lemmatization(tokens)

    # Deliberately no stemming and no `create_ngrams` call here: the model was
    # trained on lemmatized (not stemmed) text, and `create_ngrams` fits a new
    # vectorizer, which would discard term counts and ignore the trained
    # vocabulary.
    document = ' '.join(lemmas)

    # Encode the document with the already-fitted vectorizer so its columns
    # line up with the features the classifier was trained on.
    X_new = vectorizer.transform([document])

    return classifier.predict(X_new)[0]

# Example: classify one unseen paragraph with the trained Naive Bayes model.
sample_text = """The government has introduced a series of new environmental policies aimed at reducing carbon emissions and promoting renewable energy. These measures include stricter regulations on industrial pollution and incentives for businesses that invest in green technology. The move has been praised by environmental activists but criticized by some industry leaders who argue that the new regulations could impact economic growth."""

# `nb` is the MultinomialNB instance fitted in the Naive Bayes cell.
predicted_category = classify_text(sample_text,nb)
print(f"The predicted category is: {predicted_category}")

The predicted category is: politics


