# **Packages Import**



In [None]:
import re
import nltk
import pandas as pd
from nltk.corpus                         import stopwords
from sklearn.feature_extraction.text     import TfidfVectorizer
from sklearn.model_selection             import train_test_split
from sklearn.neural_network              import MLPClassifier
from sklearn.svm                         import SVC
from sklearn.naive_bayes                 import GaussianNB
from sklearn.metrics                     import confusion_matrix,classification_report
from nltk.tokenize                       import word_tokenize
from nltk.corpus                         import stopwords
from nltk.stem                           import WordNetLemmatizer
from sklearn.preprocessing               import LabelEncoder
# Fetch the NLTK stop-word corpus; stopwords.words('english') is used later
# when the TF-IDF vectorizer is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Business & Data Understanding**

In [None]:
# Read the BBC news CSV into a DataFrame and preview the first rows.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — adjust
# it when running in another environment.
df=pd.read_csv("/content/bbc-text.csv")
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
# Number of articles per category, to check class balance.
df["category"].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [None]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


# **Data Preparation**

In [None]:
# Column 1 holds the article text (model input); column 0 holds the category
# (prediction target).
features, labels = df.iloc[:, 1], df.iloc[:, 0]


# **Data Cleaning**

In [None]:
# Normalise every article: keep letters only, drop isolated single letters,
# collapse runs of whitespace and lower-case the result.
#
# Exactly one cleaned string is produced per input row so that
# `tidy_features` stays aligned with `labels`. Non-string entries (e.g. NaN)
# become an empty document instead of being skipped, because skipping would
# shift every later document onto the wrong label.
tidy_features = []
for raw_text in features:  # iterate values; does not rely on a 0..n-1 index
    if not isinstance(raw_text, str):
        tidy_features.append('')
        continue
    tmp = re.sub(r'[^a-zA-Z]', ' ', raw_text)   # non-letters -> space
    tmp = re.sub(r'\s[a-zA-Z]\s', ' ', tmp)     # stray single letters
    tmp = re.sub(r'\s+', ' ', tmp)              # collapse whitespace
    tmp = tmp.lower()
    tidy_features.append(tmp)

# Guard the row-alignment invariant that the train/test split relies on.
assert len(tidy_features) == len(features)

print("******** Before **********")
print(features[0:4])
print("******** After **********")
print(tidy_features[0:4])


******** Before **********
0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
2    tigers wary of farrell  gamble  leicester say ...
3    yeading face newcastle in fa cup premiership s...
Name: text, dtype: object
******** After **********
['tv future in the hands of viewers with home theatre systems plasma high definition tvs and digital video recorders moving into the living room the way people watch tv will be radically different in five years time that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes with the us leading the trend programmes and other content will be delivered to viewers via home networks through cable satellite telecoms companies and broadband service providers to front rooms and portable devices one of the most talked about technologies of ces has been digital and personal vide

***TF-IDF Vectorization***

In [None]:
# Build the TF-IDF vocabulary: at most 2000 terms, each appearing in at least
# 7 documents and in no more than 80% of them, with English stop words removed.
vectorizer = TfidfVectorizer(
    max_features=2000,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)

# Fit on the cleaned corpus and convert the sparse result to a dense array.
X = vectorizer.fit_transform(tidy_features).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

***Data split***

In [None]:
# Hold out 20% of the documents for evaluation.
# random_state makes the split (and every metric derived from it) repeatable
# across kernel restarts; stratify keeps the category proportions the same in
# the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Candidate classifiers, all trained on the same TF-IDF features.
gnb = GaussianNB()
linear_svm = SVC(kernel='linear')
rbf_svm = SVC(kernel='rbf')
sigmoid_svm = SVC(kernel='sigmoid')
poly_svm = SVC(kernel='poly', degree=2)

# Two hidden layers (100 then 20 units) with logistic activations, optimised
# with Adam. random_state fixes weight initialisation and batch shuffling so
# the trained network is reproducible between runs.
neural = MLPClassifier(
    hidden_layer_sizes=(100, 20),
    activation='logistic',
    solver='adam',
    random_state=42,
)

In [None]:
gnb.fit(X_train,y_train) # Train Guassian NB classifier
linear_svm.fit(X_train,y_train) # Train SVM
rbf_svm.fit(X_train,y_train)
sigmoid_svm.fit(X_train,y_train)
ploy_svm.fit(X_train,y_train)
neural.fit(X_train,y_train) # Train Neural Network - finding the best weight matrix

In [None]:
y_nb=gnb.predict(X_test)
y_linear_svm=linear_svm.predict(X_test)
y_rbf_svm=rbf_svm.predict(X_test)
y_ploy_svm=ploy_svm.predict(X_test)
y_sigmoid_svm=sigmoid_svm.predict(X_test)
y_neural=neural.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Each entry pairs a report heading with that model's predictions for X_test;
# every report is scored against the true test labels in `y_test`.
reports = [
    ("Gaussian NB Classifier:", y_nb),
    ("Linear SVM:", y_linear_svm),
    ("SVM with RBF Kernel:", y_rbf_svm),
    ("SVM with Sigmoid Kernel:", y_sigmoid_svm),
    ("SVM with Polynomial Kernel:", y_ploy_svm),
    ("Neural Network:", y_neural),
]

# Print per-class precision / recall / F1 for every model, in the order above.
for heading, predicted in reports:
    print(heading)
    print(classification_report(y_test, predicted))



Gaussian NB Classifier:
               precision    recall  f1-score   support

     business       0.94      0.85      0.89       112
entertainment       0.96      0.89      0.92        79
     politics       0.91      0.91      0.91        87
        sport       0.99      0.95      0.97        91
         tech       0.75      0.96      0.84        76

     accuracy                           0.91       445
    macro avg       0.91      0.91      0.91       445
 weighted avg       0.92      0.91      0.91       445

Linear SVM:
               precision    recall  f1-score   support

     business       0.97      0.95      0.96       112
entertainment       0.96      1.00      0.98        79
     politics       0.98      0.95      0.97        87
        sport       1.00      1.00      1.00        91
         tech       0.95      0.97      0.96        76

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97

In [None]:
import pickle

# Persist the fitted vectorizer and the trained neural network.
# `with` guarantees each file is flushed and closed even if pickling fails.
# pickle.dump returns None, so its result is not bound to a name.
with open("iasria_vect.pickle", 'wb') as vect_file:
    pickle.dump(vectorizer, vect_file)
with open("iasria_model.pickle", 'wb') as model_file:
    pickle.dump(neural, model_file)

In [None]:
import pickle
import re

# Load the persisted TF-IDF vectorizer and classifier.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles produced by a trusted source, never untrusted files.
# `with` closes each file handle as soon as the object has been read.
with open("iasria_vect.pickle", 'rb') as vect_file:
    loaded_vectorizer = pickle.load(vect_file)
with open("iasria_model.pickle", 'rb') as model_file:
    loaded_neural_model = pickle.load(model_file)

# Text clean-up applied to a single input document before vectorization.
def preprocess_data(text):
    """Clean one document and return it wrapped in a list.

    Every non-letter character is replaced by a space, single letters
    surrounded by whitespace are removed, whitespace runs are collapsed to
    one space, and the result is lower-cased.

    Args:
        text: The raw document. Anything that is not a ``str`` is ignored.

    Returns:
        A one-element list holding the cleaned string, or an empty list when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    without_single_letters = re.sub(r'\s[a-zA-Z]\s', ' ', letters_only)
    single_spaced = re.sub(r'\s+', ' ', without_single_letters)
    return [single_spaced.lower()]

# Example document to classify
input_text = "business"

# Clean the text with preprocess_data, then map it into the loaded
# vectorizer's feature space as a dense array
# NOTE(review): preprocess_data returns [] for non-string input, so
# predictions[0] below would presumably fail in that case — confirm
processed_input = preprocess_data(input_text)
input_features = loaded_vectorizer.transform(processed_input).toarray()

# Predict with the loaded model (one prediction per input row)
predictions = loaded_neural_model.predict(input_features)

# Show the predicted category for the single input document
print(f"Predicted Category: {predictions[0]}")



Predicted Category: business
