In [5]:
#we compare two classifiers Multinomial NB and Logistic Regression
#first we import all the necessary packages
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

n1 = LabelEncoder()
df = pd.read_csv('bbc-text.csv')
#print(df)

In [6]:
#we take text column in X which we will use to predict the category
#we take category column in Y which we will use to train our model as output using X
X = df['text']
X = np.array(X)
Y = df['category']
Y = np.array(Y)
#we vectorize text column using tfidf vectorizer.
#tfidf vectorizer works in two parts-tf and idf.
#tf is equal to number of occurences of the word in the document divided by total number of words present in that document
#idf is equal to log value of number of documents in which that word is present divided by total number of documents
#to read in detail about tfidf, kindly visit sklearn documentation pages
tfidf = TfidfVectorizer(lowercase=True, analyzer='word', stop_words='english', ngram_range=(1,3), use_idf=True)
X1 = tfidf.fit_transform(X)
n1 = LabelEncoder()
#we vectorize category column using labelencoder
#this will give a unique numerical value to every category
Y1 = n1.fit_transform(Y)
#we separate train and test datasets.
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=0)
#print(X_train)
#print(Y_train)

In [7]:
#implementing multinomial NB
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)
#print(mnb)
test = mnb.predict(X_test)
acc = 0
l = np.size(test, 0)
for i in range(l):
    acc = acc + abs(test[i]-Y_test[i])
print("Accuracy using Multinomial NB is: %f"%((1-acc/l)*100))

Accuracy using Multinomial NB is: 93.862275


In [8]:
#implementing Logistic Regression
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
lr.fit(X_train, Y_train)
#print(mnb)
test = lr.predict(X_test)
acc = 0
l = np.size(test, 0)
for i in range(l):
    acc = acc + abs(test[i]-Y_test[i])
print("Accuracy using Logistic Regression is: %f"%((1-acc/l)*100))

KeyboardInterrupt: 

In [13]:
import pickle

save_label = open("label_encoder.pickle", "wb")
pickle.dump(n1, save_label)
save_label.close()

save_label = open("label_encoder.pickle", "rb")
n1 = pickle.load(save_label)
save_label.close()

In [None]:
import pickle

save_tdidf = open("tdidf.pkl", "wb")
pickle.dump(tfidf, save_tdidf)
save_tdidf.close()

save_tdidf = open("tdidf.pkl", "rb")
tdidf = pickle.load(save_tdidf)
save_tdidf.close()

In [6]:
#now we will implement pickle
#we are using pickle to save our models
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(mnb, save_classifier)
save_classifier.close()

save_classifier = open("logistic.pickle","wb")
pickle.dump(lr, save_classifier)
save_classifier.close()
#here we saved both our models in naivebayes.pickle and logistic.pickle

In [2]:
#here we load naivebayes.pickle and logistic.pickle and use them
import pickle

classifier_f = open("naivebayes.pickle", "rb")
clf1 = pickle.load(classifier_f)
classifier_f.close()

classifier_f = open("logistic.pickle", "rb")
clf2 = pickle.load(classifier_f)
classifier_f.close()

In [12]:
import numpy as np

t = "Inmates in a Sri Lanka prison scuffled with guards and some tried to escape during a protest against new strict measures to control the spread of the coronavirus, leaving two dead and six wounded, police said Sunday."
t = np.array(t).reshape(-1,1)
t = tfidf.transform(t[0])
#print(t)
print(n1.inverse_transform(clf1.predict(t))[0])
print(n1.inverse_transform(clf2.predict(t))[0])

business
business
