In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-processed article dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="data_processed.csv")

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,ArticleId,Text,Category,Text_processed,Category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex bos launch defence lawyer defendin...,0
1,154,german business confidence slides german busin...,business,german business confidence slide german busine...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizen majo...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,4
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss payout eighteen former enron direct...,0


In [4]:
# TF-IDF over unigrams and bigrams, capped at 1000 features; max_df=0.5 sets
# the document-frequency ceiling for terms kept in the vocabulary.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.5,
    max_features=1000,
)

In [5]:
# Fit the vectorizer on the processed text and densify the result.
# NOTE(review): the vectorizer is fit on every row of `df`, and the train/test
# split happens afterwards, so held-out documents contribute to the vocabulary
# and IDF weights. Consider splitting first and fitting on the training rows.
tfidf_sparse = tfidf.fit_transform(df["Text_processed"])
X_trn = tfidf_sparse.toarray()

In [6]:
# Shape of the dense TF-IDF matrix: (documents, features).
X_trn.shape

(1490, 1000)

In [7]:
# List the distinct category labels in order of first appearance.
pd.unique(df["Category"])

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [8]:
# Encode the string labels in the 'Category' column as integer ids.
# The fitted encoder is kept because it is saved later and used to map
# predicted ids back to category names.
# `preprocessing` is imported with the other imports at the top of the notebook.
label_encoder = preprocessing.LabelEncoder()

# This overwrites the 'Category_id' column that the CSV already provides.
df['Category_id'] = label_encoder.fit_transform(df['Category'])

df['Category_id'].unique()

array([0, 4, 2, 3, 1])

In [9]:
# Sanity check: map an encoded id back to its category name.
label_encoder.inverse_transform([4])

array(['tech'], dtype=object)

In [10]:
# Preview the frame again now that 'Category_id' has been (re)assigned.
df.head(n=5)

Unnamed: 0,ArticleId,Text,Category,Text_processed,Category_id
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex bos launch defence lawyer defendin...,0
1,154,german business confidence slides german busin...,business,german business confidence slide german busine...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizen majo...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...,4
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss payout eighteen former enron direct...,0


In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_trn,
    df["Category_id"],
    random_state=42,
    test_size=0.20,
)

In [12]:
# Report the shape of each split, features first and then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(1192, 1000)
(298, 1000)
(1192,)
(298,)


In [13]:
# L2-regularised logistic regression solved with newton-cg.
# NOTE(review): the value of C looks like the result of a hyper-parameter
# search — confirm where it came from.
lr_model = LogisticRegression(
    penalty='l2',
    solver='newton-cg',
    C=7.742636826811269,
)

In [14]:
# Train the classifier on the training split.
lr_model.fit(X=X_train, y=y_train)

In [15]:
# Predict category ids for the held-out rows.
y_pred = lr_model.predict(X=X_test)

In [16]:
# Fraction of held-out rows whose predicted id matches the true id.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [17]:
# Print the held-out accuracy.
print(f' Test Accuracy: {test_accuracy} ')

 Test Accuracy: 0.9731543624161074 


In [18]:
# Per-class precision, recall and F1 on the held-out rows (classes are shown
# by encoded id).
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        75
           1       0.98      1.00      0.99        46
           2       0.95      0.95      0.95        56
           3       0.97      1.00      0.98        63
           4       1.00      0.93      0.96        58

    accuracy                           0.97       298
   macro avg       0.97      0.97      0.97       298
weighted avg       0.97      0.97      0.97       298



In [19]:
confusion_matrix = confusion_matrix(y_test, y_pred)

In [20]:
confusion_matrix

array([[74,  0,  1,  0,  0],
       [ 0, 46,  0,  0,  0],
       [ 2,  0, 53,  1,  0],
       [ 0,  0,  0, 63,  0],
       [ 0,  1,  2,  1, 54]], dtype=int64)

In [21]:
pd.DataFrame(
    confusion_matrix,
    index = [['actual', 'actual', 'actual', 'actual', 'actual'], ['business', 'entertainment', 'politics', 'sport', 'tech' ]],
    columns = [['predicted', 'predicted', 'predicted', 'predicted', 'predicted'],['business', 'entertainment', 'politics', 'sport', 'tech' ]])

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted,predicted,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,business,entertainment,politics,sport,tech
actual,business,74,0,1,0,0
actual,entertainment,0,46,0,0,0
actual,politics,2,0,53,1,0
actual,sport,0,0,0,63,0
actual,tech,0,1,2,1,54


In [22]:
# Save the fitted TF-IDF vectorizer. The context manager closes the file
# handle even if serialization raises.
with open("tfidf.pkl", "wb") as file:
    joblib.dump(tfidf, file)


In [23]:
# Save the trained logistic-regression model. The context manager closes the
# file handle even if serialization raises.
with open("lr_model.pkl", "wb") as model_file:
    joblib.dump(lr_model, model_file)
 

In [24]:
# Save the fitted label encoder. The context manager closes the file handle
# even if serialization raises.
with open("label_encoder.pkl", "wb") as en_file:
    joblib.dump(label_encoder, en_file)

# Test Model

In [32]:
# One raw news snippet, wrapped in a list, used to smoke-test the saved artifacts.
# NOTE(review): the vectorizer was fit on the `Text_processed` column, while this
# text is passed to it unmodified — confirm whether the same preprocessing
# should be applied first.
test_case = ["Vijay Deverakonda, who is busy promoting his upcoming film, ‘Liger’, recently revealed that he wears a burqa when he goes to watch his films in theatres to remain anonymous at the venue.In his interaction with a news portal, the actor narrated an incident that happened in 2019 when he went to watch his Telugu movie, ‘Dear Comrade’. The actor revealed that he wears a burqa and goes into theatres and sits amongst the audience and watches his films. He also jokingly added that if you see a burqa-clad tall person around you, it could probably be him."]

In [33]:
# Reload the three saved artifacts in the order vectorizer, model, encoder.
# joblib.load unpickles its input, so only load files from a trusted source.
tf, model, encode = (
    joblib.load(artifact_path)
    for artifact_path in ("tfidf.pkl", "lr_model.pkl", "label_encoder.pkl")
)

In [34]:
# Display the reloaded vectorizer.
tf

In [36]:
# Vectorize the sample with the reloaded (already fitted) vectorizer.
text = tf.transform(raw_documents=test_case)

In [37]:
# Display the vectorized sample.
text

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [38]:
# Predict the encoded category id for the sample.
pred = model.predict(X=text)

In [39]:
# Display the predicted id array.
pred

array([1])

In [41]:
# Map the predicted id back to its category name with the reloaded encoder.
category = encode.inverse_transform(y=pred)

In [43]:
# Category name for the single sample.
category[0]

'entertainment'

In [61]:
# Per-class probabilities; take row 0 because only one sample was passed.
class_probabilities = model.predict_proba(text)
predict_prob = class_probabilities[0]

In [62]:
# Display the probability vector for the sample (one entry per class).
predict_prob

array([0.02245579, 0.81992861, 0.04190416, 0.03756432, 0.07814712])

In [63]:
# Highest class probability as a whole-number percentage.
round(np.max(predict_prob) * 100)

82