In [1]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 67.8 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.1 tf-estimator-nightly-2.8.0.dev2021122109


In [43]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [44]:
df = pd.read_csv('english-train.csv')
df.head()

Unnamed: 0,headline,Label
0,Former New Zealand cricketer Chris Cairns diag...,Sports
1,American skater Nathan Chen dazzles in his Oly...,Sports
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports
4,ICC U-19 World Cup India vs England final Live...,Sports


In [45]:
def count_length():
    df['word_count'] = df['headline'].apply(lambda x: len(str(x).split(" ")))

In [46]:
count_length()

In [47]:
df.head(10)

Unnamed: 0,headline,Label,word_count
0,Former New Zealand cricketer Chris Cairns diag...,Sports,10
1,American skater Nathan Chen dazzles in his Oly...,Sports,9
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports,13
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports,10
4,ICC U-19 World Cup India vs England final Live...,Sports,15
5,Brainsqueeze: Know your Snow,Sports,4
6,Justin Langer steps down as Australia coach,Sports,7
7,From Ratnakar Shetty’s memoirs: Apparently Vir...,Sports,16
8,I think ODI revolution happened with 1996 Worl...,Sports,15
9,U-19 World Cup: Boxer’s son Nishant Sindhu wit...,Sports,11


In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [48]:
df['label_id'] = df.Label.factorize()[0]
from nltk.corpus import stopwords
import nltk
import re
def preprocess_text(sen,flg_lemm=True, lst_stopwords=stopwords.words('english')):
    # Lowercase
    sentence = sen.lower()
    
    # Remove punctuations and numbers
    sentence = re.sub('[0-9]', ' ', sentence)
    sentence = re.sub(r'[^\w\s]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\b[a-zA-Z]\b", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    lst_text = sentence.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)

    return text
 

In [49]:
df['headline'] = df.headline.apply(lambda x: preprocess_text(x))
count_length()

In [50]:
df.head(10)

Unnamed: 0,headline,Label,word_count,label_id
0,former new zealand cricketer chris cairn diagn...,Sports,9,0
1,american skater nathan chen dazzle olympic return,Sports,7,0
2,la liga ene unal score brace lead getafe win l...,Sports,10,0
3,world cup australia beat afghanistan claim rd ...,Sports,8,0
4,icc world cup india v england final live strea...,Sports,10,0
5,brainsqueeze know snow,Sports,3,0
6,justin langer step australia coach,Sports,5,0
7,ratnakar shetty memoir apparently virat unhapp...,Sports,9,0
8,think odi revolution happened world cup tendul...,Sports,10,0
9,world cup boxer son nishant sindhu deadly left,Sports,8,0


In [51]:
num_classes = len(df["label_id"].value_counts())

In [52]:
num_classes

5

In [53]:
y = tf.keras.utils.to_categorical(df["label_id"].values, num_classes=num_classes)

X_train, X_test, Y_train, Y_test = train_test_split(df['headline'], y, test_size=0.25, random_state=42)

In [54]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2")
bert_encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1")

In [55]:
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
x = bert_preprocess(i)
x = bert_encoder(x)
x = tf.keras.layers.Dropout(0.2, name="dropout")(x['pooled_output'])
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

model = tf.keras.Model(i, x)

In [56]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [57]:
model.fit(X_train, Y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f00d8553110>

In [58]:
accr = model.evaluate(X_test, Y_test)



In [59]:
print('Accuracy: {:0.2f}'.format(accr[1]))

Accuracy: 0.89


In [60]:
model.save('EN_BERT.h5')

In [61]:
test_classes = []
for i in range(len(Y_test)):
    for j in range(len(Y_test[i])):
        if Y_test[i][j] == 1:
            if j==0:
                test_classes.append(0)
            if j==1:
                test_classes.append(1)
            if j==2:
                test_classes.append(2)
            if j==3:
                test_classes.append(3)
            if j==4:
                test_classes.append(4)

In [62]:
predicted_categories = []
preds = model.predict(X_test)
for i in preds:
    pred_classes = np.argsort(i)[-1:][::-1]
    predicted_categories.append(pred_classes[0])

In [63]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
report = classification_report(test_classes, predicted_categories)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      1834
           1       0.93      0.88      0.90      1791
           2       0.88      0.87      0.87      1946
           3       0.87      0.90      0.89      1978
           4       0.89      0.90      0.89      1800

    accuracy                           0.89      9349
   macro avg       0.89      0.89      0.89      9349
weighted avg       0.89      0.89      0.89      9349



In [64]:
mat = confusion_matrix(test_classes, predicted_categories)
mat

array([[1603,   13,   31,   83,  104],
       [  36, 1583,  115,   27,   30],
       [  47,   89, 1698,   95,   17],
       [  53,    7,   76, 1788,   54],
       [  92,   17,   18,   61, 1612]])

In [65]:
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

In [66]:
my_input = ["BJP Leader Beats Manohar Parrikar's Son In Panaji. Why He Isn't Happy"]
preds = model.predict(my_input)[0]
pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

for c, p in zip(classes, props):
    print("{} {:.2f} %".format(c,p*100))

politics 98.53 %


In [67]:
my_input2 =["Big hit on India: ‘Higher oil, food prices; duty rollback may help"]


preds = model.predict(my_input2)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result2 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result2[c] = round(p*100,2)
result2

{'business': 99.54}

In [68]:
my_input3 =["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]

preds = model.predict(my_input3)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result3 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result3[c] = round(p*100,2)
result3

{'entertainment': 99.61}

In [69]:
my_input4 =["Apple is expected to launch the iPhone SE 3 later this year."]

preds = model.predict(my_input4)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result4 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result4[c] = round(p*100,2)
result4

{'tech': 100.0}

In [73]:
my_input5 =["2nd Test: South Africa 278/5 on day one after Bangladesh's late strikes"]

preds = model.predict(my_input5)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result5 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result5[c] = round(p*100,2)
result5

{'sports': 99.94}