In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [2]:
df = pd.read_csv('english-train.csv')
df.head()

Unnamed: 0,headline,Label
0,Former New Zealand cricketer Chris Cairns diag...,Sports
1,American skater Nathan Chen dazzles in his Oly...,Sports
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports
4,ICC U-19 World Cup India vs England final Live...,Sports


In [3]:
def count_length():
    df['word_count'] = df['headline'].apply(lambda x: len(str(x).split(" ")))

In [4]:
count_length()

In [5]:
df.head(10)

Unnamed: 0,headline,Label,word_count
0,Former New Zealand cricketer Chris Cairns diag...,Sports,10
1,American skater Nathan Chen dazzles in his Oly...,Sports,9
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports,13
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports,10
4,ICC U-19 World Cup India vs England final Live...,Sports,15
5,Brainsqueeze: Know your Snow,Sports,4
6,Justin Langer steps down as Australia coach,Sports,7
7,From Ratnakar Shetty’s memoirs: Apparently Vir...,Sports,16
8,I think ODI revolution happened with 1996 Worl...,Sports,15
9,U-19 World Cup: Boxer’s son Nishant Sindhu wit...,Sports,11


In [6]:
df['label_id'] = df.Label.factorize()[0]
from nltk.corpus import stopwords
import nltk
import re
def preprocess_text(sen,flg_lemm=True, lst_stopwords=stopwords.words('english')):
    # Lowercase
    sentence = sen.lower()
    
    # Remove punctuations and numbers
    sentence = re.sub('[0-9]', ' ', sentence)
    sentence = re.sub(r'[^\w\s]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\b[a-zA-Z]\b", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    lst_text = sentence.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)

    return text
 

In [7]:
df['headline'] = df.headline.apply(lambda x: preprocess_text(x))
count_length()

In [8]:
df.head(10)

Unnamed: 0,headline,Label,word_count,label_id
0,former new zealand cricketer chris cairn diagn...,Sports,9,0
1,american skater nathan chen dazzle olympic return,Sports,7,0
2,la liga ene unal score brace lead getafe win l...,Sports,10,0
3,world cup australia beat afghanistan claim rd ...,Sports,8,0
4,icc world cup india v england final live strea...,Sports,10,0
5,brainsqueeze know snow,Sports,3,0
6,justin langer step australia coach,Sports,5,0
7,ratnakar shetty memoir apparently virat unhapp...,Sports,9,0
8,think odi revolution happened world cup tendul...,Sports,10,0
9,world cup boxer son nishant sindhu deadly left,Sports,8,0


In [9]:
X = df['headline'].values
Y = df['label_id']
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

In [10]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20)
max_words = 10000
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(7,name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model
model = RNN()

In [11]:
test_classes = []
for i in range(len(Y_test)):
    for j in range(len(Y_test[i])):
        test_classes.append(Y_test[i][j])

In [12]:
model.compile(loss='sparse_categorical_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
history = model.fit(sequences_matrix,Y_train,batch_size=256,epochs=10,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Accuracy: {:0.5f}'.format(accr[1]))

Accuracy: 0.93636


In [14]:
model.save('eng_lstm.h5')

In [15]:
predicted_categories = []
preds = model.predict(test_sequences_matrix)
for i in preds:
    pred_classes = np.argsort(i)[-1:][::-1]
    predicted_categories.append(pred_classes[0])

In [16]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
report = classification_report(test_classes, predicted_categories)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1471
           1       0.97      0.87      0.92      1453
           2       0.92      0.94      0.93      1569
           3       0.96      0.94      0.95      1516
           4       0.94      0.96      0.95      1470

    accuracy                           0.94      7479
   macro avg       0.94      0.94      0.94      7479
weighted avg       0.94      0.94      0.94      7479



In [17]:
mat = confusion_matrix(test_classes, predicted_categories)
mat

array([[1425,    2,    7,    9,   28],
       [  42, 1265,   86,   15,   45],
       [  42,   28, 1477,   19,    3],
       [  37,    3,   37, 1419,   20],
       [  37,    2,    3,   11, 1417]], dtype=int64)

In [18]:
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

In [19]:
my_input = ["BJP Leader Beats Manohar Parrikar's Son In Panaji. Why He Isn't Happy"]

txts = tok.texts_to_sequences(my_input)
txts = sequence.pad_sequences(txts, maxlen=max_len)
preds = model.predict(txts)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

for c, p in zip(classes, props):
    print("{} {:.2f} %".format(c,p*100))

politics 100.00 %


In [21]:
my_input2 =["Big hit on India: ‘Higher oil, food prices; duty rollback may help"]

txts = tok.texts_to_sequences(my_input2)
txts = sequence.pad_sequences(txts, maxlen=max_len)
preds = model.predict(txts)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result2 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result2[c] = round(p*100,2)
result2

{'business': 99.22}

In [22]:
my_input3 =["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]

txts = tok.texts_to_sequences(my_input3)
txts = sequence.pad_sequences(txts, maxlen=max_len)
preds = model.predict(txts)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result3 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result3[c] = round(p*100,2)
result3

{'entertainment': 100.0}

In [23]:
my_input4 =["Apple is expected to launch the iPhone SE 3 later this year."]

txts = tok.texts_to_sequences(my_input4)
txts = sequence.pad_sequences(txts, maxlen=max_len)
preds = model.predict(txts)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result4 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result4[c] = round(p*100,2)
result4

{'tech': 99.97}

In [25]:
my_input5 =["Strandja Memorial Boxing: Nandini ends with bronze after semifinal loss"]

txts = tok.texts_to_sequences(my_input5)
txts = sequence.pad_sequences(txts, maxlen=max_len)
preds = model.predict(txts)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result5 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result5[c] = round(p*100,2)
result5

{'sports': 100.0}