## imports

In [1]:
import json
import random

import re
import string
import os.path
import timeit
import itertools
from itertools import chain

import pickle
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

from keras.models import Sequential, load_model, Model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Embedding, Flatten, LSTM, Bidirectional, TimeDistributed, Dropout, Activation

%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

from googletrans import Translator
translator = Translator()

Using TensorFlow backend.


## translation

In [2]:
text = "नौसेना के वाइस ऐडमिरल बिमल वर्मा अगले नौसेना प्रमुख के तौर पर अपनी नियुक्ति ना होने को लेकर आर्म्ड फोर्सेस ट्रिब्युनल पहुंच गए हैं।\
        केंद्र ने 23 मार्च को वर्मा से जूनियर रहे वाइस ऐडमिरल करमबीर सिंह को अगले नौसेना प्रमुख के तौर पर नियुक्त किया था जो 31 मई \
        को रिटायर हो रहे ऐडमिरल सुनील लांबा की जगह लेंगे।"

translated = translator.translate(text, src='hi')

In [None]:
dir(translated)

In [3]:
translated.text

'Naval Vice Admiral Bimal Verma has reached Armed Forces Tribunal for not being appointed as the next Naval Chief. On March 23, the center had appointed Vice Admiral Karmibir Singh, Jr. from Verma as the next Navy Chief, who will replace Admiral Sunil Lamba, who retires on May 31.'

In [13]:
def translateByTokens(text, source_lang='hi'):
    text = text.split()
    
    # pair of (src_token, dest_token)
    trans_text = [(w, translator.translate(w, src=source_lang).text) for w in text]
    
    # translated text
    eng_text = " ".join([elem[1] for elem in trans_text])
    
    # can return the cleaned the eng_text if desired
    # eng_text = clean_text(eng_text)
    return (trans_text, eng_text)
    
    
def translate(text, source_lang='hi'):
    eng_text = translator.translate(text, src=source_lang).text
    return eng_text

        

In [11]:
translateByTokens(text)

([('नौसेना', 'Navy'),
  ('के', 'Of'),
  ('वाइस', 'Vice'),
  ('ऐडमिरल', 'Admiral'),
  ('बिमल', 'Bimal'),
  ('वर्मा', 'Verma'),
  ('अगले', 'Next'),
  ('नौसेना', 'Navy'),
  ('प्रमुख', 'Chief'),
  ('के', 'Of'),
  ('तौर', 'Modus operandi'),
  ('पर', 'On'),
  ('अपनी', 'mine'),
  ('नियुक्ति', 'Appointment'),
  ('ना', 'No'),
  ('होने', 'Having'),
  ('को', 'To'),
  ('लेकर', 'By taking'),
  ('आर्म्ड', 'armed'),
  ('फोर्सेस', 'Forces'),
  ('ट्रिब्युनल', 'Tribunal'),
  ('पहुंच', 'the access'),
  ('गए', 'went'),
  ('हैं।', 'Are.'),
  ('केंद्र', 'center'),
  ('ने', 'has'),
  ('23', '23'),
  ('मार्च', 'March'),
  ('को', 'To'),
  ('वर्मा', 'Verma'),
  ('से', 'From'),
  ('जूनियर', 'Junior'),
  ('रहे', 'are'),
  ('वाइस', 'Vice'),
  ('ऐडमिरल', 'Admiral'),
  ('करमबीर', 'Karambir'),
  ('सिंह', 'Lion'),
  ('को', 'To'),
  ('अगले', 'Next'),
  ('नौसेना', 'Navy'),
  ('प्रमुख', 'Chief'),
  ('के', 'Of'),
  ('तौर', 'Modus operandi'),
  ('पर', 'On'),
  ('नियुक्त', 'Appointed'),
  ('किया', 'Did'),
  ('था', 'Was'),
 

In [14]:
translate(text)

'Naval Vice Admiral Bimal Verma has reached Armed Forces Tribunal for not being appointed as the next Naval Chief. On March 23, the center had appointed Vice Admiral Karmibir Singh, Jr. from Verma as the next Navy Chief, who will replace Admiral Sunil Lamba, who retires on May 31.'

## model

In [33]:
# creating model 
ip = Input(shape=(MAX_SENT_LEN,))

model = Embedding(input_dim = VOCAB_SIZE, output_dim = 50, input_length = MAX_SENT_LEN)(ip)
model = Dropout(0.2)(model)

model = Bidirectional(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2))(model)
out = Dense(NUM_CLASSES, activation="softmax")(model)  

model = Model(ip, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])


In [34]:
model = load_model("modelsV1/val_acc-improvement-04-0.9282.hdf5")

## tokenizing

In [35]:
# load pickle

dict_file = open("data-pickle.txt","rb")
dictionary = pickle.load(dict_file)
dict_file.close()

dictionary.keys()

dict_keys(['word2idx', 'class2idx', 'VOCAB_SIZE', 'NUM_CLASSES', 'MAX_SENT_LEN'])

In [18]:
word2idx = dictionary['word2idx']
class2idx = dictionary['class2idx']
MAX_SENT_LEN = dictionary['MAX_SENT_LEN']
VOCAB_SIZE = dictionary['VOCAB_SIZE']
NUM_CLASSES = dictionary['NUM_CLASSES']

classes = ['automobile', 'business', 'entertainment', 'politics', 'science', 'sports', 'technology']

In [42]:
# removing stop words and punctiuation
# converting to set to improve search
STOPWORDS = set(stopwords.words('english')) 

def clean_text(text):
    
    #split based on everything except a-z0-9_'.-
    
    # remove .
    text = re.sub("[.]", "", text)
    tokens = re.findall("[a-z0-9_'.\-]+", text.lower())
    tokens = [w for w in tokens if not w in STOPWORDS and len(w) > 2 and len(w)<20]
    text = " ".join(tokens)
    
    return text


def articleToVec(article):
    article_vec = []
    
    for w in clean_text(article).split():    
        try:
            article_vec.append(word2idx[w])
        
        except KeyError as e:
            article_vec.append(word2idx["UNK"])
    
    # padding article_vec
    # put [article_vec] as pad sequences needs list of list 
    article_vec = pad_sequences([article_vec], maxlen = MAX_SENT_LEN, padding = "post", value = word2idx['ENDPAD'], truncating = "post")
    
    # coz pad_sequences will return list of list
    return np.array(article_vec[0])


def make_prediction(article="", true_category="", needTranslation=False):
    if not article:
        article = input("Enter article:")
        true_category = input("\nEnter true category:")
    
    if needTranslation:
        article = translate(article)
        
    article_vec = articleToVec(article)
    pred_score = model.predict([[article_vec]])
    
    print("\n***Article Snippet: \n", article[:500], )
    print("\n-->Scores:", pred_score, pred_score.argmax(axis=1))
    print("\n-->Predicted Category:", classes[pred_score.argmax(axis=1)[0]])
    print("-->Actual Category:", true_category)


In [19]:
word2idx["UNK"]

157875

In [23]:
article = 'lok sabha elections 2019- limit lies tejashwi yadav tweet attack modi bihar rallies prime minister modi address two back-to-back rallies gaya jamui witnessed several violent maoist incidents recent past tuesday. bihar chief minister janata dal united chief nitish kumar also campaign lok sabha election gaya prime minister. lok sabha elections updated apr 2019 ist correspondent hindustan times new delhi rashtriya janata dal leader tejashwi yadav attacked prime minister narendra modi ahead lok sabha election rallies bihar gaya jamui pulling promises made state last general polls. file photo rashtriya janata dal leader tejashwi yadav attacked prime minister narendra modi ahead lok sabha election rallies bihar gaya jamui pulling promises made state last general polls. prime minister modi address two back-to-back rallies gaya jamui witnessed several violent maoist incidents recent past tuesday. bihar chief minister janata dal united chief nitish kumar also campaign lok sabha election gaya prime minister. modi coming gaya again. prime minister narendramodi listen speech gave last time gaya carefully. sure embarrassed. one exaggerate much limit lies prime minister tejashwi yadav tweeted hindi tuesday. dear narendramodi reminding promised bihar bihari 10-03-2014 purnea. promised give bihar- special status special package special attention visiting bihar befool bihari first see mirror answer thyself wrote. bihar opposition alliance announced seat-sharing deal lok sabha elections rjd senior partner contest state lok sabha seats. congress party fight nine arrangement. former chief minister jitan ram manjhi contest gaya. bjp fielded candidate either gaya jamui constituencies vote april 11. bjp ally fielded vijay kumar manjhi gaya lok janshakti party ljp chirag paswan contest jamui. polling eastern state held seven phases lok sabha elections scheduled begin april end may 19. results declared may 23. first published apr 2019 ist tags narendra modi lok sabha election tejashwi yadav'
article

'lok sabha elections 2019- limit lies tejashwi yadav tweet attack modi bihar rallies prime minister modi address two back-to-back rallies gaya jamui witnessed several violent maoist incidents recent past tuesday. bihar chief minister janata dal united chief nitish kumar also campaign lok sabha election gaya prime minister. lok sabha elections updated apr 2019 ist correspondent hindustan times new delhi rashtriya janata dal leader tejashwi yadav attacked prime minister narendra modi ahead lok sabha election rallies bihar gaya jamui pulling promises made state last general polls. file photo rashtriya janata dal leader tejashwi yadav attacked prime minister narendra modi ahead lok sabha election rallies bihar gaya jamui pulling promises made state last general polls. prime minister modi address two back-to-back rallies gaya jamui witnessed several violent maoist incidents recent past tuesday. bihar chief minister janata dal united chief nitish kumar also campaign lok sabha election gaya p

In [24]:
articleToVec(article)

array([154071,  84135,  15537,  43376,  83430, 152589,  12512,  72245,
        96837, 114205,  22241,  73962,  78774, 153024,   4708,  22241,
       117410,   9271,  68140,  78774, 106745,  58921,  54856,   1806,
        71611, 115310,  66516,  70176,  56770, 113218,  73962,  32319,
         4708,    359, 102343,   3314,  32319,   2781, 112564,  84198,
        92741, 154071,  84135,  11651, 106745, 153024,   4708, 154071,
        84135,  15537, 134369, 130331,  52456,  99235, 105215,   1517,
        12336, 122739, 155841, 102968,    359, 102343, 105030,  12512,
        72245,   4019, 153024,   4708,  11272,  22241, 112386, 154071,
        84135,  11651,  78774,  73962, 106745,  58921,  92456, 125199,
       114604, 108341, 105131, 132678, 103872, 100090, 113053, 102968,
          359, 102343, 105030,  12512,  72245,   4019, 153024,   4708,
        11272,  22241, 112386, 154071,  84135,  11651,  78774,  73962,
       106745,  58921,  92456, 125199, 114604, 108341, 105131, 132678,
      

## testing

In [27]:
make_prediction(article, true_category="politics")

**Article Snippet: 
 lok sabha elections 2019- limit lies tejashwi yadav tweet attack modi bihar rallies prime minister modi address two back-to-back rallies gaya jamui witnessed several violent maoist incidents recent pa

-->Scores: [[3.3693539e-04 3.4328463e-04 3.4190243e-04 9.9885726e-01 2.0349758e-05
  8.8855530e-05 1.1389245e-05]] [3]

-->Predicted Category: politics
-->Actual Category: politics


In [41]:
make_prediction()

Enter article:KXIP captain Ravichandran Ashwin has said he is the best spinner in IPL. "It's my 11th season of IPL, so I'd like to think I have done a fairly good job. I have never shied away from competing with anyone. I am right on top of the pile," added Ashwin, who has taken 14 wickets in 12 IPL 2019 matches.

Enter true category:sports

***Article Snippet: 
 KXIP captain Ravichandran Ashwin has said he is the best spinner in IPL. "It's my 11th season of IPL, so I'd like to think I have done a fairly good job. I have never shied away from competing with anyone. I am right on top of the pile," added Ashwin, who has taken 14 wickets in 12 IPL 2019 matches.

-->Scores: [[4.8535994e-06 2.7143302e-05 5.9570652e-04 7.1943272e-05 1.0966123e-03
  9.9818319e-01 2.0410595e-05]] [5]

-->Predicted Category: sports
-->Actual Category: sports


In [43]:
make_prediction(needTranslation=True)

Enter article:चुनाव आयोग ने उत्तर प्रदेश के मुख्यमंत्री योगी आदित्यनाथ द्वारा संभल लोकसभा सीट से सपा प्रत्याशी शफीकुर रहमान बर्क को 'बाबर की औलाद' कहने पर उनके खिलाफ नोटिस जारी किया है। आयोग ने उन्हें जवाब देने के लिए 24 घंटे का समय दिया है। योगी ने शफीकुर को लेकर कहा था, "बाबर की औलाद को देश सौंपना चाहते हो क्या?"

Enter true category:politics

***Article Snippet: 
 The Election Commission has issued notice to the Shafiqur Rehman Burke, the SP candidate from the Sambhal Lok Sabha constituency, Uttar Pradesh's Yogi Adityanath, against him when he called Babar's childhood. The Commission has given 24 hours to answer them. The Yogi had said about Shafiqur, "Do you want to hand over Babar's blessing to the nation?"

-->Scores: [[5.2128395e-04 1.1115491e-03 1.9068865e-03 9.9594158e-01 5.7914742e-05
  4.1967226e-04 4.1089213e-05]] [3]

-->Predicted Category: politics
-->Actual Category: politics
