In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import os
import itertools
import matplotlib.pyplot as plt
import pickle
import pdb

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import accuracy_score

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

import model as m

from googletrans import Translator
translator = Translator()



############# Parameters
# Directory and base name (no extension) of the GloVe file. The loading code
# appends '.txt' for the raw file and '_w2v.txt' for the converted copy.
path2embeddings = '../pretrained_embeds/glove.6B/'
embedfile = 'glove.6B.50d'

# Data directory; the category-to-id pickle is read from here.
path2data = '../data/'
# Not referenced by the code visible in this notebook section.
datafile = 'news_articles.pkl'

# Pickled dict mapping category -> id, stored under path2data.
cat2id_file = 'category2id.pkl'

# Location of the saved state dict that is loaded into the classifier.
model_saving_path = '../attention/'
model_saving_file = 'attention_model.pt'

# Articles are cut to this many word embeddings before being fed to the model.
article_max_len = 600
# Number of highest-weighted words reported as keywords.
top_k_keywords = 10

# Constructor arguments of m.atten_classifier.
# NOTE(review): embed_size presumably has to match the dimensionality of
# `embedfile` (50d) and n_classes the number of entries in the category
# pickle -- confirm against model.py and the data.
embed_size = 50
hidden_dim = 100
n_classes = 7



############# Loading Pretrained Glove Embeddings
# The vectors are loaded from the word2vec-format copy ('_w2v.txt'). That copy
# is produced from the raw GloVe '.txt' file only when it does not exist yet,
# so later runs skip the conversion.
if not os.path.isfile(path2embeddings + embedfile + '_w2v.txt'):
    glove2word2vec(
        glove_input_file=path2embeddings + embedfile + '.txt',
        word2vec_output_file=path2embeddings + embedfile + '_w2v.txt',
    )
glove_model = KeyedVectors.load_word2vec_format(path2embeddings + embedfile + '_w2v.txt', binary=False)

def get_embed(word):
    """Return the pretrained embedding vector for ``word``.

    The word is lower-cased before the lookup. If it is not in the
    vocabulary of ``glove_model``, the vector stored under ``'<unk>'`` is
    returned instead.

    Args:
        word: Token to look up (a string).

    Returns:
        The result of ``glove_model.get_vector`` for the lower-cased word, or
        for ``'<unk>'`` when the word is out of vocabulary.

    Raises:
        KeyError: If neither the word nor ``'<unk>'`` is in the vocabulary.
    """
    # Case folding
    word = word.lower()
    try:
        return glove_model.get_vector(word)
    except KeyError:
        # Only a vocabulary miss triggers the fallback; any other failure
        # (or a kernel interrupt) propagates instead of being silently
        # turned into the unknown-word vector.
        # NOTE(review): confirm the loaded embedding file really contains a
        # '<unk>' entry, otherwise this lookup raises KeyError as well.
        return glove_model.get_vector('<unk>')
    
    

############## Categories to its id
# The mapping is required by everything below, so the file is opened
# unconditionally: a missing pickle now fails right here with a
# FileNotFoundError naming the path, instead of a NameError on `category2id`
# further down.
# NOTE: pickle.load can execute arbitrary code -- only load a category file
# that this project produced itself.
with open(path2data + cat2id_file, 'rb') as handle:
    category2id = pickle.load(handle)
# inverse the dict
id2category = {v: k for k, v in category2id.items()}
        
        
        
############ Loading the model
# Using gpu if available else cpu
if torch.cuda.is_available():
    # Keep using the second GPU when there is one, but fall back to the first
    # GPU on single-GPU machines, where 'cuda:1' is an invalid device.
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

model = m.atten_classifier(embed_size, hidden_dim, n_classes)
model = model.to(device)

# map_location remaps the stored tensors onto `device`, so the same call works
# on CPU-only machines and on GPUs other than the one the weights were saved
# from.
model.load_state_dict(torch.load(model_saving_path + model_saving_file, map_location=device))

# The notebook only runs inference; eval mode disables training-only behaviour
# such as dropout (harmless if the model has none).
model.eval()

In [48]:
def translateByTokens(text, source_lang='hi'):
    """Translate ``text`` one token at a time with the module-level translator.

    Args:
        text: An iterable of tokens, or a single string, which is split on
            whitespace first.
        source_lang: Language code passed as ``src`` to
            ``translator.translate``.

    Returns:
        A tuple ``(trans_text, eng_text)``. ``trans_text`` is a list of
        ``(src_token, dest_token)`` pairs for the tokens that could be
        translated; ``eng_text`` is the translated tokens joined by single
        spaces.
    """
    # A plain string would otherwise be iterated character by character.
    if isinstance(text, str):
        text = text.split()

    # pair of (src_token, dest_token)
    trans_text = []
    for w in text:
        try:
            trans_text.append((w, translator.translate(w, src=source_lang).text))
        except Exception:
            # Best effort: a token the service cannot translate is skipped.
            # Catching Exception rather than everything keeps the loop
            # interruptible (KeyboardInterrupt is not swallowed).
            continue

    # translated text
    eng_text = " ".join(dest for _, dest in trans_text)
    return (trans_text, eng_text)

def translate(text, source_lang='hi'):
    """Translate ``text`` as a whole with the module-level translator.

    ``source_lang`` is passed as ``src`` to ``translator.translate``; the
    ``text`` attribute of the result is returned.
    """
    return translator.translate(text, src=source_lang).text

In [77]:
def make_pred(article, is_hindi = True):
    """Predict the news domain of ``article`` and extract its keywords.

    The article is lower-cased, tokenised, stripped of English stop words and
    punctuation, cut to ``article_max_len`` words, embedded with
    ``get_embed`` and passed through ``model``. Keywords are the words with
    the highest attention weights.

    Args:
        article: Article text. When ``is_hindi`` is true it is first
            translated with ``translate``.
        is_hindi: Whether ``article`` is Hindi. If so, the extracted keywords
            are translated back to Hindi and filtered.

    Returns:
        ``(domain, extracted_keywords)``: the category looked up in
        ``id2category`` for the arg-max of the model output, and up to
        ``top_k_keywords`` keywords ordered by decreasing weight (for Hindi,
        the translated words that survive stop-word and length filtering).

    Raises:
        ValueError: If no words are left after stop-word removal.
    """
    if is_hindi:
        # Translate the text that was passed in, not a notebook-level global.
        article = translate(article)

    stop = set(stopwords.words('english')) | set(string.punctuation)
    tokens = nltk.word_tokenize(article.lower())
    # Truncate the words themselves so they stay aligned with the model input
    # (and with the attention weights) for articles over the length limit.
    article_words = [tok for tok in tokens if tok not in stop][:article_max_len]
    if not article_words:
        raise ValueError('article contains no words after stop-word removal')

    embedded = np.array([get_embed(word) for word in article_words])
    article_inp = torch.from_numpy(embedded).to(device)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        out, alphas = model(article_inp)

    ###### Getting the domain of news article
    domain = id2category[int(torch.argmax(out).item())]

    ####### Getting keywords with highest alpha weights
    # assumes `alphas` flattens to one weight per input word -- TODO confirm
    # against model.atten_classifier
    alpha_weights = alphas.detach().cpu().numpy().reshape(-1).tolist()
    word2weights = {}
    for word, weight in zip(article_words, alpha_weights):
        # A repeated word is ranked by its strongest occurrence rather than by
        # whichever occurrence happens to come last.
        if word not in word2weights or weight > word2weights[word]:
            word2weights[word] = weight
    ranked = sorted(word2weights.items(), key=lambda kv: kv[1], reverse=True)
    # Slicing copes with articles that have fewer unique words than requested.
    extracted_keywords = [word for word, _ in ranked[:top_k_keywords]]

    if is_hindi:
        # NOTE(review): this requires a 'hindi' word list in the local NLTK
        # stopwords corpus -- confirm it is installed.
        hindistop = set(stopwords.words('hindi')) | set(string.punctuation)
        hindi_keywords = []
        for keyword in extracted_keywords:
            hindi_text = translator.translate(keyword, src='en', dest = 'hi').text
            # One English keyword may translate to several Hindi words.
            hindi_keywords.extend(hindi_text.split())
        extracted_keywords = [
            word for word in hindi_keywords
            if word not in hindistop and len(word) > 2
        ]

    return domain, extracted_keywords

In [82]:
# Set to True when the pasted article is in Hindi.
is_hindi = False
inp_article = input('Enter the news article: ')

domain, keywords = make_pred(inp_article, is_hindi=is_hindi)

print('\nIdentified Domain: ' + str(domain) + '\n')
# dict.fromkeys removes repeated keywords while keeping the order returned by
# make_pred; set() would print them in arbitrary order.
print('Extracted Keywords: ' + str(list(dict.fromkeys(keywords))))

Enter the news article: Facebook, in a bid to restrict the spread of hate speech and misinformation on its platform, has banned US conspiracy theorist Alex Jones and other right far-right extremist personalities including Nation of Islam leader Louis Farrakhan, Paul Nehlen, Paul Joseph Watson, Milo Yiannopoulos and Laura Loomer from its platform. Apart from banning these leaders from Facebook, the social media giant has also banned them and their associated pages from Instagram, which in recent times had become the hotbed for extremist views and misinformation.  The strictest ban, as The Atlantic notes in its report, comes against Jones, whose Facebook profile had been suspended last July, and his InfoWars page. The Menlo Park headquartered social media giant has not only banned Jones profile and InfoWars from Facebook and from Instagram but the company has also decided to remove all the content from the two platforms that contain videos, articles and radio segments by the page. In add