# Topic Classification

### Load Word2Vec model and Other Functions

In [3]:
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

model_path = '/github/'

In [4]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load word vectors stored in word2vec format and report progress.

    Args:
        modelName: Display name used in the progress messages.
        modelFile: File name, appended to the module-level ``model_path``.
        flagBin: Passed as ``binary`` to ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print('Loading ' + modelName + ' model...')
    location = model_path + modelFile
    loaded_vectors = KeyedVectors.load_word2vec_format(location, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return loaded_vectors

# Load the Word2Vec vectors from the GoogleNews file (binary word2vec format)
model_word2vec = load_wordvec_model(
    modelName='Word2Vec',
    modelFile='GoogleNews-vectors-negative300.bin.gz',
    flagBin=True,
)
#model_fasttext = load_wordvec_model('FastText', 'fastText_wiki_en.vec', False)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [5]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two strings.

    Each input is split on whitespace and the vectors of its tokens are
    summed. Tokens whose lookup raises ``KeyError`` are skipped.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Object supporting ``vectors[token]``. When it exposes a
            ``vector_size`` attribute that dimensionality is used for the
            accumulators, otherwise 300.

    Returns:
        The cosine similarity of the two summed vectors, or 0 when either
        input contributes no vector (the cosine is undefined in that case).
    """
    dimension = getattr(vectors, 'vector_size', 300)
    term_vectors = [np.zeros(dimension), np.zeros(dimension)]

    for index, term in enumerate((input1, input2)):
        for token in term.split():
            try:
                term_vectors[index] += vectors[token]
            except KeyError:
                # out-of-vocabulary token: contributes nothing
                continue

    # An all-zero vector has no direction, so the cosine would be NaN.
    if not term_vectors[0].any() or not term_vectors[1].any():
        return 0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # The original compared against the string 'nan', which never matched.
    if np.isnan(result):
        return 0

    return result

In [6]:
# function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    """Return the words that are present in the model vocabulary.

    Args:
        vectors: Word-vector model. The vocabulary is read from its
            ``key_to_index`` attribute when available (gensim >= 4) and
            from ``vocab`` otherwise (older gensim releases).
        words: Iterable of candidate tokens.

    Returns:
        List of the in-vocabulary words, in input order, each stripped of
        surrounding whitespace.
    """
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    return [word.strip() for word in words if word in vocabulary]

In [7]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Similarity between two strings under a word-vector model.

    Both strings are split on whitespace, reduced to their in-vocabulary
    words (via ``vocab_check``) and de-duplicated before being passed to
    ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model exposing ``n_similarity``.

    Returns:
        The value returned by ``vectors.n_similarity``, or 0 when either
        string has no in-vocabulary word or the model cannot score the pair.
    """
    s1words = set(vocab_check(vectors, input1.split()))
    s2words = set(vocab_check(vectors, input2.split()))

    # Nothing to compare: report "no similarity" explicitly instead of
    # relying on the model to raise.
    if not s1words or not s2words:
        return 0

    try:
        # sorted() gives the model a list in a reproducible order
        return vectors.n_similarity(sorted(s1words), sorted(s2words))
    except (KeyError, ValueError, ZeroDivisionError):
        return 0

### Data Exploration with Semantic Similarity

In [8]:
# Read the raw newsfeed file as a list of lines; each line is parsed as JSON in the next cell
import json
with open("/github/google_deduplicated.json") as json_file:
    # the context manager closes the file handle once the lines are read
    google_json = json_file.readlines()

In [9]:
# Parse every raw line into a dictionary (one JSON object per newsfeed)
newsfeeds_read = [json.loads(raw_line) for raw_line in google_json]

In [10]:
# Collect every article title, then pick one at random as the reference title
# NOTE(review): no random seed is set before this point, so the pick changes between runs
import random
title_list = [feed['title'] for feed in newsfeeds_read]
article_title = random.choice(title_list)

In [11]:
# Create similarity score list: one score per title against the reference title
sim_list = []

for title in title_list:
    try:
        sim_list.append(calc_similarity(article_title, title, model_word2vec))
    except Exception:
        # keep sim_list aligned with title_list when a title cannot be scored;
        # unlike a bare except this still lets Ctrl-C interrupt the loop
        sim_list.append(0)

In [12]:
# Pair each title with its score and rank the pairs from most to least similar
import pandas as pd
df = pd.DataFrame(list(zip(title_list, sim_list)), columns=['Title', 'Similarity'])
most_similar = df.sort_values(by='Similarity', ascending=False)

In [None]:
most_similar.head(600)

### Extractive Text Summarization  

In [14]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

class TextSummary(object):
    """Extractive summary of a text produced with ``LexRankSummarizer``.

    Args:
        feeds_str: Plain text to summarize; parsed with the "english" tokenizer.
        num_sents: Number of sentences requested from the summarizer.
    """

    def __init__(self, feeds_str, num_sents):
        parser = PlaintextParser.from_string(feeds_str, Tokenizer("english"))
        summarizer = LexRankSummarizer()

        # Summarize the document with num_sents sentences
        sentences = summarizer(parser.document, num_sents)
        # Join with a space so consecutive sentences do not run together.
        self.summary = " ".join(str(sentence) for sentence in sentences)

    def output(self):
        """Return the summary text built in the constructor."""
        return self.summary

In [15]:
# Body text of every newsfeed, parsed from the raw JSON lines
feed_text = [json.loads(raw_feed)['text'] for raw_feed in google_json]

### LDA topics (Assignment 8)
       {0: ['cloud', 'technology', 'team', 'digital', 'health', 'network', 'platform', 'design', 'tool', 'develop'],
        1: ['page', 'https', 'website', 'web', 'site', 'chrome', 'browser', 'link', 'file', 'user'],
        2: ['trump', 'podcast', 'president', 'tech', 'law', 'privacy', 'government', 'order', 'tweet', 'claim'], 
        3: ['police', 'black', 'coronavirus', 'city', 'health', 'case', 'officer', 'floyd', 'protest', 'pm'],
        4: ['million', 'india', 'per', 'pay', 'digital', 'increase', 'billion', 'stock', 'businesses', 'revenue'],
        5: ['game', 'good', 'really', 'lot', 'school', 'nt', 'things', 'students', 'something', 'every'],
        6: ['android', 'phone', 'apps', 'apple', 'de', 'game', 'device', 'pixel', 'store', 'devices'],
        7: ['log', 'smart', 'tv', 'amazon', 'music', 'voice', 'stream', 'assistant', 'youtube', 'never']}

### Manual Data Exploration

In [None]:
title_list

In [18]:
# BUILD YOUR OWN TAXONOMY BASED ON LDA and MANUAL DATA EXPLORATION
#
# Two-level taxonomy: category name -> {subtopic label -> keyword string}.
# Each keyword string is a space-separated list of words. classify_topics
# prepends the category name to the keyword string before scoring it against
# a title with calc_similarity, which splits the string on whitespace, drops
# words missing from the model vocabulary and de-duplicates the rest.
topic_taxonomy = {
    "Business" : 
    {
        "Business Competition" : "competition rivalry consumer customer market share conflict fight",
        "Business Operations" : "reopen operations integration program recruiting sales performance investors",
        "Business Expansion" : "growth market arena barrier conglomerate takeover buyout buy venture pilot partnership partner",
        "Business Tech" : "innovate blockchain cloud enterprise applications public robots automation IoT AI research",
        "Business Law" : "sue countersue law lawyer illegal espionage settlement contract breach nda disclosure trade secrets",
        "Business Performance" : "stock market revenue dividend nasdaq ticker dive increase invest investors"
    },
    
    "Products" :
    {
        "Product Failure" : "recall defect defective failure fail poor issue problem bug",
        # the leading space and the repeated "platform" are harmless: the string
        # is split on whitespace and turned into a set before scoring
        "Product Trends" : " innovate innovative virtual platform cloud technology market team digital health network platform design tool",
        "Product Release" : "new release unveil announce offer launch expand develop"
    },
    
    "Incident" :
    {
        "Disease Outbreak": "Covid-19 health quarantine corona virus coronavirus impact cases recover survivors essential update",
        "Violent Incident": "police city case officer shooting death murder killed robbery crime",
        "Protests" : "black lives matter protest march george floyd racism occupy shooting breonna taylor minority rights",
        "Security Breach": "security hack breach hacker password release data confidential information"
    },
    
    "Technology" :
    {
        "Mobile Tech" :  "android ios apple iphone samsung huawei google pixel device 5g 4g network tower mobile phone cellphone smartphone",
        "Gaming Tech" : "stadia play store microsoft xbox playstation sony nintendo switch vr virtual reality game gaming",
        "Streaming Tech" : "video youtube netflix hulu disney+ starz hbo amazon prime twitch stream on demand zoom tiktok instagram",
        "Payment Tech" : "credit card contactless venmo paypal google pay apple pay square cash"
    },
    
    "Government" :
    {
        "President" :  "trump obama president presidential 45th veto election",
        "Gov Regulations" : "embargo law privacy regulation governance ban injunction",
        "Gov Politics" : "congress representatives house senate judicial judge rules ruling bill economy",
        "Gov Investigation" : "probe investigate investigation allege allegation FBI CIA charges accusation",
        "Gov Relations" : "washington europe united nations china beijing india embassy military dispute refugee visa passport"
    },

    "Artificial Intelligence" : 
    {
        "AI Assistant" : "AI virtual log voice match recognition activation amazon alexa siri google home activate personal assistant notes computer",
        "AI Market" : "AI biotech fintech insurance vehicles healthcare detection self driving global cloud",
        "AI Home" : "AI enabled smart TV music stream lights camera home security google amazon alexa echo mini nest network"
    }
}

In [31]:
# function takes an input string, scores it against every subtopic in topic_taxonomy
# and returns the best-scoring subtopic(s), highest score first
def classify_topics(input, vectors, top_n=1):
    """Classify a string against the module-level ``topic_taxonomy``.

    For every category the input is scored against each subtopic (the
    category name is prepended to the subtopic's keyword string) and only
    the best-scoring subtopic of that category is kept. The kept subtopics
    are then ranked across categories.

    Args:
        input: String to classify.
        vectors: Word-vector model forwarded to ``calc_similarity``.
        top_n: How many ranked subtopics to return. Defaults to 1, which is
            what the function returned before this parameter existed.

    Returns:
        List of ``(subtopic label, score)`` tuples sorted by descending
        score, at most ``top_n`` long.
    """
    feed_score = dict()
    for key, value in topic_taxonomy.items():
        if not value:
            # a category without subtopics has nothing to score
            continue

        label_scores = {
            label: float(calc_similarity(input, (key + ' ' + keywords).strip(), vectors))
            for label, keywords in value.items()
        }
        # max() keeps the first label on ties, like a stable descending sort
        best_label, best_score = max(label_scores.items(), key=operator.itemgetter(1))
        feed_score[best_label] = best_score

    return sorted(feed_score.items(), key=operator.itemgetter(1), reverse=True)[:top_n]

In [338]:
# Category names and, in the same order, the subtopic labels of each category
lst1 = list(topic_taxonomy.keys())
lst2 = [list(subtopics.keys()) for subtopics in topic_taxonomy.values()]

In [339]:
# (category, [subtopic labels]) pairs
newlst = [(category, subtopics) for category, subtopics in zip(lst1, lst2)]

In [340]:
newlst[1][1]

['Product Failure', 'Product Trends', 'Product Release']

In [389]:
newlst

[('Business',
  ['Business Competition',
   'Business Operations',
   'Business Expansion',
   'Business Tech',
   'Business Law',
   'Business Performance']),
 ('Products', ['Product Failure', 'Product Trends', 'Product Release']),
 ('Incident',
  ['Disease Outbreak', 'Violent Incident', 'Protests', 'Security Breach']),
 ('Technology',
  ['Mobile Tech', 'Gaming Tech', 'Streaming Tech', 'Payment Tech']),
 ('Government',
  ['President',
   'Gov Regulations',
   'Gov Politics',
   'Gov Investigation',
   'Gov Relations']),
 ('Artificial Intelligence', ['AI Assistant', 'AI Market', 'AI Home'])]

In [None]:
# Print every title next to its classification result
for title in title_list:
    print(title, classify_topics(title, model_word2vec))

In [33]:
# Copy of the titles plus, in the same order, the classification result of each one
titles = list(title_list)
output_list = [classify_topics(title, model_word2vec) for title in title_list]

In [250]:
# Split the first (label, score) entry of every result into two parallel lists
labels = [result[0][0] for result in output_list]
values = [result[0][1] for result in output_list]

In [251]:
# One row per title: the title, its best-matching subtopic and the similarity score
df = pd.DataFrame(
    list(zip(titles, labels, values)),
    columns=['Title', 'Topic', 'Value'],
)

In [390]:
# Topics and subtopics with (up to) 10 closest titles

for topic_name, subtopic_names in newlst:
    print("Topic: ", topic_name)
    # Walk every subtopic of the topic. The previous range(len(pair) + 1)
    # always visited exactly three subtopics, whatever the topic contained.
    for subtopic_name in subtopic_names:
        print('\n\nSubtopic: ', subtopic_name, '\n')
        working = df[df['Topic'] == subtopic_name]
        short_list = working.sort_values(by='Value', ascending=False).head(10)
        # A subtopic may have fewer than 10 titles, so iterate over the rows
        # that exist instead of indexing positions 0..9.
        for value, title in zip(short_list['Value'], short_list['Title']):
            print(value, title)
    

Topic:  Business


Subtopic:  Business Competition 

0.5479925274848938 Reflecting on DuckDuckGo's rise as the privacy-focused search engine and the possibility of increased market share because of EU regulatory pressure (Matt Burgess/WIRED UK)
0.5377374887466431 Sources: India's antitrust body is looking into allegations that Google is abusing its market position to unfairly promote its mobile payments app Google Pay (Reuters)
0.5278651714324951 Consumer experience of online grocery must improve, says app start-up
0.5157110691070557 Google faces India mobile payments competition claim
0.5131257176399231 Google sues Sonos, escalating wireless speaker battle amid trade panel probe - Midwest Communication
0.5097876787185669 WhatsApp Pay struggles in India’s mobile payments market dominated by Google, Walmart, Paytm
0.5092771053314209 Disney+ beats Netflix in consumer satisfaction survey
0.5074613094329834 India’s telecom wars to heat up? Google eyes 5% stake in struggling Vodafone-Idea –