In [1]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import fasttext as ft # language recognition
from pycountry import languages # to convert language ISO to language name
from collections import Counter # for counting

ModuleNotFoundError: No module named 'fasttext'

## Approach for classification of news article:
1. Detect languages (other than English) using fasttext.
**Done**
2. Convert languages other than English to English (using BERT) **Shelved**
3. Can we use kNN clustering for this unsupervised classification problem? **Shelved**
4. Can we get one keyword, preferably the subject part-of-speech, for each document, and given our known output categories, somehow link each keyword to a category? **Done**
5. Is it possible to use the vocab tree generated by word2vec as a decision tree for classification? **To do**
6. Lastly, display a t-SNE plot with points coloured by their predicted category. **t-SNE plot done; points are not yet coloured by category**

## Sub-tasks to be improved:

1. When using LDA, one keyword can appear in more than one topic. **Expected behaviour**: A keyword must occur in only one topic.
2. Find optimal number of total topics (LDA)
3. Limit number of words per topic (LDA)
4. LDA returns topics made up of words that occur in the documents. How can the words within a topic be linked to their more general synonyms?
<br>
<br>
E.g., the following sentence, 'Whipping up nationalism has a shelf life: Kamal Nath', is attributed 89% to the topic:
<br>
<br>
0.015*"back" + 0.015*"fighting" + 0.015*"homemade" + 0.015*"demand" + 0.015*"purifier" + 0.015*"smog" + 0.015*"pakistanis" + 0.015*"shelf" + 0.015*"whipping" + 0.015*"nationalism".
<br>
<br>
Here, the words 'nationalism' and 'shelf' have been picked up. How to link this to 'politics'?
5. Tune the hyperparameters of word2vec to produce better word embeddings

In [None]:
import random

n = 1411104 #number of records in file
s = 100000 #desired sample size
filename = "../input/times-internet-news/Train_data (News data set).csv"
SEED = 42 #fixed seed so that every run samples the same rows

#row positions to skip while reading: a seeded random choice of n-s positions
#out of range(n); a private Random instance is used so the global random
#state is left untouched
skip = sorted(random.Random(SEED).sample(range(n),n-s))
col_names = ['title','link','description','long_description','id']

#the file is pipe-separated and the column names are supplied explicitly
data = pd.read_csv(filename,sep='|',index_col=None,names=col_names,skiprows=skip)
#drop every row that has a missing value in any column
data = data.dropna(how='any')
data.head()

## Peculiarities of the data

1. High number of proper nouns (people, places, brands) in the dataset. Since these proper nouns do not occur too many times, the model must be adept at assigning a relevant class to them.

In [None]:
#NOT IN USE
#using 1/10th of the dataset
#the frame loaded by this notebook is named `data`; the previous reference to
#`df` pointed at a name that is never defined and raised NameError
newsData = data[:data.shape[0]//10]
newsData.info()

In [None]:
#NOT IN USE
#to split dataset range into buckets of equal size (for batch processing to not overwhelm CPU)
#the frame loaded by this notebook is named `data` (`df` is never defined)
nBuckets = 10
total = data.shape[0]
#round the row count down to the nearest multiple of nBuckets
total -= total % nBuckets

#size of one bucket
limit = total//nBuckets
#the nBuckets-1 interior boundaries: limit, 2*limit, ..., (nBuckets-1)*limit
buckets = [i*limit for i in range(1,nBuckets)]

print(buckets)

In [None]:
#function to detect language of title and return list of languages
def detect_lang(dataframe, startLimit, endLimit, langs, model=None):
    """Append the detected language name of each title to ``langs``.

    Parameters
    ----------
    dataframe : object with a sliceable ``title`` attribute yielding strings
        (the notebook passes a pandas DataFrame).
    startLimit : int
        The slice of titles starts at ``startLimit + 1``; pass -1 to start
        at the first title.
    endLimit : int
        Exclusive end of the slice of titles.
    langs : list
        Output list; it is extended in place and also returned.
    model : optional
        Object whose ``predict(text)`` returns a sequence whose first item
        is an iterable of label strings; the text after the last ``'__'``
        of the joined labels is taken as the language code. Defaults to the
        notebook-level ``lid_model``.

    Returns
    -------
    list
        ``langs``, with one entry appended per processed title: the
        language name for a resolvable two-letter code, otherwise ``'New'``.
    """
    if model is None:
        # fall back to the model loaded at notebook level
        model = lid_model
    for title in dataframe.title[startLimit+1:endLimit]:
        # trim the title and remove newline characters before predicting
        title = title.strip().replace('\n','')
        labels = model.predict(title)[0]
        detected_lang = ''.join(labels).split('__')[-1]
        lang_name = 'New'
        if len(detected_lang) == 2:
            try:
                lang_name = languages.get(alpha_2=detected_lang).name
            except (AttributeError, LookupError):
                # NOTE(review): an unknown code presumably either yields None
                # (AttributeError on .name) or raises KeyError, depending on
                # the pycountry version - confirm. Other errors now surface
                # instead of being swallowed by a bare except.
                lang_name = 'New'
        langs.append(lang_name)
    return langs

In [None]:
# initialising FastText model
# loads the language-identification model file from the attached input dataset
lid_model = ft.load_model("../input/fasttext-language-prediction/lid.176.bin")
# start from an empty list so re-running this cell does not accumulate labels
langs = []

#detecting title language
# startLimit=-1 makes the slice inside detect_lang begin at position 0, so
# every row of `data` is processed; %time reports how long the call takes
%time langs = detect_lang(data, -1, data.shape[0], langs)

In [None]:
# number of language labels collected
print(len(langs))

# (language, count) pairs, ordered from the most to the least frequent label
lang_counts = Counter(langs).most_common()
for language,count in lang_counts:
    print(language,count)

In [None]:
#adding column 'title_language' to data for title language
#langs is assigned as a plain list, so it must hold exactly one entry per row of data
data['title_language'] = langs
#DataFrame.info() prints its report itself and returns None, so wrapping it
#in print() only added a stray "None" line to the output
data.info()

In [None]:
#creating list of news content (title + description)

#title and description joined by a single space
data['content'] = data['title'] + ' ' + data['description']
#keep only the rows whose title was detected as English
is_english = data.title_language == 'English'
contents = data.content[is_english]
#plain numpy array of the selected content strings
contentList = contents.to_numpy()
print(contentList[:10])

## Keyword extraction from each article (title + description) using TF-IDF scores

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#count vectorizer with the built-in English stop-word list
cv = CountVectorizer(stop_words='english')
#learn the vocabulary and build the document-term count matrix in one step
word_count_vector = cv.fit_transform(contentList)

#display sample 10 words from vocabulary
list(cv.vocabulary_)[:10]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

#calculating IDF scores for each word
#the transformer is fitted on the count matrix built from contentList; being
#the last expression of the cell, the fitted transformer is also displayed
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [None]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, largest value first.

    Pairs are ordered by value and then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` (index, score) pairs to {feature name: score}.

    ``sorted_items`` is expected to be ordered already; only its first
    ``topn`` entries are used. Scores are rounded to three decimals and the
    resulting dict keeps the order of ``sorted_items``.
    """
    return {
        feature_names[index]: round(value, 3)
        for index, value in sorted_items[:topn]
    }

In [None]:
#feature names, indexable by vocabulary column id
#get_feature_names() is no longer available in newer scikit-learn releases,
#which provide get_feature_names_out() instead; use whichever this
#installation offers so the cell runs on both
if hasattr(cv,'get_feature_names_out'):
    feature_names=cv.get_feature_names_out()
else:
    feature_names=cv.get_feature_names()

#sample test case
doc=contentList[19]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 5
keywords=extract_topn_from_vector(feature_names,sorted_items,5)

# now print the results
print("\n=====Doc=====")
print(doc)
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])

In [None]:
allkeywords = []

#generate tf-idf for all documents in one pass instead of calling the
#vectorizer and transformer once per document
tfidf_matrix = tfidf_transformer.transform(cv.transform(contentList)).tocsr()

for row_index in range(tfidf_matrix.shape[0]):
    #sort this document's tf-idf entries by descending order of scores
    sorted_items = sort_coo(tfidf_matrix[row_index].tocoo())

    #extract only the top n; n here is 5, and only the words are kept
    keywords = list(extract_topn_from_vector(feature_names,sorted_items,5))
    allkeywords.append(keywords)

#show a small sample rather than printing one line per document
print(allkeywords[:5])

In [None]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer

#regex for capturing only words
tokenizer = RegexpTokenizer(r'\w+')

#to initialize list of English stop words
en_stop = get_stop_words('en')
#set copy of the stop words so each membership test is a hash lookup
#instead of a scan through the list
en_stop_set = set(en_stop)

#create stemmer (stemming is currently disabled below)
p_stemmer = PorterStemmer()

#list to add tokenized documents to 
texts = []

for content in contentList:
    tokens = tokenizer.tokenize(content.lower())
    #drop one-letter tokens created by tokenization, then drop stop words
    stopped_tokens = [tok for tok in tokens if len(tok)>1 and tok not in en_stop_set]
#     stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
    texts.append(stopped_tokens)

#show a small sample rather than dumping every tokenized document
print(texts[:5])

## Word2Vec for word embeddings

In [None]:
import gensim
import pickle

#building Word2Vec model
#trained on the tokenized documents in `texts`; %time reports the training time
#NOTE(review): `size` and `iter` look like gensim 3.x keyword names (later
#releases presumably use `vector_size` and `epochs`) - confirm against the
#installed gensim version
%time w2vmodel = gensim.models.Word2Vec(texts,size=500,window=20,min_count=2,workers=10,iter=10)

#saving the model
#only the word vectors (w2vmodel.wv) are saved, not the full model
#NOTE: this rebinds `filename`, which earlier held the path of the input CSV
filename = 'word2vec_model'
w2vmodel.wv.save(filename)

#TRY IF USING pickle
# filename = 'finalized_model.sav'
# pickle.dump(w2vmodel, open(filename, 'wb'))
# loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
#NOT IN USE
#loads pretrained vectors from a binary word2vec-format file into `word2vec`;
#no other cell in this notebook reads `word2vec`
import gensim
word2vec_path = "../input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# read and display categories to be used for classification
# rows with a missing value in any column are dropped straight after loading
cats = pd.read_csv(
    '../input/time-internet-new-categories/categories_data.csv'
).dropna(how='any')
cats

In [None]:
# one list of lower-cased tokens per category; one-letter tokens are discarded
categories = [
    [token for token in name.lower().split() if len(token) > 1]
    for name in cats.refined_categories
]

print(categories)

In [None]:
#for every document: mean word2vec similarity between its keywords and the
#tokens of each category (one score per category, in `categories` order)
allsimilarities = []
for keyword in allkeywords:
    category_similarities = []
    for category in categories:
        pair_count = len(category)*len(keyword)
        if pair_count == 0:
            #a document without keywords or a category without tokens has no
            #pairs to average; score it 0 instead of dividing by zero
            category_similarities.append(0.0)
            continue
        similarities = 0
        for word in category:
            for key in keyword:
                try:
                    similarities += w2vmodel.wv.similarity(key,word)
                except KeyError:
                    #NOTE(review): KeyError is presumably what gensim raises
                    #for a word missing from the model vocabulary - confirm;
                    #such a pair contributes 0 but still counts in the mean
                    pass
        similarity = similarities/pair_count
        category_similarities.append(round(similarity,3))
    allsimilarities.append(category_similarities)

In [None]:
# sample list of similarity values to every category for a news content
# (the document at position 1 is used as the example)
sample_keywords = allkeywords[1]
sample_scores = allsimilarities[1]
print('News keywords: \n',sample_keywords)
print('\nSimilarities:')
for category_tokens, score in zip(categories, sample_scores):
    print(category_tokens,': ',score)

In [None]:
# for each document keep the category with the highest similarity score
# (the first one wins on ties) together with that score
text_categories = []
similarity_score = []
for scores in allsimilarities:
    best_position = scores.index(max(scores))
    text_categories.append(categories[best_position])
    similarity_score.append(scores[best_position])

# one row per document: content, chosen category tokens, winning score
summary_dict = {
    'content': contentList,
    'category': text_categories,
    'similarity score': similarity_score,
}
summary_df = pd.DataFrame(summary_dict)
summary_df.head(10)

## Smooth Inverse Frequency for sentence vectors

In [None]:
from collections import Counter
import itertools

#to give frequency of each word in the corpus/document
def map_word_frequency(document):
    """Count every token across all token sequences in ``document``.

    ``document`` is an iterable of token iterables; the result is a Counter
    mapping each token to its total number of occurrences.
    """
    all_tokens = itertools.chain.from_iterable(document)
    return Counter(all_tokens)

#to return list of sentence vectors
def get_sif_feature_vectors(sentence1, sentence2, word_emb_model=None):
    """Build one weighted-average word-vector per sentence.

    Parameters
    ----------
    sentence1, sentence2 : str
        Sentences; they are split on whitespace and tokens missing from
        ``word_emb_model.wv.vocab`` are dropped.
    word_emb_model : optional
        Model exposing ``wv`` with ``vocab``, ``vector_size`` and
        ``wv[word]`` lookup. Defaults to the notebook-level ``w2vmodel``,
        resolved when the function is called.

    Returns
    -------
    list
        Two numpy vectors of length ``wv.vector_size``, in argument order.
        Each word vector is weighted by ``a / (a + count)`` with
        ``a = 0.001``, where ``count`` is the word's number of occurrences
        across the two filtered sentences, and the weighted sum is divided
        by the sentence's token count. A sentence with no in-vocabulary
        token yields an all-zero vector (previously a division by zero
        produced NaNs).
    """
    if word_emb_model is None:
        # fall back to the model trained at notebook level
        word_emb_model = w2vmodel
    keyed_vectors = word_emb_model.wv

    # keep only the tokens the embedding model knows
    tokenised = [
        [token for token in sentence.split() if token in keyed_vectors.vocab]
        for sentence in (sentence1, sentence2)
    ]
    # occurrences of each kept token across both sentences
    word_counts = Counter(token for tokens in tokenised for token in tokens)

    # take the dimensionality from the model rather than hard-coding it
    embedding_size = keyed_vectors.vector_size
    a = 0.001
    sentence_set = []
    for tokens in tokenised:
        vs = np.zeros(embedding_size)
        for word in tokens:
            a_value = a / (a + word_counts[word]) # smooth inverse frequency, SIF
            vs = np.add(vs, np.multiply(a_value, keyed_vectors[word])) # vs += sif * word_vector
        if tokens:
            vs = np.divide(vs, len(tokens)) # weighted average
        sentence_set.append(vs)
    return sentence_set

## t-SNE for word2vec model visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

labels = []
tokens = []

for word in w2vmodel.wv.vocab:
    tokens.append(w2vmodel.wv[word])
    labels.append(word)
    
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
%time new_values = tsne_model.fit_transform(tokens)

In [None]:
# split the projected points into x and y coordinate lists
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

plt.figure(figsize=(16, 16))
# one scatter call and one text annotation per word
for label, x_pos, y_pos in zip(labels, x, y):
    plt.scatter(x_pos, y_pos)
    plt.annotate(
        label,
        xy=(x_pos, y_pos),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

## Topic Modelling using LDA

In [None]:
from gensim import corpora,models

#creating dictionary by assigning id to each token in our token list
dictionary = corpora.Dictionary(texts)
#show the vocabulary size and a small sample of (token, id) pairs rather
#than dumping the whole token->id mapping into the output
print(len(dictionary.token2id))
print(list(dictionary.token2id.items())[:10])

In [None]:
import gensim

#converting our dictionary to bag-of-words model
#one bag-of-words entry per tokenized document in `texts`
corpus = [dictionary.doc2bow(text) for text in texts]

#defining number of topics (to be retrived by LDA)
num_topics = 3

#generating LDA model
#%time reports the training time; no random_state is passed to LdaModel here
#NOTE(review): eta is presumably the prior on the topic-word distribution -
#confirm against the gensim LdaModel documentation
%time ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, \
                                                 id2word=dictionary, \
                                                 passes=4, \
                                                 eta=2)

In [None]:
#display results
#each entry is a (topic id, formatted topic string) pair
topic_rows = ldamodel.show_topics(formatted=True, num_topics=num_topics)
for topic_id, topic_text in topic_rows:
    print(f"{topic_id}: \n{topic_text}\n")

In [None]:
#printing topics for the first few documents only; printing every document
#in the corpus floods the notebook output
N_PREVIEW_DOCS = 10
for bow in corpus[:N_PREVIEW_DOCS]:
    print(ldamodel[bow])
    print('\n')