In [6]:
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

import contractions

from nltk.tokenize import word_tokenize
from string import punctuation


from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
import spacy

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.parsing.preprocessing import remove_stopwords

In [7]:
# Load the interim article table. Later cells read its article_published_on,
# article_body and article_heading columns.
df_articles = pd.read_csv("../data/interim/articles_processed.csv")

In [8]:
# Parse the publication timestamps with pd.to_datetime. The previous
# astype(np.datetime64) is a cast to a unit-less dtype, which pandas >= 2.0 rejects.
df_articles["article_published_on"] = pd.to_datetime(df_articles["article_published_on"])

In [9]:
# Chronological hold-out: articles published before 20 Aug 2021 form the training
# pool, articles published on or after that date form the test set.
df_train = df_articles[df_articles.article_published_on < datetime(2021, 8, 20)]
df_test = df_articles[df_articles.article_published_on >= datetime(2021, 8, 20)]

## Functions

In [58]:
# processed_body = text_pipeline(df_test.article_body)

In [59]:
# processed_heading = text_pipeline(df_test.article_heading)

In [64]:
# processed_text = pd.DataFrame({'article_body':processed_body,'heading':processed_heading},index=df_test.index)

In [66]:
# processed_text.to_csv("../data/processed/processed_test_data.csv")

In [10]:
def text_pipeline(X):
    """Normalise raw article text into space-separated, stemmed and lemmatised tokens.

    Steps, in order: expand contractions, lower-case, strip hyphens / possessive
    "'s" / en dashes, drop ASCII punctuation, remove stop words (NLTK, gensim plus
    a few extra words, spaCy), Porter-stem, then WordNet-lemmatise.

    Parameters
    ----------
    X : str, pandas.Series or single-column pandas.DataFrame
        Text to process. A bare string is wrapped in a one-element Series.
        Missing values are treated as empty strings.

    Returns
    -------
    pandas.Series
        One processed string per input row, carrying the input's index.

    Raises
    ------
    TypeError
        If ``X`` is not a str, Series or DataFrame.
    ValueError
        If ``X`` is a DataFrame that does not have exactly one column.
    """
    if isinstance(X, str):
        X = pd.Series(X)
    elif isinstance(X, pd.DataFrame):
        # The string operations below only work on a Series, so a DataFrame is
        # accepted only when it unambiguously holds one column of text.
        if X.shape[1] != 1:
            raise ValueError(
                f"A DataFrame input must contain exactly one text column. Received {X.shape[1]} columns"
            )
        X = X.iloc[:, 0]
    elif not isinstance(X, pd.Series):
        raise TypeError(
            f"Input should either be in 'str' format or a 'series' or 'Dataframe' with a column of text. Received an object of type {type(X)}"
        )

    # The English stop-word set is importable directly; loading the full
    # 'en_core_web_sm' pipeline on every call just to read it is needlessly slow.
    from spacy.lang.en.stop_words import STOP_WORDS as spacy_stop_words

    # Sets give constant-time membership tests inside the per-token filters.
    nltk_stop_words = set(stopwords.words("english"))
    gensim_stop_words = STOPWORDS.union(
        {"the", "say", "said", "get", "it", "in", "like", "new", "year"}
    )
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def _without(text, unwanted):
        """Re-tokenise `text` and drop every token found in `unwanted`."""
        return " ".join(word for word in word_tokenize(text) if word not in unwanted)

    # Missing text would make contractions.fix fail, so treat it as empty.
    text = X.fillna("").apply(contractions.fix).str.lower()

    text = text.apply(
        lambda x: x.replace("-", " ").replace("'s", "").replace("’s", "").replace("–", "")
    )

    # punctuations (ASCII only, as defined by string.punctuation)
    text = text.apply(lambda x: "".join(c for c in x if c not in punctuation))

    # stop words: the passes are kept separate and in their original order so the
    # resulting tokens stay identical to what earlier runs of this pipeline produced
    text = text.apply(lambda x: _without(x, nltk_stop_words))
    text = text.apply(remove_stopwords)
    text = text.apply(lambda x: _without(x, gensim_stop_words))
    text = text.apply(lambda x: _without(x, spacy_stop_words))

    # Stemming and lemmatizing
    text = text.apply(
        lambda x: " ".join(stemmer.stem(word) for word in word_tokenize(x))
    )
    text = text.apply(
        lambda x: " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(x))
    )

    return text

In [11]:
def text_vectorizer(X, vectorizer, fit=False):
    """Turn documents ``X`` into vectors with ``vectorizer``.

    Calls ``vectorizer.fit_transform(X)`` when ``fit`` is truthy, otherwise
    ``vectorizer.transform(X)``, and returns that call's result unchanged.
    """
    vectorize = vectorizer.fit_transform if fit else vectorizer.transform
    return vectorize(X)

In [12]:
def get_topic_vectors(X, model, fit=False):
    """Project vectors ``X`` through ``model``.

    Uses ``model.fit_transform`` when ``fit`` is truthy and ``model.transform``
    otherwise; the chosen method's return value is passed straight back.
    """
    method_name = "fit_transform" if fit else "transform"
    return getattr(model, method_name)(X)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_articles(text_vectors, X, top_n_values=10):
    """
    Score every row of ``X`` against every row of ``text_vectors`` by cosine similarity.

    Returns the result of ``cosine_similarity(X, text_vectors, dense_output=True)``:
    one row per row of ``X`` and one column per row of ``text_vectors``.
    ``top_n_values`` is accepted but not used.

    NOTE(review): a later cell defines another function under this same name;
    once that cell runs, this definition is no longer reachable.
    """
    return cosine_similarity(X, text_vectors, dense_output=True)
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [14]:
# NOTE(review): a column width of 3 characters truncates nearly every displayed
# string (headings included) -- confirm this value is intended.
pd.set_option("display.max_colwidth", 3)
def print_similar_articles(test_indices, similarity_scores, top_n_values=5):
    """Return the best-matching training articles for the first test article.

    Parameters
    ----------
    test_indices : sequence of int
        Positional row numbers into ``df_test``; entry ``k`` belongs to column
        ``k`` of ``similarity_scores``.
    similarity_scores : array-like, 2-D
        One row per row of ``df_train``, one column per entry of ``test_indices``.
    top_n_values : int, default 5
        How many of the highest-scoring training articles to report.

    Returns
    -------
    tuple or None
        ``(heading, matches)`` where ``heading`` is the test article's
        ``article_heading`` and ``matches`` is a DataFrame with columns
        ``article_heading``, ``Similarity score`` and ``article_date``, ordered by
        descending score. ``None`` when ``similarity_scores`` has no columns.

    Only the first column is reported. That is what the earlier version did as
    well, because its ``return`` sat inside the loop over columns.
    """
    similarity_scores = np.asarray(similarity_scores)
    if similarity_scores.shape[1] == 0:
        return None

    # Rank once, then read the scores through the same ordering instead of
    # sorting the matrix a second time.
    ranked_rows = np.argsort(similarity_scores, axis=0)[::-1, :][:top_n_values, :]
    ranked_scores = np.take_along_axis(similarity_scores, ranked_rows, axis=0)

    column = 0
    matches = df_train.iloc[ranked_rows[:, column]]
    return (
        df_test.iloc[test_indices[column]].article_heading,
        pd.DataFrame(
            {
                'article_heading': matches.article_heading,
                'Similarity score': ranked_scores[:, column],
                'article_date': matches.article_published_on,
            }
        ),
    )

In [47]:
def load_saved_models_to_memory(n_components, base_path="D:\\Models\\news recommender\\"):
    """Load the three saved artefacts for one topic count into ``saved_models``.

    Parameters
    ----------
    n_components : int
        Topic count of the saved run; one of 300, 240, 180, 150, 120, 90, 60, 30.
    base_path : str, default "D:\\Models\\news recommender\\"
        Directory holding the joblib files.

    Returns
    -------
    None
        The loaded objects are stored under ``saved_models[n_components]`` with
        the keys ``"topic_vectors_train"``, ``"vectorizer"`` and ``"model"``.

    Raises
    ------
    ValueError
        If no saved run exists for ``n_components``.
    """
    import os

    # File-name suffix (run timestamp + topic count) of each saved run.
    run_suffixes = {
        300: "0830_1513_300",
        240: "0830_1406_240",
        180: "0830_1304_180",
        150: "0830_1205_150",
        120: "0830_1109_120",
        90: "0830_1015_90",
        60: "0830_0925_60",
        30: "0830_0838_30",
    }
    if n_components not in run_suffixes:
        raise ValueError(
            f"No saved models for n_components={n_components}; expected one of {sorted(run_suffixes)}"
        )
    suffix = run_suffixes[n_components]

    # NOTE(review): each file-name prefix differs from the key it is stored under
    # ('vectorizer_*' -> "topic_vectors_train", 'lda_model_*' -> "vectorizer",
    # 'topic_vector_train_*' -> "model"). The pairing is kept exactly as it was;
    # confirm it matches how the artefacts were saved before renaming anything.
    # joblib.load unpickles its input, so only point base_path at trusted files.
    topic_vectors_train = joblib.load(os.path.join(base_path, "vectorizer_" + suffix))
    vectorizer = joblib.load(os.path.join(base_path, "lda_model_" + suffix))
    model = joblib.load(os.path.join(base_path, "topic_vector_train_" + suffix))

    # setdefault tolerates a saved_models dict that has no slot for this count yet.
    slot = saved_models.setdefault(n_components, {})
    slot["topic_vectors_train"] = topic_vectors_train
    slot["vectorizer"] = vectorizer
    slot["model"] = model

In [48]:
# One empty slot per topic count; load_saved_models_to_memory fills each slot in place.
saved_models = {300:{},240:{},180:{},150:{},120:{},90:{},60:{},30:{}}
# Iterate over the slots themselves: `components_saved` is only assigned in a
# later cell, so depending on it here breaks a top-to-bottom run of the notebook.
for component in tuple(saved_models):
    load_saved_models_to_memory(component)

In [15]:
# def get_saved_models(n_components):
#     base_path = "D:\\Models\\news recommender\\"
#     if n_components == 300:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1513_300")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1513_300")
#         model = joblib.load(base_path + "topic_vector_train_0830_1513_300")
#     elif n_components == 240:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1406_240")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1406_240")
#         model = joblib.load(base_path + "topic_vector_train_0830_1406_240")
#     elif n_components == 180:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1304_180")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1304_180")
#         model = joblib.load(base_path + "topic_vector_train_0830_1304_180")
#     elif n_components == 150:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1205_150")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1205_150")
#         model = joblib.load(base_path + "topic_vector_train_0830_1205_150")
#     elif n_components == 120:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1109_120")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1109_120")
#         model = joblib.load(base_path + "topic_vector_train_0830_1109_120")
#     elif n_components == 90:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_1015_90")
#         vectorizer = joblib.load(base_path + "lda_model_0830_1015_90")
#         model = joblib.load(base_path + "topic_vector_train_0830_1015_90")
#     elif n_components == 60:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_0925_60")
#         vectorizer = joblib.load(base_path + "lda_model_0830_0925_60")
#         model = joblib.load(base_path + "topic_vector_train_0830_0925_60")
#     elif n_components == 30:
#         topic_vectors_train = joblib.load(base_path + "vectorizer_0830_0838_30")
#         vectorizer = joblib.load(base_path + "lda_model_0830_0838_30")
#         model = joblib.load(base_path + "topic_vector_train_0830_0838_30")
#     return topic_vectors_train, vectorizer, model

In [16]:
# Topic counts (n_components) for which saved artefacts exist; these are the
# same values used as keys of saved_models.
components_saved = [300,240,180,150,120,90,60,30]

## Ensemble

In [17]:
def get_similar_articles(test_indices, similarity_scores, top_n_values=5):
    """Return the best-matching training articles for the last test article.

    Parameters
    ----------
    test_indices : sequence of int
        Positional row numbers into ``df_test``; entry ``k`` belongs to column
        ``k`` of ``similarity_scores``.
    similarity_scores : array-like, 2-D
        One row per row of ``df_train``, one column per entry of ``test_indices``.
    top_n_values : int, default 5
        How many of the highest-scoring training articles to report.

    Returns
    -------
    tuple
        ``(heading, matches)`` where ``heading`` is the test article's
        ``article_heading`` and ``matches`` is a DataFrame with columns
        ``article_heading``, ``Similarity score`` and ``article_date``, ordered by
        descending score.

    Raises
    ------
    ValueError
        If ``similarity_scores`` is not 2-D or has no columns.

    Only the last column is reported. The earlier loop over columns had the same
    effect, since nothing but its final iteration reached the return statement.
    """
    similarity_scores = np.asarray(similarity_scores)
    if similarity_scores.ndim != 2 or similarity_scores.shape[1] == 0:
        raise ValueError(
            "similarity_scores must be a 2-D array with at least one column"
        )

    last = similarity_scores.shape[1] - 1
    # Sort only the column that is reported instead of the whole matrix twice.
    column_scores = similarity_scores[:, last]
    indices = np.argsort(column_scores)[::-1][:top_n_values]
    matches = df_train.iloc[indices]
    return (
        df_test.iloc[test_indices[last]].article_heading,
        pd.DataFrame(
            {
                'article_heading': matches.article_heading,
                'Similarity score': column_scores[indices],
                'article_date': matches.article_published_on,
            }
        ),
    )

In [18]:
import math
def get_article_importance_day_wise(growth=1000):
    """Exponential recency weight for every row of ``df_train``.

    For each article the age is its publication date minus the latest publication
    date in ``df_train``, in whole days (zero or negative). The weight is
    ``exp(age / growth)``.
    """
    published_on = df_train.article_published_on
    days_from_latest = (published_on - published_on.max()).dt.days
    return np.exp(days_from_latest / growth)

In [19]:
def get_similarity_score(text_vectors, X, factor=None):
    """
    Cosine similarity between the stored vectors ``X`` and the query ``text_vectors``,
    optionally scaled by ``factor``.

    Parameters
    ----------
    text_vectors : array-like
        Query vectors; each row becomes one column of the result.
    X : array-like
        Stored vectors; each row becomes one row of the result.
    factor : array-like or None, default None
        Multiplier applied to the similarity matrix (it must broadcast against
        it). ``None`` returns the unscaled similarities.

    Returns
    -------
    The matrix from ``cosine_similarity(X, text_vectors, dense_output=True)``,
    multiplied by ``factor`` when one is given.
    """
    similarity_scores = cosine_similarity(X, text_vectors, dense_output=True)
    # The default used to be multiplied in directly, and `scores * None` raises.
    if factor is None:
        return similarity_scores
    return similarity_scores * factor
#     return np.argsort(similarity_scores, axis=0)[::-1,:][:top_n_values,:]

In [49]:
def process_news_article(component, include_headings=False, heading_weightage=0.6, test_indices=None, factor=None):
    """Find the training articles most similar to the selected test articles.

    Parameters
    ----------
    component : int
        Key into ``saved_models`` selecting which saved artefacts to use.
    include_headings : bool, default False
        When true, blend heading similarity into the body similarity.
    heading_weightage : float, default 0.6
        Weight of the heading similarity in the blend; the body similarity gets
        ``1 - heading_weightage``.
    test_indices : sequence of int
        Positional row numbers into ``df_test``. Required.
    factor : array-like or None, default None
        Per-training-article multiplier for the similarity scores. ``None`` means
        a column of ones with one entry per row of ``df_train``.

    Returns
    -------
    tuple
        Whatever ``get_similar_articles(test_indices, similarity_scores)`` returns.

    Raises
    ------
    ValueError
        If ``test_indices`` is not supplied.
    """
    if test_indices is None:
        raise ValueError("test_indices must be provided (positional rows of df_test)")
    if factor is None:
        # Built per call: a default evaluated in the signature is frozen to the
        # size df_train had when the function was defined.
        factor = np.ones((df_train.shape[0], 1))

    artefacts = saved_models[component]
    topic_vectors_train = artefacts["topic_vectors_train"]
    vectorizer = artefacts["vectorizer"]
    model = artefacts["model"]
    test_rows = df_test.iloc[test_indices]

    def _similarity_for(raw_text):
        """Run raw text through the pipeline, vectorizer and topic model, then score it."""
        lemmas = text_pipeline(raw_text)
        term_vectors = text_vectorizer(lemmas, vectorizer)
        topic_vectors = get_topic_vectors(term_vectors, model)
        return get_similarity_score(topic_vectors, topic_vectors_train, factor=factor)

    similarity_scores = _similarity_for(test_rows.article_body)
    if include_headings:
        similarity_scores = (
            heading_weightage * _similarity_for(test_rows.article_heading)
            + (1 - heading_weightage) * similarity_scores
        )
    # Build the neighbour table once, from the final (possibly blended) scores.
    heading, result = get_similar_articles(test_indices, similarity_scores)
    return heading, result

In [82]:
# Drop the loaded artefacts (presumably to free memory).
# NOTE(review): process_news_article and ensemble_similarity_scores read
# saved_models, so it must be rebuilt before any later cell calls them.
del saved_models

In [69]:
# Load the pre-processed test text (columns: article_body, heading); the first
# CSV column is used as the index.
processed_text = pd.read_csv(r"../data/processed/processed_test_data.csv",index_col=0)

Index(['article_body', 'heading'], dtype='object')

In [78]:
def ensemble_similarity_scores(components, include_headings=False, heading_weightage=0.6, test_indices=None, factor=None):
    """Compute one similarity matrix per saved topic model for the selected test articles.

    The already-processed text in ``processed_text`` is used, so the text
    pipeline is not re-run here.

    Parameters
    ----------
    components : iterable of int
        Keys into ``saved_models``; one similarity matrix is produced per key.
    include_headings : bool, default False
        When true, blend heading similarity into the body similarity.
    heading_weightage : float, default 0.6
        Weight of the heading similarity in the blend; the body similarity gets
        ``1 - heading_weightage``.
    test_indices : sequence of int
        Positional row numbers into ``processed_text``. Required.
    factor : array-like or None, default None
        Per-training-article multiplier for the similarity scores. ``None`` means
        a column of ones with one entry per row of ``df_train``.

    Returns
    -------
    list
        Similarity matrices in the order of ``components``.

    Raises
    ------
    ValueError
        If ``test_indices`` is not supplied.
    """
    if test_indices is None:
        raise ValueError("test_indices must be provided (positional rows of processed_text)")
    if factor is None:
        # Built per call: a default evaluated in the signature is frozen to the
        # size df_train had when the function was defined.
        factor = np.ones((df_train.shape[0], 1))

    # The selected rows do not depend on the component, so pick them once.
    test_rows = processed_text.iloc[test_indices]

    component_similarity_scores = []
    for component in components:
        artefacts = saved_models[component]
        topic_vectors_train = artefacts["topic_vectors_train"]
        vectorizer = artefacts["vectorizer"]
        model = artefacts["model"]

        def _similarity_for(lemmas):
            """Vectorize processed text, project it to topics and score it."""
            term_vectors = text_vectorizer(lemmas, vectorizer)
            topic_vectors = get_topic_vectors(term_vectors, model)
            return get_similarity_score(topic_vectors, topic_vectors_train, factor=factor)

        similarity_scores = _similarity_for(test_rows.article_body)
        if include_headings:
            similarity_scores = (
                heading_weightage * _similarity_for(test_rows.heading)
                + (1 - heading_weightage) * similarity_scores
            )
        component_similarity_scores.append(similarity_scores)
    return component_similarity_scores

In [79]:
%%timeit
# Times one ensemble scoring pass (body + heading, every entry of
# components_saved) for a single randomly chosen test article.
import random
test_indices = random.sample(range(df_test.shape[0]), 1)
# test_indices = [0]
# All-ones factor: every training article's score is left unscaled.
factor=np.ones((df_train.shape[0])).reshape(-1,1)
# Eight values, matching the eight entries of components_saved. They are only
# referenced by the commented-out np.average call below, so they are unused
# while this cell is just being timed.
weights = [
        0.0392685,
        0.09838475,
        0.04760199,
        0.05147573,
        0.04382252,
        0.0741635,
        0.04844888,
        0.59683412,
    ]
# weights = np.random.dirichlet(np.ones(8),size=1).reshape(8,)
# factor = get_article_importance_day_wise(growth=1000).values.reshape(-1,1)
component_similarity_scores = ensemble_similarity_scores(components=components_saved,include_headings=True,heading_weightage=0.75,test_indices=test_indices,factor=factor)
# similarity_scores = np.average(np.array(component_similarity_scores),axis=0,weights=weights)
# heading, result = get_similar_articles(test_indices,similarity_scores,top_n_values=10)
# print(heading)
# display(result)

502 ms ± 52.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [52]:
# Line-profile ensemble_similarity_scores on one randomly chosen test article.
# %lprun is provided by the line_profiler extension, which must already be loaded.
# NOTE(review): the recorded profile below still lists a text_pipeline call, so
# it appears to come from an older version of the function than the one defined above.
import random
test_indices = random.sample(range(df_test.shape[0]), 1)
# test_indices = [0]
# All-ones factor: every training article's score is left unscaled.
factor=np.ones((df_train.shape[0])).reshape(-1,1)
# Not passed to the profiled call; ensemble_similarity_scores takes no weights.
weights = [
        0.0392685,
        0.09838475,
        0.04760199,
        0.05147573,
        0.04382252,
        0.0741635,
        0.04844888,
        0.59683412,
    ]
%lprun -f ensemble_similarity_scores ensemble_similarity_scores(components=components_saved,include_headings=True,heading_weightage=0.75,test_indices=test_indices,factor=factor)

Timer unit: 1e-07 s

Total time: 8.75041 s
File: C:\Users\rvams\AppData\Local\Temp/ipykernel_23776/2654383235.py
Function: ensemble_similarity_scores at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def ensemble_similarity_scores(components,include_headings=False,heading_weightage=0.6,test_indices=None,factor=np.ones((df_train.shape[0])).reshape(-1,1)):
     2         1         27.0     27.0      0.0      component_similarity_scores = []
     3         9         95.0     10.6      0.0      for component in components:
     4         8        202.0     25.2      0.0          topic_vectors_train,vectorizer,model = saved_models[component]["topic_vectors_train"],saved_models[component]["vectorizer"],saved_models[component]["model"]
     5         8   40576782.0 5072097.8     46.4          test_lemmas = text_pipeline(df_test.iloc[test_indices].article_body)
     6         8     154899.0  19362.4      0.2          lemm