In [5]:
import pandas as pd
# import spacy
import pickle
import os
import random
import numpy as np
from numpy.linalg import norm
from datetime import datetime

# Show the working directory: the relative "data/..." paths used below are resolved against it.
os.getcwd()

'd:\\OneDrive - Universiteit Utrecht\\PER3_PersonalizationForPublicMedia\\assignment2\\General\\Code'

In [21]:
# Cleaned article table, stored as a pickle.
# NOTE(review): pickle.load can run arbitrary code from the file -- only load files you trust.
with open("data/articles_cleaned.pkl", "rb") as file:
    articles = pickle.load(file)

# Topic table; the 'Unnamed: 0' column (presumably a saved index -- confirm) is discarded.
articles_topics = pd.read_csv('data/articles_topics.csv').drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# Persona table without its 'persona' column.
personas = pd.read_csv('data/personas.csv').drop(
    columns=['persona'], errors='ignore'
)

# Article scores per persona, again without the 'Unnamed: 0' column.
article_scores_personas = pd.read_csv('data/article_scores_personas.csv').drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# Embedding table, used as read (no column is removed here).
articles_embeddings = pd.read_csv('data/articles_embeddings.csv')

In [7]:
def calculate_articlesInterest():
    """Score every article by how well its topics match the user's interests.

    Reads the module-level ``articles_topics`` frame (one row per article) and
    the module-level ``user`` Series, of which only the ``interest_*`` entries
    are used. An article's score is the dot product of its topic row with the
    interest vector, so the topic columns must be in the same order as the
    ``interest_*`` entries -- TODO confirm against the CSV.

    Returns:
        pd.Series: one float score per article, indexed like
        ``articles_topics`` so it aligns by label when stored elsewhere.
    """
    # Only the interest entries of the profile take part in this score.
    interest_cols = user.index[user.index.str.startswith('interest_')]
    # Cast to float: a profile built without an explicit dtype can be object-typed.
    user_vector = user[interest_cols].to_numpy(dtype=float)

    articles_matrix = articles_topics.to_numpy(dtype=float)

    # One dot product per article row.
    scores = np.dot(articles_matrix, user_vector)

    # Keep the article labels instead of a positional 0..n-1 index.
    return pd.Series(scores, index=articles_topics.index)

In [8]:
def calculate_articlesTimeliness(currentArticle_index):
    """Score every article by how close its date is to the current article's.

    Reads the ``datenumber`` column of the module-level ``articles`` frame.
    The absolute date difference to the current article is divided by the
    largest ``datenumber`` in the frame and then inverted, so an article with
    the same date scores 1.0 and scores drop as the gap grows. For a frame
    sorted by date this matches dividing by the last row's ``datenumber``.

    Args:
        currentArticle_index: index label (in ``articles``) of the article
            that is currently being read.

    Returns:
        pd.Series: one float score per article, indexed like ``articles``.

    Raises:
        KeyError: if ``currentArticle_index`` is not a label of ``articles``.
    """
    datenumbers = articles['datenumber'].to_numpy(dtype=float)
    currentDate = float(articles.loc[currentArticle_index, 'datenumber'])

    # Distance in date numbers between each article and the current one.
    date_gaps = np.abs(datenumbers - currentDate)

    # Use the maximum rather than the last row so the result does not depend
    # on the frame being sorted by date.
    latestdate = datenumbers.max()
    if latestdate == 0:
        # Nothing to scale by; avoid 0/0 and treat every article as equally timely.
        normalized_gaps = np.zeros_like(date_gaps)
    else:
        normalized_gaps = date_gaps / latestdate

    # Large gaps should score low, so invert.
    datescores = np.abs(1 - normalized_gaps)

    return pd.Series(datescores, index=articles.index)

# Demo: timeliness of every article relative to the article labelled 22585.
calculate_articlesTimeliness(22585)

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
22581    1.0
22582    1.0
22583    1.0
22584    1.0
22585    1.0
Length: 22586, dtype: float64

In [9]:
def calculate_articlesSimilarity(currentArticle_index):
    """Cosine similarity between every article's topic vector and the current one's.

    Reads the module-level ``articles_topics`` frame (one row per article).

    Args:
        currentArticle_index: index label (in ``articles_topics``) of the
            article that is currently being read.

    Returns:
        pd.Series: one float similarity per article, indexed like
        ``articles_topics``. Pairs in which either vector has zero length get
        0.0 instead of NaN.

    Raises:
        KeyError: if ``currentArticle_index`` is not a label of
            ``articles_topics``.
    """
    articles_matrix = articles_topics.to_numpy(dtype=float)
    currentArticle_vector = articles_topics.loc[currentArticle_index].to_numpy(dtype=float)

    # Numerator and denominator of the cosine similarity, one entry per article.
    dot_products = np.dot(articles_matrix, currentArticle_vector)
    norm_products = np.linalg.norm(articles_matrix, axis=1) * np.linalg.norm(currentArticle_vector)

    # Divide only where the denominator is non-zero; the rest stays at 0.0.
    cosinesims = np.zeros_like(dot_products)
    np.divide(dot_products, norm_products, out=cosinesims, where=norm_products != 0)

    return pd.Series(cosinesims, index=articles_topics.index)

# Demo: similarity of every article to the article labelled 1.
calculate_articlesSimilarity(1)

0        0.423152
1        1.000000
2        0.460118
3        0.000000
4        0.000000
           ...   
22581    0.261581
22582    0.357856
22583    0.567441
22584    0.502268
22585    0.637431
Length: 22586, dtype: float64

In [10]:
def calculate_articlesDistressed():
    """Score every article by how little it touches topics that distress the user.

    Reads the module-level ``articles_topics`` frame (one row per article) and
    the module-level ``user`` Series, of which only the ``distressed_*``
    entries are used. The dot product of a topic row with the distress vector
    is subtracted from 1, so more distressing articles score lower. The topic
    columns must be in the same order as the ``distressed_*`` entries --
    TODO confirm against the CSV.

    Returns:
        pd.Series: one float score per article, indexed like
        ``articles_topics``.
    """
    # Only the distress entries of the profile take part in this score.
    distressed_cols = user.index[user.index.str.startswith('distressed_')]
    # Cast to float: a profile built without an explicit dtype can be object-typed.
    user_vector = user[distressed_cols].to_numpy(dtype=float)

    articles_matrix = articles_topics.to_numpy(dtype=float)
    distress_scores = np.dot(articles_matrix, user_vector)

    # Distress must lower a recommendation, hence the inversion.
    return 1 - pd.Series(distress_scores, index=articles_topics.index)

In [48]:
def calculate_articlesGuard():
    """Score every article by how little it matches the user's guard settings.

    Reads the module-level ``articles_embeddings`` frame (one row per article)
    and the module-level ``user`` Series, of which only the ``guard_*``
    entries are used. The embedding values are divided by 6, dotted with the
    guard vector and subtracted from 1, so articles that match guarded
    subjects score lower. The embedding columns must line up one-to-one with
    the ``guard_*`` entries -- TODO confirm against the CSV.

    Returns:
        pd.Series: one float score per article, indexed like
        ``articles_embeddings``.
    """
    # NOTE(review): 6 presumably equals the number of guard_* entries, keeping
    # the dot product small -- confirm before changing the guard list.
    embedding_divisor = 6
    articles_matrix = articles_embeddings.to_numpy(dtype=float) / embedding_divisor

    # Only the guard entries of the profile take part in this score.
    guard_cols = user.index[user.index.str.startswith('guard_')]
    # Cast to float: a profile built without an explicit dtype can be object-typed.
    user_vector = user[guard_cols].to_numpy(dtype=float)

    guard_scores = np.dot(articles_matrix, user_vector)

    # Guarded subjects must lower a recommendation, hence the inversion.
    return 1 - pd.Series(guard_scores, index=articles_embeddings.index)


In [57]:
def calculate_articlesColab():
    """Score every article through the personas the user resembles.

    Reads three module-level objects: ``personas`` (one row per persona),
    ``user`` (the full profile vector, which must have as many entries as
    ``personas`` has columns) and ``article_scores_personas`` (one row per
    article, one column per persona).

    The cosine similarity between the user and each persona is computed first.
    The per-persona article scores, divided by 5, are then combined using
    those similarities as weights.

    Returns:
        pd.Series: one float score per article, indexed like
        ``article_scores_personas``. A persona (or user) vector of zero length
        contributes a similarity of 0.0 instead of NaN.
    """
    personas_matrix = personas.to_numpy(dtype=float)
    user_vector = user.to_numpy(dtype=float)

    # Cosine similarity between the user and every persona.
    dot_products = np.dot(personas_matrix, user_vector)
    norm_products = np.linalg.norm(personas_matrix, axis=1) * np.linalg.norm(user_vector)
    cosinesims = np.zeros_like(dot_products)
    # Divide only where the denominator is non-zero; the rest stays at 0.0.
    np.divide(dot_products, norm_products, out=cosinesims, where=norm_products != 0)

    # NOTE(review): 5 is presumably the number of personas (or the top of the
    # score scale) -- confirm before changing the persona set.
    articles_matrix = article_scores_personas.to_numpy(dtype=float) / 5

    # Weight each persona's article scores by the user's similarity to it.
    colab_scores = np.dot(articles_matrix, cosinesims)

    return pd.Series(colab_scores, index=article_scores_personas.index)


In [65]:
def recommend(currentArticle, weights = [1/6, 1/6, 1/6, 1/6, 1/6, 1/6], k = 3):
    """Return the ``k`` best-scoring articles to show next to the current one.

    Fills the six component columns of the module-level ``article_scores``
    frame, combines them into a weighted ``total`` column and returns the
    highest totals, leaving out the article that is currently being read.

    Args:
        currentArticle: index label of the article currently being read.
        weights: one weight per component, in the order interest, timeliness,
            similarity, distress, guard, colab. A shorter sequence weights only
            the leading components. The default list is only read, never
            modified.
        k: number of recommendations to return.

    Returns:
        pd.Series: the ``k`` highest ``total`` scores, indexed by article label.

    Raises:
        ValueError: if more weights than components are given.
        KeyError: if ``currentArticle`` is not a label of ``article_scores``.
    """
    components = ['interest', 'timeliness', 'similarity', 'distress', 'guard', 'colab']
    if len(weights) > len(components):
        raise ValueError(
            f"expected at most {len(components)} weights, got {len(weights)}"
        )

    # Each scorer returns one value per article.
    article_scores['interest'] = calculate_articlesInterest()
    article_scores['timeliness'] = calculate_articlesTimeliness(currentArticle)
    article_scores['similarity'] = calculate_articlesSimilarity(currentArticle)
    article_scores['distress'] = calculate_articlesDistressed()
    article_scores['guard'] = calculate_articlesGuard()
    article_scores['colab'] = calculate_articlesColab()

    # Select the components by name so an existing 'total' column (present
    # after the first call) can never be weighted in by accident.
    weighted_cols = components[:len(weights)]
    scores_weighted = article_scores[weighted_cols].mul(
        np.asarray(weights, dtype=float), axis='columns'
    )
    # Vectorised row sum; like the row-wise version it skips NaN entries.
    article_scores['total'] = scores_weighted.sum(axis=1)

    recommended = article_scores['total'].sort_values(ascending = False)
    recommended = recommended.drop(labels = currentArticle)

    # .iloc makes the slice explicitly positional; plain [:k] on an
    # integer-labelled Series triggers a pandas warning about its meaning.
    return recommended.iloc[:k]


In [67]:
# Stand-in for the interest answers of the popup: one random value per
# interest topic until the real popup values are wired in.
interest_cols = [
    'interest_EUPolitics', 'interest_crimes', 'interest_israelPalestine',
    'interest_immigration', 'interest_sports', 'interest_war',
    'interest_climateChange', 'interest_showArts', 'interest_covid',
    'interest_britishBrexit', 'interest_instAbuse', 'interest_spaceTravel',
    'interest_protests', 'interest_terrorism', 'interest_USPolitics',
    'interest_naturalDisasters', 'interest_elections', 'interest_economy',
]
popup_interest = pd.Series(data=np.random.rand(len(interest_cols)), index=interest_cols)

# Stand-in for the guard answers of the popup, filled the same way.
guard_cols = [
    'guard_suicide', 'guard_accidents', 'guard_selfHarm',
    'guard_depression', 'guard_racism', 'guard_eatingDisorders',
]
popup_guard = pd.Series(data=np.random.rand(len(guard_cols)), index=guard_cols)

def newUser():
    """Create a fresh user profile and print the first recommendations.

    Rebinds two module-level names:

    * ``user`` -- a float Series with ``interest_*``, ``distressed_*`` and
      ``guard_*`` entries. Interests and guards are copied from the
      module-level ``popup_interest`` / ``popup_guard`` Series (matched by
      label); the distress entries start from fixed values (mean survey
      responses, according to the original comment).
    * ``article_scores`` -- a zero-filled frame with one row per article in
      ``articles`` and one column per score component.

    It then looks up a fixed start article by URL and prints
    ``recommend(...)`` for it.

    Raises:
        IndexError: if the start article's URL is not present in ``articles``.
    """
    global user, article_scores

    # NOTE(review): 'distressed_protest' lacks the trailing 's' that
    # 'interest_protests' has. Nothing in this function depends on the
    # spelling (entries are selected by prefix), so it is left as is.
    allCols = [
        'interest_EUPolitics', 'interest_crimes', 'interest_israelPalestine', 'interest_immigration', 'interest_sports', 'interest_war',
        'interest_climateChange', 'interest_showArts', 'interest_covid', 'interest_britishBrexit', 'interest_instAbuse', 'interest_spaceTravel',
        'interest_protests', 'interest_terrorism', 'interest_USPolitics', 'interest_naturalDisasters', 'interest_elections', 'interest_economy',
        'distressed_EUPolitics', 'distressed_crimes', 'distressed_israelPalestine', 'distressed_immigration', 'distressed_sports', 'distressed_war',
        'distressed_climateChange', 'distressed_showArts', 'distressed_covid', 'distressed_britishBrexit', 'distressed_instAbuse', 'distressed_spaceTravel',
        'distressed_protest', 'distressed_terrorism', 'distressed_USPolitics', 'distressed_naturalDisasters', 'distressed_elections', 'distressed_economy',
        'guard_suicide', 'guard_accidents', 'guard_selfHarm', 'guard_depression', 'guard_racism', 'guard_eatingDisorders',
    ]
    # Explicit dtype: without it pandas warns and, depending on the version,
    # creates an object-typed Series that slows down the numeric scoring.
    user = pd.Series(index = allCols, dtype = float)

    interest_cols = user.index[user.index.str.startswith('interest_')]
    distressed_cols = user.index[user.index.str.startswith('distressed_')]
    guard_cols = user.index[user.index.str.startswith('guard_')]

    # Initialise from the popup answers and the fixed distress starting values.
    user[interest_cols] = popup_interest
    user[distressed_cols] = [0.30492753623188407, 0.4394202898550725, 0.30818840579710144, 0.32768115942028986, 0.06818840579710145, 0.4698550724637681, 0.5271739130434783, 0.09978260869565217, 0.40050724637681157, 0.12144927536231885, 0.35594202898550725, 0.0722463768115942, 0.3279710144927536, 0.5227536231884058, 0.3071014492753623, 0.49130434782608695, 0.26057971014492753, 0.3505797101449275]
    user[guard_cols] = popup_guard

    article_scores = pd.DataFrame(0, index = articles.index, columns = ['interest', 'timeliness', 'similarity', 'distress', 'guard', 'colab'])

    # The article the session starts on, located by its URL.
    start_url = 'https://nos.nl/artikel/2467501-dodental-ongeluk-met-veerboot-in-gabon-loopt-op-tot-21'
    matching_labels = articles.index[articles['url'] == start_url]
    if len(matching_labels) == 0:
        raise IndexError(f"start article not found in articles: {start_url}")
    currentArticle = int(matching_labels[0])

    print(recommend(currentArticle))


# Build the initial profile and score table, and print the first recommendations.
newUser()
# Leftover inspection helpers, kept commented out:
#personas
# print(user)
# article_scores

  user = pd.Series(index = allCols)


22418    0.645447
22248    0.642494
22252    0.639297
Name: total, dtype: float64


  return recommended[:k] # output k articles from the top


In [98]:
def updateLike(likevalue, currentArticle_index, weight = 0.05):
    """Shift the user's interest scores after a like or dislike.

    Each ``interest_*`` entry of the module-level ``user`` Series is moved by
    ``topic value * likevalue * weight``, using the article's row in the
    module-level ``articles_topics`` frame, and the result is clamped to
    [0, 1]. ``user`` is modified in place.

    Args:
        likevalue: 1 for a like, -1 for a dislike.
        currentArticle_index: index label (in ``articles_topics``) of the
            rated article.
        weight: how strongly a single rating moves the interest scores.

    Returns:
        None
    """
    interest_cols = user.index[user.index.str.startswith('interest_')]

    topic_values = articles_topics.loc[currentArticle_index].to_numpy(dtype=float)
    current_interests = user[interest_cols].to_numpy(dtype=float)

    # Sign comes from like/dislike, size from the weight.
    adjustments = topic_values * likevalue * weight

    # Interest scores must stay within [0, 1].
    user[interest_cols] = np.clip(current_interests + adjustments, 0.0, 1.0)

    return None

# Demo: print the interest scores before and after disliking article 10,
# followed by that article's topic values for comparison.
# Note that this changes the global `user` profile used by later cells.
interest_cols = user.index[user.index.str.startswith('interest_')]
print(user[interest_cols])
updateLike(-1, 10)
print(user[interest_cols])
print(articles_topics.loc[10])


interest_EUPolitics          0.420677
interest_crimes              0.496259
interest_israelPalestine     0.423356
interest_immigration         0.312063
interest_sports              0.255396
interest_war                 0.401650
interest_climateChange       0.236670
interest_showArts            0.686719
interest_covid               0.640178
interest_britishBrexit       0.657202
interest_instAbuse           0.828107
interest_spaceTravel         0.831905
interest_protests            0.073675
interest_terrorism           0.510401
interest_USPolitics          0.537962
interest_naturalDisasters    0.423680
interest_elections           0.506276
interest_economy             0.577332
dtype: float64
interest_EUPolitics          0.420677
interest_crimes              0.496259
interest_israelPalestine     0.423356
interest_immigration         0.312063
interest_sports              0.237629
interest_war                 0.393333
interest_climateChange       0.236670
interest_showArts            0.6867

In [99]:
def updateClick(clickedArticle_index, weight = 0.01):
    """Nudge the user's interest scores towards a clicked article's topics.

    Every ``interest_*`` entry of the module-level ``user`` Series is raised
    by ``topic value * weight``, taken from the article's row in the
    module-level ``articles_topics`` frame, and kept within [0, 1].
    ``user`` is modified in place.

    Args:
        clickedArticle_index: index label (in ``articles_topics``) of the
            clicked article.
        weight: how strongly a single click moves the interest scores.

    Returns:
        None
    """
    interest_cols = user.index[user.index.str.startswith('interest_')]

    # Per-topic increment caused by this click.
    nudges = articles_topics.loc[clickedArticle_index].to_numpy(dtype=float) * weight

    # Add the increments and clamp to the valid interest range.
    raised = nudges + user[interest_cols].to_numpy(dtype=float)
    user[interest_cols] = np.clip(raised, 0.0, 1.0)

    return None

# Demo: print the interest scores before and after a click on article 10,
# followed by that article's topic values for comparison.
# Note that this changes the global `user` profile used by later cells.
interest_cols = user.index[user.index.str.startswith('interest_')]
print(user[interest_cols])
updateClick(10)
print(user[interest_cols])
print(articles_topics.loc[10])

interest_EUPolitics          0.420677
interest_crimes              0.496259
interest_israelPalestine     0.423356
interest_immigration         0.312063
interest_sports              0.237629
interest_war                 0.393333
interest_climateChange       0.236670
interest_showArts            0.686719
interest_covid               0.640178
interest_britishBrexit       0.657202
interest_instAbuse           0.828107
interest_spaceTravel         0.831905
interest_protests            0.073675
interest_terrorism           0.510401
interest_USPolitics          0.531319
interest_naturalDisasters    0.418327
interest_elections           0.506276
interest_economy             0.577332
dtype: float64
interest_EUPolitics          0.420677
interest_crimes              0.496259
interest_israelPalestine     0.423356
interest_immigration         0.312063
interest_sports              0.241183
interest_war                 0.394996
interest_climateChange       0.236670
interest_showArts            0.6867

In [100]:
def updateDistress(ratedArticle_index, rating, weight = 0.05):
    """Blend a distress rating for an article into the user's distress scores.

    The rating is divided by 100 and multiplied with the article's topic
    values (its row in the module-level ``articles_topics`` frame). For every
    topic where that product is non-zero, the matching ``distressed_*`` entry
    of the module-level ``user`` Series becomes
    ``weight * product + (1 - weight) * old value``; the other entries keep
    their value. ``user`` is modified in place.

    NOTE(review): a rating of 0 makes every product zero, so it leaves the
    profile untouched rather than lowering distress -- confirm this is intended.

    Args:
        ratedArticle_index: index label (in ``articles_topics``) of the rated
            article.
        rating: distress rating; it is divided by 100 before use.
        weight: how strongly a single rating moves the distress scores.

    Returns:
        None
    """
    distressed_cols = user.index[user.index.str.startswith('distressed_')]

    # Topic values scaled by the rating.
    scaled_rating = rating / 100
    topic_ratings = articles_topics.loc[ratedArticle_index].to_numpy(dtype=float) * scaled_rating

    previous = user[distressed_cols].to_numpy(dtype=float)
    blended = weight * topic_ratings + (1 - weight) * previous

    # Entries whose scaled topic value is zero are left unchanged.
    user[distressed_cols] = np.where(topic_ratings == 0, previous, blended)

    return None

# Demo: print the distress scores before and after rating article 10 with 90,
# followed by that article's topic values for comparison.
# Note that this changes the global `user` profile used by later cells.
distressed_cols = user.index[user.index.str.startswith('distressed_')]
print(user[distressed_cols])
updateDistress(10, 90)
print(user[distressed_cols])
print(articles_topics.loc[10])

distressed_EUPolitics          0.304928
distressed_crimes              0.439420
distressed_israelPalestine     0.308188
distressed_immigration         0.327681
distressed_sports              0.068188
distressed_war                 0.469855
distressed_climateChange       0.527174
distressed_showArts            0.099783
distressed_covid               0.400507
distressed_britishBrexit       0.121449
distressed_instAbuse           0.355942
distressed_spaceTravel         0.072246
distressed_protest             0.327971
distressed_terrorism           0.522754
distressed_USPolitics          0.307101
distressed_naturalDisasters    0.491304
distressed_elections           0.260580
distressed_economy             0.350580
dtype: float64
distressed_EUPolitics          0.304928
distressed_crimes              0.439420
distressed_israelPalestine     0.308188
distressed_immigration         0.327681
distressed_sports              0.080769
distressed_war                 0.453848
distressed_climateChange 