## News recommender using KNN.

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
## creating dataset, 0 represents missing values
num_movie, num_user = 10, 6

userlist = [f'user_{i}' for i in range(1, num_user+1)]
mvlist = [f'news_{i}' for i in range(1, num_movie+1)]
df = pd.DataFrame(np.random.randint(0, 5 , size=(num_movie, num_user)), columns=userlist, index=mvlist)
df

Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6
news_1,0,4,4,1,2,1
news_2,0,1,1,3,0,4
news_3,3,0,1,0,1,2
news_4,2,2,2,0,0,3
news_5,0,1,3,1,1,1
news_6,1,4,4,3,0,0
news_7,3,4,0,2,4,4
news_8,4,0,1,4,2,0
news_9,1,4,3,3,3,1
news_10,4,0,2,1,1,2


In [3]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(df.values)
distances, indices = knn.kneighbors(df.values, n_neighbors=3)

In [4]:
# copy df
df1 = df.copy()

# find the nearest neighbors using NearestNeighbors(n_neighbors=3)
number_neighbors = 3
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(df.values)
distances, indices = knn.kneighbors(df.values, n_neighbors=number_neighbors)

# convert user_name to user_index
user_index = df.columns.tolist().index('user_4')

# t: movie_title, m: the row number of t in df
for m,t in list(enumerate(df.index)):
  
  # find movies without ratings by user_4
  if df.iloc[m, user_index] == 0:
    sim_movies = indices[m].tolist()
    movie_distances = distances[m].tolist()
    
    # Generally, this is the case: indices[3] = [3 6 7]. The movie itself is in the first place.
    # In this case, we take off 3 from the list. Then, indices[3] == [6 7] to have the nearest NEIGHBORS in the list. 
    if m in sim_movies:
      id_movie = sim_movies.index(m)
      sim_movies.remove(m)
      movie_distances.pop(id_movie) 

    # However, if the percentage of ratings in the dataset is very low, there are too many 0s in the dataset. 
    # Some movies have all 0 ratings and the movies with all 0s are considered the same movies by NearestNeighbors(). 
    # Then,even the movie itself cannot be included in the indices. 
    # For example, indices[3] = [2 4 7] is possible if movie_2, movie_3, movie_4, and movie_7 have all 0s for their ratings.
    # In that case, we take off the farthest movie in the list. Therefore, 7 is taken off from the list, then indices[3] == [2 4].
    else:
      sim_movies = sim_movies[:n_neighbors-1]
      movie_distances = movie_distances[:n_neighbors-1]
        
    # movie_similarty = 1 - movie_distance    
    movie_similarity = [1-x for x in movie_distances]
    movie_similarity_copy = movie_similarity.copy()
    nominator = 0

    # for each similar movie
    for s in range(0, len(movie_similarity)):
      
      # check if the rating of a similar movie is zero
      if df.iloc[sim_movies[s], user_index] == 0:

        # if the rating is zero, ignore the rating and the similarity in calculating the predicted rating
        if len(movie_similarity_copy) == (number_neighbors - 1):
          movie_similarity_copy.pop(s)
          
        else:
          movie_similarity_copy.pop(s-(len(movie_similarity)-len(movie_similarity_copy)))

      # if the rating is not zero, use the rating and similarity in the calculation
      else:
        nominator = nominator + movie_similarity[s]*df.iloc[sim_movies[s],user_index]

    # check if the number of the ratings with non-zero is positive
    if len(movie_similarity_copy) > 0:
      
      # check if the sum of the ratings of the similar movies is positive.
      if sum(movie_similarity_copy) > 0:
        predicted_r = nominator/sum(movie_similarity_copy)

      # Even if there are some movies for which the ratings are positive, some movies have zero similarity even though they are selected as similar movies.
      # in this case, the predicted rating becomes zero as well  
      else:
        predicted_r = 0

    # if all the ratings of the similar movies are zero, then predicted rating should be zero
    else:
      predicted_r = 0

  # place the predicted rating into the copy of the original dataset
    df1.iloc[m,user_index] = predicted_r

In [5]:
def recommend_news(user, num_recommended_news):

  print('The list of the news {} has read \n'.format(user))

  for m in df[df[user] > 0][user].index.tolist():
    print(m)
  
  print('\n')

  recommended_news = []

  for m in df[df[user] == 0].index.tolist():

    index_df = df.index.tolist().index(m)
    predicted_rating = df1.iloc[index_df, df1.columns.tolist().index(user)]
    recommended_news.append((m, predicted_rating))

  sorted_rm = sorted(recommended_news, key=lambda x:x[1], reverse=True)
  
  print('The list of the Recommended Movies \n')
  rank = 1
  for recommended_movie in sorted_rm[:num_recommended_news]:
    
    print('{}: {} - predicted rating:{}'.format(rank, recommended_news[0], recommended_news[1]))
    rank = rank + 1

In [6]:
recommend_news('user_4', 5)

The list of the news user_4 has read 

news_1
news_2
news_5
news_6
news_7
news_8
news_9
news_10


The list of the Recommended Movies 

1: ('news_3', 1) - predicted rating:('news_4', 1)
2: ('news_3', 1) - predicted rating:('news_4', 1)


In [7]:
# store the original dataset in 'df', and create the copy of df, df1 = df.copy().
def news_recommender(user, num_neighbors, num_recommendation):

  number_neighbors = num_neighbors

  knn = NearestNeighbors(metric='cosine', algorithm='brute')
  knn.fit(df.values)
  distances, indices = knn.kneighbors(df.values, n_neighbors=number_neighbors)

  user_index = df.columns.tolist().index(user)

  for m,t in list(enumerate(df.index)):
    if df.iloc[m, user_index] == 0:
      sim_news = indices[m].tolist()
      news_distances = distances[m].tolist()
    
      if m in sim_news:
        id_movie = sim_news.index(m)
        sim_news.remove(m)
        news_distances.pop(id_movie) 

      else:
        sim_news = sim_news[:n_neighbors-1]
        news_distances = news_distances[:n_neighbors-1]
           
      news_similarity = [1-x for x in news_distances]
      news_similarity_copy = news_similarity.copy()
      nominator = 0

      for s in range(0, len(news_similarity)):
        if df.iloc[sim_news[s], user_index] == 0:
          if len(news_similarity_copy) == (number_neighbors - 1):
            news_similarity_copy.pop(s)
          
          else:
            news_similarity_copy.pop(s-(len(news_similarity)-len(news_similarity_copy)))
            
        else:
          nominator = nominator + news_similarity[s]*df.iloc[sim_news[s],user_index]
          
      if len(news_similarity_copy) > 0:
        if sum(news_similarity_copy) > 0:
          predicted_r = nominator/sum(news_similarity_copy)
        
        else:
          predicted_r = 0

      else:
        predicted_r = 0
        
      df1.iloc[m,user_index] = predicted_r
    recommend_news(user, num_recommendation)

In [8]:
news_recommender(user='user_6', num_neighbors=7, num_recommendation=5)

The list of the news user_6 has read 

news_1
news_2
news_3
news_4
news_5
news_7
news_9
news_10


The list of the Recommended Movies 

1: ('news_6', 0) - predicted rating:('news_8', 0)
2: ('news_6', 0) - predicted rating:('news_8', 0)
The list of the news user_6 has read 

news_1
news_2
news_3
news_4
news_5
news_7
news_9
news_10


The list of the Recommended Movies 

1: ('news_6', 0) - predicted rating:('news_8', 0)
2: ('news_6', 0) - predicted rating:('news_8', 0)
The list of the news user_6 has read 

news_1
news_2
news_3
news_4
news_5
news_7
news_9
news_10


The list of the Recommended Movies 

1: ('news_6', 0) - predicted rating:('news_8', 0)
2: ('news_6', 0) - predicted rating:('news_8', 0)
The list of the news user_6 has read 

news_1
news_2
news_3
news_4
news_5
news_7
news_9
news_10


The list of the Recommended Movies 

1: ('news_6', 0) - predicted rating:('news_8', 0)
2: ('news_6', 0) - predicted rating:('news_8', 0)
The list of the news user_6 has read 

news_1
news_2
news_3
