In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

In [2]:
articles_ = pd.read_csv('articles_community.csv')
interactions_ = pd.read_csv('user-item-interactions.csv')
del articles_['Unnamed: 0']
del interactions_['Unnamed: 0']

## I. Exploratory Data Analysis

Take a look at the data and use your findings to fill in the dictionary below with the correct responses to show your understanding of the data.

In [3]:
articles_.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,doc_status,article_id
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,Live,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,Live,1
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",Live,2
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,Live,3
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,Live,4


In [4]:
interactions_.head()

Unnamed: 0,article_id,title,email
0,1430.0,"using pixiedust for fast, flexible, and easier...",ef5f11f77ba020cd36e1105a00ab868bbdbf7fe7
1,1314.0,healthcare python streaming application demo,083cbdfa93c8444beaa4c5f5e0f5f9198e4f9e0b
2,1429.0,use deep learning for image classification,b96a4f2e92d8572034b1e9b28f9ac673765cd074
3,1338.0,ml optimization using cognitive assistant,06485706b34a5c9bf2a0ecdac41daf7e7654ceb7
4,1276.0,deploy your python model as a restful api,f01220c46fc92c6e6b161b1849de11faacd7ccb2


In [5]:
# number of movies
print("The number of articles is {}.".format(articles_.shape[0]))

# number of ratings
print("The number of interactions is {}.".format(interactions_.shape[0]))

# unique users
print("The number of unique users is {}.".format(interactions_.email.nunique()))

# missing interactions
print("The number of missing interactions is {}.".format(int(interactions_.email.isnull().mean()*interactions_.shape[0])))

The number of articles is 1056.
The number of interactions is 45993.
The number of unique users is 5148.
The number of missing interactions is 17.


In [6]:
## No need to change the code here - this will be helpful for later parts of the notebook
# Run this cell to map the user email to a user_id column and remove the email column

def email_mapper():
    coded_dict = {}
    cter = 1
    email_encoded = []
    
    for val in interactions_['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
#del interactions['email']
interactions_['user_id'] = email_encoded

# show header
interactions_

Unnamed: 0,article_id,title,email,user_id
0,1430.0,"using pixiedust for fast, flexible, and easier...",ef5f11f77ba020cd36e1105a00ab868bbdbf7fe7,1
1,1314.0,healthcare python streaming application demo,083cbdfa93c8444beaa4c5f5e0f5f9198e4f9e0b,2
2,1429.0,use deep learning for image classification,b96a4f2e92d8572034b1e9b28f9ac673765cd074,3
3,1338.0,ml optimization using cognitive assistant,06485706b34a5c9bf2a0ecdac41daf7e7654ceb7,4
4,1276.0,deploy your python model as a restful api,f01220c46fc92c6e6b161b1849de11faacd7ccb2,5
...,...,...,...,...
45988,1324.0,ibm watson facebook posts for 2015,d21b998d7a4722310ceeaa3c6aaa181a36db2d73,5146
45989,142.0,neural networks for beginners: popular types a...,d21b998d7a4722310ceeaa3c6aaa181a36db2d73,5146
45990,233.0,bayesian nonparametric models – stats and bots,4faeed980a7cd11e0f3cf2058cc04daa2ef11452,5147
45991,1160.0,analyze accident reports on amazon emr spark,abbf639ba05daa5249c520e290283a6d726ba78d,5148


In [7]:
interactions_.email.value_counts()

2b6c0f514c2f2b04ad3c4583407dccd0810469ee    364
77959baaa9895a7e2bdc9297f8b27c1b6f2cb52a    363
2f5c7feae533ce046f2cb16fb3a29fe00528ed66    170
a37adec71b667b297ed2440a9ff7dad427c7ac85    169
8510a5010a5d4c89f5b07baac6de80cd12cfaf93    160
                                           ... 
308db8b36ed6a332c2eba4db83f73d2ef9161d64      1
0e42ebc8e6a6d26a3adc4fa7684db69aaabe01e5      1
98247b6240b3831804b0e04f341fc77517a2caf7      1
8ff6bfc629e1eb4f9160a6b1d5aa25c171dbbbb1      1
d48906d5eb86921deb4bd075a68dbb32690b1fa6      1
Name: email, Length: 5148, dtype: int64

In [8]:
interactions_[interactions_.email=='2b6c0f514c2f2b04ad3c4583407dccd0810469ee']

Unnamed: 0,article_id,title,email,user_id
28,362.0,dsx: hybrid mode,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
72,409.0,using github for project control in dsx,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
97,409.0,using github for project control in dsx,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
223,302.0,accelerate your workflow with dsx,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
255,409.0,using github for project control in dsx,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
...,...,...,...,...
23976,1162.0,analyze energy consumption in buildings,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
24384,1162.0,analyze energy consumption in buildings,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
24622,236.0,improving real-time object detection with yolo,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23
24750,302.0,accelerate your workflow with dsx,2b6c0f514c2f2b04ad3c4583407dccd0810469ee,23


In [9]:
describe_interactions=interactions_.groupby('article_id').count().sort_values('user_id', ascending=False).reset_index()
describe_interactions

Unnamed: 0,article_id,title,email,user_id
0,1429.0,937,937,937
1,1330.0,927,927,927
2,1431.0,671,671,671
3,1427.0,643,643,643
4,1364.0,627,627,627
...,...,...,...,...
709,1113.0,1,1,1
710,1119.0,1,1,1
711,984.0,1,1,1
712,1127.0,1,1,1


In [10]:
print('The number of unique users is : ',interactions_.user_id.nunique())
print('The maximum article across all interactions is: ',describe_interactions.max()['article_id'])
print('The average article across all interactions is: ',np.round(describe_interactions.user_id.mean(), 0))
print('The minimum article across all interactions is: ',describe_interactions.user_id.min())
print('The number of missing interactions is: ',int(interactions_.email.isnull().mean()*interactions_.shape[0]))
print('The number of articles in the dataset: ',articles_.shape[0])
print('The number of interactions in the dataset: ',interactions_.shape[0])

The number of unique users is :  5149
The maximum article across all interactions is:  1444.0
The average article across all interactions is:  64.0
The minimum article across all interactions is:  1
The number of missing interactions is:  17
The number of articles in the dataset:  1056
The number of interactions in the dataset:  45993


Dropping duplicates

In [11]:
sum(articles_.article_id.duplicated())

5

In [12]:
articles_.drop_duplicates(subset='article_id', inplace=True)

In [13]:
sum(articles_.article_id.duplicated())

0

## II. Rank Based Recommendations

**1. How To Find The Most Popular articles**

In [14]:
article_interactions = interactions_.groupby('article_id')['user_id']
avg_interactions = article_interactions.mean()
num_interactions = article_interactions.count()
title = pd.DataFrame(interactions_.title.value_counts()).reset_index()
# Add Dates
interaction_count_df = pd.DataFrame({'avg_interaction': avg_interactions, 'num_interactions': num_interactions})
#interaction_count_df = interaction_count_df.join(title)

article_recs = articles_.set_index('article_id').join(interaction_count_df)

# sort by top avg interaction and number of interactions
article_recs.sort_values(['avg_interaction', 'num_interactions'], ascending=False)


Unnamed: 0_level_0,doc_body,doc_description,doc_full_name,doc_status,avg_interaction,num_interactions
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
277,Skip navigation Sign in SearchLoading...\r\n\r...,This video shows you how to set up connections...,Work with Data Connections in DSX,Live,4109.285714,14.0
233,Homepage Stats and Bots Follow Sign in / Sign ...,An introduction to Bayesian Nonparametrics: th...,Bayesian Nonparametric Models – Stats and Bots,Live,4021.125000,16.0
96,Jump to navigation\r\n\r\n * Twitter\r\n * Lin...,Discover an open source machine learning platf...,Improving quality of life with Spark-empowered...,Live,3952.428571,7.0
60,RStudio Blog * Home\r\n\r\n * Subscribe to fee...,readr 1.0.0 is now available on CRAN. readr ma...,readr 1.0.0,Live,3914.812500,16.0
117,* Home\r\n * Community\r\n * Projects\r\n * Bl...,This post provides a brief summary of sample c...,Apache Spark™ 2.0: Migrating Applications,Live,3911.000000,10.0
...,...,...,...,...,...,...
1040,Although it is built around a JavaScript engin...,Although it is built around a JavaScript engin...,How I Stopped Worrying & Learned to Love the M...,Live,,
1041,Margriet Groenendijk Blocked Unblock Follow Fo...,Last week I attended the GeoPython conference ...,Mapping All the Things with Python – IBM Watso...,Live,,
1045,lA SPEED GUIDE TO REDIS LUA SCRIPTING\r\nShare...,Lua is a compact language which can be embedde...,A Speed Guide To Redis Lua Scripting,Live,,
1046,PouchDB-find is a new API and syntax that allo...,PouchDB uses MapReduce as its default search m...,A look under the covers of PouchDB-find,Live,,


In [15]:
def create_ranked_df(articles_, interactions_):
        '''
        INPUT
        movies - the articles dataframe
        reviews - the interactions dataframe
        
        OUTPUT
        ranked_articles - a dataframe with articles that are sorted by highest avg rating, more reviews
        and must have more than 4 ratings
        '''
        
        # Pull the average interactions and number of interactions for each article
        article_interactions = interactions_.groupby('article_id')['user_id']
        avg_interactions = article_interactions.mean()
        num_interactions = article_interactions.count()
        
        # Add Dates
        interaction_count_df = pd.DataFrame({'avg_interaction': avg_interactions, 'num_interactions': num_interactions})
        
        # merge with the movies dataset
        article_recs = articles_.set_index('article_id').join(interaction_count_df)

        # sort by top avg interaction and number of interactions
        ranked_articles = article_recs.sort_values(['avg_interaction', 'num_interactions'], ascending=False)

        # for edge cases - subset the article list to those with only 5 or more interactions
        ranked_articles = ranked_articles[ranked_articles['num_interactions'] > 4]
        
        return ranked_articles
    

def popular_recommendations(user_id, n_top, ranked_articles):
    '''
    INPUT:
    user_id - the user_id (str) of the individual you are making recommendations for
    n_top - an integer of the number recommendations you want back
    ranked_articles - a pandas dataframe of the already ranked articles based on avg rating, count, and time

    OUTPUT:
    top_articles - a list of the n_top recommended articles by article doc_full_name in order best to worst
    '''

    top_articles = list(ranked_articles['doc_full_name'][:n_top])

    return top_articles

In [16]:
ranked_articles = create_ranked_df(articles_, interactions_)

In [17]:
# Top 20 movies recommended for id 1
recs_20_for_1 = popular_recommendations('1', 20, ranked_articles)
recs_20_for_1

['Work with Data Connections in DSX',
 'Bayesian Nonparametric Models – Stats and Bots',
 'Improving quality of life with Spark-empowered machine learning',
 'readr 1.0.0',
 'Apache Spark™ 2.0: Migrating Applications',
 'Data science expert interview: Holden Karau',
 'Top analytics tools in 2016',
 'Data Science of Variable Selection',
 'Build a logistic regression model with WML & DSX',
 'Apache Spark @Scale: A 60 TB+ production use case',
 'Join and enrich data from multiple sources',
 'This Week in Data Science (July 26, 2016)',
 'Use data assets in a project using IBM Data Catalog',
 'Advancements in the Spark Community',
 'This Week in Data Science (November 01, 2016)',
 'xml2 1.0.0',
 'A guide to receptive field arithmetic for Convolutional Neural Networks',
 'Foundational Methodology for Data Science',
 'Finding the user in data science',
 '3 Scenarios for Machine Learning on Multicloud']

In [18]:
# Top 20 movies recommended by title
list(interactions_.groupby(by='title').count().sort_values(by='user_id', ascending=False).head(20).index)

['use deep learning for image classification',
 'insights from new york car accident reports',
 'visualize car data with brunel',
 'use xgboost, scikit-learn & ibm watson machine learning apis',
 'predicting churn with the spss random tree algorithm',
 'healthcare python streaming application demo',
 'finding optimal locations of new store using decision optimization',
 'apache spark lab, part 1: basic concepts',
 'analyze energy consumption in buildings',
 'gosales transactions for logistic regression model',
 'welcome to pixiedust',
 'customer demographics and sales',
 'total population by country',
 'deep learning with tensorflow course by big data university',
 'model bike sharing data with spss',
 'the nurse assignment problem',
 'classify tumors with machine learning',
 'analyze accident reports on amazon emr spark',
 'movie recommender system with spark machine learning',
 'putting a human face on machine learning']

In [19]:
# Top 5 movies recommended for id 53968
recs_5_for_53968 = popular_recommendations('53968', 5, ranked_articles)

# Top 100 movies recommended for id 70000
recs_100_for_70000 = popular_recommendations('70000', 100, ranked_articles)

# Top 35 movies recommended for id 43
recs_35_for_43 = popular_recommendations('43', 35, ranked_articles)

#### Part II: Adding Filters
Now that you have created a function to give back the title articles Who are we looking for, let's make it a bit more robust. Add arguments that will act as filters for the title.

In [20]:
def recs_filtered(formal_ed_str,lookfor=None):
    '''
    INPUT
        formal_ed_str - a string of one of the values from the title column
        lookfor - list of string Who are we looking for
    
    OUTPUT
        return res if the string is in lookfor
        return 0 otherwise
    
    '''
    if lookfor is not None:
        res = [formal_ed_str for x in lookfor if x in formal_ed_str] 
        if res:
            return res
        else:
            return 0
    return formal_ed_str

In [21]:
articles=[]
# Top 100 articles recommended for id 70000 with titles=[machine'] filter
recs_100_for_70000 = popular_recommendations('70000', 100, ranked_articles)
for i in recs_100_for_70000:
    recs_filtered_list=recs_filtered(i,['machine'])
    if recs_filtered_list!=0:
        if i not in articles:
            articles.append(i)
            print(recs_filtered_list)

['Improving quality of life with Spark-empowered machine learning']
['8 ways to turn data into value with Apache Spark machine learning']
['Apple, IBM add machine learning to partnership with Watson-Core ML coupling']
['Lifelong (machine) learning: how automation can help your models get smarter over time']


In [22]:
articles=[]
for i in range(0,interactions_.shape[0]):
    recs_filtered_list=recs_filtered(interactions_["title"][i],['deep','machine'])
    if recs_filtered_list!=0:
        if interactions_["article_id"][i] not in articles:
            articles.append(interactions_["article_id"][i])
            print(recs_filtered_list)

['use deep learning for image classification']
['classify tumors with machine learning']
['use xgboost, scikit-learn & ibm watson machine learning apis']
['apache spark lab, part 3: machine learning']
['putting a human face on machine learning']
['overfitting in machine learning: what it is and how to prevent it']
['use apache systemml and spark for machine learning']
['graph-based machine learning']
['python machine learning: scikit-learn tutorial']
['deep learning with tensorflow course by big data university']
['learn tensorflow and deep learning together and now!']
['rapidly build machine learning flows with dsx']
['using machine learning to predict baseball injuries']
['deep learning trends and an example']
['machine learning for everyone']
['deep learning from scratch i: computational graphs']
['movie recommender system with spark machine learning']
['deep learning with data science experience']
['ibm watson machine learning: get started']
['challenges in deep learning']
['awesom

## III. User-User Based Collaborative Filtering

In [23]:
describe_interactions=interactions_.groupby('article_id').count().sort_values('user_id', ascending=False).reset_index()
describe_interactions

Unnamed: 0,article_id,title,email,user_id
0,1429.0,937,937,937
1,1330.0,927,927,927
2,1431.0,671,671,671
3,1427.0,643,643,643
4,1364.0,627,627,627
...,...,...,...,...
709,1113.0,1,1,1
710,1119.0,1,1,1
711,984.0,1,1,1
712,1127.0,1,1,1


In [26]:
interactions_['rating'] = np.nan
article_ids = set(interactions_.article_id)
for id_article in article_ids:
    for i in range(0,interactions_.shape[0]):
        if interactions_.article_id[i] == id_article:
            interactions_['rating'][i] = describe_interactions[describe_interactions.article_id==id_article]['user_id']
interactions_.to_csv('Interactions.csv', encoding = 'utf-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [26]:
interactions_ = pd.read_csv('Interactions.csv')
del interactions_['Unnamed: 0']

In [27]:
interactions_.head()

Unnamed: 0,article_id,title,email,user_id,rating
0,1430.0,"using pixiedust for fast, flexible, and easier...",ef5f11f77ba020cd36e1105a00ab868bbdbf7fe7,1,336.0
1,1314.0,healthcare python streaming application demo,083cbdfa93c8444beaa4c5f5e0f5f9198e4f9e0b,2,614.0
2,1429.0,use deep learning for image classification,b96a4f2e92d8572034b1e9b28f9ac673765cd074,3,937.0
3,1338.0,ml optimization using cognitive assistant,06485706b34a5c9bf2a0ecdac41daf7e7654ceb7,4,382.0
4,1276.0,deploy your python model as a restful api,f01220c46fc92c6e6b161b1849de11faacd7ccb2,5,347.0


In [28]:
# Create user-by-item matrix
user_by_article = interactions_.groupby(['user_id', 'article_id'])['rating'].max().unstack()

In [29]:
print(user_by_article.shape[1])
print(user_by_article.shape[0])

714
5149


In [30]:
np.array(user_by_article.loc[1][user_by_article.loc[1].isnull() == False].index.values)

array([  43.,  109.,  151.,  268.,  310.,  329.,  346.,  390.,  494.,
        525.,  585.,  626.,  668.,  732.,  768.,  910.,  968.,  981.,
       1052., 1170., 1183., 1185., 1232., 1293., 1305., 1363., 1368.,
       1391., 1400., 1406., 1427., 1429., 1430., 1431., 1436., 1439.])

In [31]:
len(interactions_[interactions_.user_id==1])

47

In [32]:
len(interactions_[interactions_.user_id==1 & interactions_.article_id.isin([  43.,  109.,  151.,  268.,  310.,  329.,  346.,  390.,  494.,
        525.,  585.,  626.,  668.,  732.,  768.,  910.,  968.,  981.,
       1052., 1170., 1183., 1185., 1232., 1293., 1305., 1363., 1368.,
       1391., 1400., 1406., 1427., 1429., 1430., 1431., 1436., 1439.])])

47

In [33]:
# Create a dictionary with users and corresponding movies seen

def articles_watched(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    articles - an array of articles the user has watched
    '''
    articles = np.array(user_by_article.loc[user_id][user_by_article.loc[user_id].isnull() == False].index.values)
    
    i=0
    temp=[]
    while i < len(articles):
        temp.append(articles[i])
        i += 1
    articles=temp

    return articles


def create_user_article_dict():
    '''
    INPUT: None
    OUTPUT: articles_seen - a dictionary where each key is a user_id and the value is an array of article_ids
    
    Creates the articles_seen dictionary
    '''
    n_users = user_by_article.shape[0]
    articles_seen = {}

    for user1 in range(1, n_users+1):
        
        # assign list of movies to each user key
        articles_seen[user1] = articles_watched(user1)
    
    return articles_seen
    
articles_seen = create_user_article_dict()

In [34]:
# Remove individuals who have watched 2 or fewer articles - don't have enough data to make recs

def create_articles_to_analyze(articles_seen, lower_bound=2):
    '''
    INPUT:  
    articles_seen - a dictionary where each key is a user_id and the value is an array of articles_ids
    lower_bound - (an int) a user must have more movies seen than the lower bound to be added to the articles_to_analyze dictionary

    OUTPUT: 
    articles_to_analyze - a dictionary where each key is a user_id and the value is an array of article_ids
    
    The articles_seen and articles_to_analyze dictionaries should be the same except that the output dictionary has removed 
    
    '''
    articles_to_analyze = {}
    
    
    user=np.array( tuple(articles_seen.keys()) )
    articles = np.array( tuple(articles_seen.values()) )
    
    i=0
    temp=[]
    while i < len(articles):
        j = 0
        temp=[]
        while j < len(articles[i]):
            temp.append(articles[i][j])
            j += 1
        articles_to_analyze[user[i]]=temp
        i += 1
    
    #for user, articles in articles_seen.items():
        #if len(articles) > lower_bound:
            #articles_to_analyze[user] = articles
    return articles_to_analyze

articles_to_analyze = create_articles_to_analyze(articles_seen)

In [35]:
# Run the tests below to check that your articles_to_analyze matches the solution
print(len(articles_to_analyze)) 
print(len(articles_to_analyze[1]))
print(len(articles_to_analyze[8]))

5149
36
57


In [36]:
def compute_correlation(user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    OUTPUT
    the correlation between the matching ratings between the two users
    '''
    # Pull movies for each user
    articles1 = articles_to_analyze[user1]
    articles2 = articles_to_analyze[user2]
    
    
    # Find Similar Movies
    sim_movs = np.intersect1d(articles1, articles2, assume_unique=True)
    
    # Calculate correlation between the users
    df = user_by_article.loc[(user1, user2), sim_movs]
    corr = df.transpose().corr().iloc[0,1]
    
    return corr #return the correlation

In [37]:
print(compute_correlation(2,2))
print(round(compute_correlation(2,66), 2))
print(np.isnan(compute_correlation(2,104)))

1.0
nan
True


correlation the spread in some ratings was zero

In [38]:
# Which articles did both user 2 and user 104 see?
set_2 = set(articles_to_analyze[2])
set_104 = set(articles_to_analyze[104])
set_2.intersection(set_104)

{1427.0}

In [39]:
# What were the ratings for each user on those articles?
print(user_by_article.loc[2, set_2.intersection(set_104)])
print(user_by_article.loc[104, set_2.intersection(set_104)])

article_id
1427.0    643.0
Name: 2, dtype: float64
article_id
1427.0    643.0
Name: 104, dtype: float64


In [40]:
def compute_euclidean_dist(user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    OUTPUT
    the euclidean distance between user1 and user2
    '''
    # Pull movies for each user
    articles1 = articles_to_analyze[user1]
    articles2 = articles_to_analyze[user2]
    
    
    # Find Similar Movies
    sim_movs = np.intersect1d(articles1, articles2, assume_unique=True)
    
    # Calculate euclidean distance between the users
    df = user_by_article.loc[(user1, user2), sim_movs]
    dist = np.linalg.norm(df.loc[user1] - df.loc[user2])
    
    return dist #return the euclidean distance

In [41]:
print(compute_euclidean_dist(2,2))
print(round(compute_euclidean_dist(2,66), 2))
print(np.isnan(compute_euclidean_dist(2,104)))

0.0
0.0
False


# problem 

I tried to create a table to calculate the distance, but the values appear either 0 or nan and there are some values that have the fail result like for user2 is:

[7,18,25,29,39,47,53,62,84,93,99,101,106,116,123,124,127,128,130,141,142,147,161,162,174,175,177,179,191,201.....]

and I overlooked this problem by doing try, except.

Please, solution You are expected to reply, thanking you for your cooperation

In [42]:
users = []
for user1 in list(np.unique(interactions_.user_id)):
    if user1 != 2:
        users.append(user1)
 
#list of user1, user2, eucl_dist 
user1 = np.nan
user2 = np.unique(users)
eucl_dist = np.nan
  
# dictionary of lists  
dict = {'user1': user1, 'user2': user2, 'eucl_dist': eucl_dist}  
    
df_dists = pd.DataFrame(dict) 

df_dists['user1'] = 2

for i in range(0,df_dists.shape[0]):
    try:
        df_dists['eucl_dist'][i] = compute_euclidean_dist(2,int(df_dists['user2'][i]))
    except:
        pass
     
df_dists  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,user1,user2,eucl_dist
0,2,1,0.0
1,2,3,0.0
2,2,4,0.0
3,2,5,0.0
4,2,6,0.0
...,...,...,...
5143,2,5145,0.0
5144,2,5146,0.0
5145,2,5147,0.0
5146,2,5148,0.0


In [43]:
df_dists.eucl_dist.value_counts()

0.0    5148
Name: eucl_dist, dtype: int64

In [44]:
def calculate_distance(user):
    users = []
    for user1 in list(np.unique(interactions_.user_id)):
        if user1 != user:
            users.append(user1)
    #list of user1, user2, eucl_dist 
    user1 = np.nan
    user2 = np.unique(users)
    eucl_dist = np.nan
    
    # dictionary of lists  
    dict = {'user1': user1, 'user2': user2, 'eucl_dist': eucl_dist}  
    df_dists = pd.DataFrame(dict) 
    
    df_dists['user1'] = user
    
    for i in range(0,df_dists.shape[0]):
        try:
            df_dists['eucl_dist'][i] = compute_euclidean_dist(user,df_dists['user2'][i])
        except:
            pass
    return df_dists
    

In [45]:
def find_closest_neighbors(user):
    '''
    INPUT:
        user - (int) the user_id of the individual you want to find the closest users
    OUTPUT:
        closest_neighbors - an array of the id's of the users sorted from closest to farthest away
    '''
    # I treated ties as arbitrary and just kept whichever was easiest to keep using the head method
    # You might choose to do something less hand wavy
    df_dists = calculate_distance(user)
    closest_users = df_dists[df_dists['user1']==user].sort_values(by='eucl_dist').iloc[1:]['user2']
    closest_neighbors = np.array(closest_users)
    
    return closest_neighbors
    
    
    
def articles_liked(user_id, min_rating=7):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    min_rating - the minimum rating considered while still a movie is still a "like" and not a "dislike"
    OUTPUT:
    movies_liked - an array of movies the user has watched and liked
    '''
    articles_liked = np.array(interactions_.query('user_id == @user_id and rating > (@min_rating -1)')['article_id'])
    
        
    i=0
    articles_liked_temp=[]
    while i < len(articles_liked):
        articles_liked_temp.append(articles_liked[i])
        i += 1
    articles_liked=articles_liked_temp
    
    return articles_liked


def article_names(article_ids):
    '''
    INPUT
    article_ids - a list of article_ids
    OUTPUT
    articles - a list of article names associated with the article_ids
    
    '''
    article_lst = list(articles_[articles_['article_id'].isin(article_ids)]['doc_full_name'])
   
    return article_lst
    
    
def make_recommendations(user, num_recs=10):
    '''
    INPUT:
        user - (int) a user_id of the individual you want to make recommendations for
        num_recs - (int) number of movies to return
    OUTPUT:
        recommendations - a list of movies - if there are "num_recs" recommendations return this many
                          otherwise return the total number of recommendations available for the "user"
                          which may just be an empty list
    '''
    # I wanted to make recommendations by pulling different movies than the user has already seen
    # Go in order from closest to farthest to find movies you would recommend
    # I also only considered movies where the closest user rated the movie as a 9 or 10
    
    # movies_seen by user (we don't want to recommend these)
    articles_seen = articles_watched(user)
    closest_neighbors = find_closest_neighbors(user)
    
    # Keep the recommended movies here
    recs = []#np.array([])
    
    # Go through the neighbors and identify movies they like the user hasn't seen
    for neighbor in closest_neighbors:
        neighbs_likes = articles_liked(neighbor)
        #Obtain recommendations for each neighbor
        #new_recs = np.setdiff1d(neighbs_likes, articles_seen, assume_unique=True)
        new_recs = list(set(neighbs_likes).symmetric_difference(articles_seen))
        # Update recs with new recs
        for i in new_recs:
            if i not in recs:
                recs.append(i) 
        #recs = np.unique(np.concatenate([new_recs, recs], axis=0))
        
        # If we have enough recommendations exit the loop
        if len(recs) > num_recs-1:
            break
    # Pull movie titles using movie ids
    recommendations = article_names(recs)
    
    
    return recommendations

def all_recommendations(num_recs=10):
    '''
    INPUT 
        num_recs (int) the (max) number of recommendations for each user
    OUTPUT
        all_recs - a dictionary where each key is a user_id and the value is an array of recommended movie titles
    '''
    
    # All the users we need to make recommendations for
    users = np.unique(interactions_.user_id)
    n_users = len(users)
    
    #Store all recommendations in this dictionary
    all_recs = {}
    
    # Make the recommendations for each user
    for user in users:
        all_recs[user] = make_recommendations(user, num_recs)
    
    return all_recs

In [46]:
make_recommendations(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


['How to map USA rivers using ggplot2',
 'Simple Graphing with IPython and\xa0Pandas',
 'Web Picks (week of 23 January 2017)',
 'Using Deep Learning to Reconstruct High-Resolution Audio']

In [None]:
all_recs = all_recommendations(10)
all_recs[2]

# Users without recs
users_without_recs = []
for user, movie_recs in all_recs.items():
    if len(movie_recs) == 0:
        users_without_recs.append(user)
    
len(users_without_recs)

# NaN euclidean distance values
df_dists['eucl_dist'].isnull().sum()

# Users with fewer than 10 recs
users_with_less_than_10recs = []
for user, movie_recs in all_recs.items():
    if len(movie_recs) < 10:
        users_with_less_than_10recs.append(user)
    
len(users_with_less_than_10recs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## IV. Content Based Recommendations (EXTRA - NOT REQUIRED)

In [47]:
import nltk
from nltk.corpus import stopwords

nltk.download(['punkt', 'wordnet'])
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\usamnet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\usamnet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\usamnet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
word_series = pd.Series(' '.join(articles_['doc_full_name']).lower().split())
top_words = word_series[~word_series.isin(stopwords.words("english"))].value_counts()[:100]
top_words_names = np.unique(np.array(top_words.index))
filtered_words_names = []
for i in top_words_names:
    if i not in ['&', '-', '2016)', '2017)','–','r','1', '10', '2', '3','—']:
        filtered_words_names.append(i)
print(filtered_words_names)

['algorithms', 'analysis', 'analytics', 'analyze', 'apache', 'api', 'app', 'application', 'applications', 'apps', 'bluemix', 'build', 'building', 'business', 'catalog', 'cloud', 'cloudant', 'compose', 'conference:', 'couchdb', 'create', 'customer:', 'dashdb', 'data', 'database', 'databases', 'datalayer', 'day', 'deep', 'dsx', 'elasticsearch', 'experience', 'first', 'geospatial', 'get', 'getting', 'graph', 'guide', 'ibm', 'index', 'introducing', 'json', 'jupyter', 'lab', 'learning', 'library', 'load', 'machine', 'making', 'maven:', 'medium', 'metrics', 'models', 'mongodb', 'moving', 'mysql', 'neural', 'new', 'node.js', 'notebook', 'notebooks', 'offline', 'open', 'part', 'postgresql', 'predict', 'python', 'queries', 'query', 'redis', 'rethinkdb', 'rstudio', 'science', 'search', 'service', 'seven', 'simple', 'spark', 'sql', 'started', 'storage', 'tutorial', 'use', 'using', 'visualization', 'warehouse', 'watson', 'web', 'week']


In [49]:
# and another
# Function to split and return values for columns
def split_words(val):
    try:
        if val.find(word) >-1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0

# Apply function for each genre
for word in filtered_words_names:        
    articles_[word] = articles_['doc_description'].apply(split_words)

In [53]:
articles_.iloc[:,5:]

Unnamed: 0,algorithms,analysis,analytics,analyze,apache,api,app,application,applications,apps,...,started,storage,tutorial,use,using,visualization,warehouse,watson,web,week
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1053,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1054,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [56]:
# Subset so article_content is only using the dummy variables for each title content dummy columns
article_content = np.array(articles_.iloc[:,5:])

# Take the dot product to obtain a movie x article matrix of similarities
dot_prod_articles = article_content.dot(np.transpose(article_content))

In [82]:
# create checks for the dot product matrix
print(dot_prod_articles.shape[0])
print(dot_prod_articles.shape[1])
print(dot_prod_articles[0, 0] == np.max(dot_prod_articles[0]))

1051
1051
True


In [88]:
def find_similar_articles(article_id):
    '''
    INPUT
    movie_id - a article_id 
    OUTPUT
    similar_movies - an array of the most similar articles by doc_full_name
    '''
    # find the row of each article id
    article_idx = np.where(articles_['article_id'] == article_id)[0][0]
    
    # find the most similar article indices - to start I said they need to be the same for all content
    similar_idxs = np.where(dot_prod_articles[article_idx] == np.max(dot_prod_articles[article_idx]))[0]
    
    # pull the article doc_full_name based on the indices
    similar_articles = np.array(articles_.iloc[similar_idxs, ]['doc_full_name'])
    
    return similar_articles


cur_article=list(articles_['doc_full_name'][articles_['article_id']==3])
print(cur_article)

rec_articles = find_similar_articles(3)
new_recs = list(set(rec_articles).symmetric_difference(cur_article))
new_recs

['DataLayer Conference: Boost the performance of your distributed database']


['“Schemas” in CouchDB',
 'Build an iOS 8 App with Bluemix and the MobileFirst Platform for            iOS',
 'Building an Ordering Application with Watson AI and PostgreSQL: Part II',
 'Cleaning the swamp: Turn your data lake into a source of crystal-clear insight']