# Introduction
- We will be doing content based filtering
- A user vector can be created using the information in filter>Keywords and viewed articles
- This user vector can be compared with the vectors of the items
- Nearest neighbours algorithm can be used to determine the most similar items for suggestion

# Flow of logic

## Ingesting data
- There are three tables which overlap on different keys
- Checking the extent of overlap
## Feature Engineering
- Creating a column containing words that identify the user
  - Extracting the keywords from user['filters'] and saving them as strings
  - Merging user intereaction and items information
    - Pulling in the details of all the items viewed by user
    - Combining all the details into a single sring
    - Combining the keywords with the details of viewed items
## Recommender system
- Using a spacy tokenizer to convert the strings into a vector
- Using the nearest neighbour algorithm with the cosine metric to extract the 10 closest neighbours

# Ingesting data

In [None]:
import pandas as pd

In [None]:
items = pd.read_csv(
filepath_or_buffer='data/dataScienceAssignment/items.csv',\
# This is the file path of the csv
# Give the column number that is to be used as index
#index_col=,\
# Give the row numbers that have to be dropped when reading a data file
#skiprows=,\
# na_values defaults include NaN, nan, null, n/a,
na_values=['','nan'],\
# Get the column names from a header row
header=0,\
    encoding='utf-8'
)

In [None]:
userint = pd.read_csv(
filepath_or_buffer='data/dataScienceAssignment/userInteractions.csv',\
# This is the file path of the csv
# Give the column number that is to be used as index
#index_col=,\
# Give the row numbers that have to be dropped when reading a data file
#skiprows=,\
# na_values defaults include NaN, nan, null, n/a,
na_values=['','nan'],\
# Get the column names from a header row
header=0,\
    encoding='utf-8'
)

In [None]:
user = pd.read_csv(
filepath_or_buffer='data/dataScienceAssignment/users.csv',\
# This is the file path of the csv
# Give the column number that is to be used as index
#index_col=,\
# Give the row numbers that have to be dropped when reading a data file
#skiprows=,\
# na_values defaults include NaN, nan, null, n/a,
na_values=['','nan'],\
# Get the column names from a header row
header=0,\
    encoding='utf-8'
)

## Viewing data

In [None]:
user.info()

In [None]:
userint.info()

In [None]:
userint.head()

In [None]:
userint[userint.duplicated('entityId')]

In [None]:
userint[userint.duplicated('accountId')]

### Checking that there is an overlap between user and userint

In [None]:
user.loc[userint['accountId'].isin(user['accountId'].values)]

In [None]:
userint.loc[userint['accountId'].isin(user['accountId'].values)]

In [None]:
items.loc[items['id'].isin(userint['entityId'].values)]

# Feature engineering

## User keywords

In [None]:
user.head()

In [None]:
user['filter'][20]

In [None]:
user['filter'].fillna('{}',inplace=True)

In [None]:
from ast import literal_eval

In [None]:
def user_keywords(rowitem):
    pylist = literal_eval(rowitem)
    if 'KEYWORD' in pylist.keys():
        keywords = pylist['KEYWORD']
        string = ' '.join(keywords)
        return string
    else:
        return ''

In [None]:
user['filter'] = user['filter'].map(user_keywords)

## Items 

In [None]:
items.head()

In [None]:
items['details'].fillna('',inplace=True)

In [None]:
items['details'].eq('').sum()

In [None]:
itemfeatures = items[items['details'] != ''][['id','details']]

In [None]:
itemfeatures

## Create user features

In [None]:
userint.head()

Combining the user interactions and user tables to created a combined table.
It appears that some of the entities in the 'user interactions' don't exist on 'items'

In [None]:
userintitems = pd.merge(userint, items, how="left", left_on='entityId', right_on='id')[['accountId','details']]

In [None]:
userintitems['details']

In [None]:
userintitems['details'] = userintitems['details'].fillna('')

Converting all the items viewed by the user into a list

In [None]:
userintitems = userintitems.groupby('accountId')['details'].apply(list)

In [None]:
userintitems

Combining the viewed item list and the user list to create one data frame for modeling

In [None]:
userfeatures = pd.merge(userintitems, user, on="accountId")[['accountId','details','filter']]

In [None]:
userfeatures

Combining the details column into a single string

In [None]:
import numpy as np

In [None]:
def itemdetails(rowitem):
    #print(type(rowitem))
    detailsstring = ''
    for item in rowitem:
        #item = literal_eval(item)
        if not type(item)==float:
            detailsstring = detailsstring + ' ' + item
    return detailsstring
    

In [None]:
userfeatures['viewed'] = userfeatures['details'].apply(itemdetails)

In [None]:
userfeatures.head()

Combining the strings keywords and viewed items strings to create one main string

In [None]:
userfeatures['finalfeatures'] = userfeatures['filter'] + userfeatures['viewed']

In [None]:
userfeatures['finalfeatures'].eq('').sum()

In [None]:
userfeatures.head()

In [None]:
features = userfeatures[userfeatures['finalfeatures']!=''].drop(columns=['details','filter', 'viewed'])

# Recommender

## Converting to vectors

Creating the tokenizer

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def spacy_tokenizer(document):
    tokens = nlp(document)
    tokens = [token.lemma_.lower() for token in tokens if (
        token.is_stop == False and \
        token.is_punct == False and \
        token.lemma_.strip()!= '')]
    return tokens

Calling the count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(input = 'content', tokenizer = spacy_tokenizer)

In [None]:
items['details'].count()

Training the count vectorizer and transforming items

In [None]:
train_features = vector.fit_transform(items['details'].iloc[0:500].values)

In [None]:
train_features

Evaluating the nearest neighbours for a given user id

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
from scipy.sparse import vstack, csr_matrix


In [None]:
def similar(userid):
    user_vector = vector.transform(features.loc[features['accountId']==userid]['finalfeatures'].values)
    combined = vstack([user_vector,train_features])
    nbrs = NearestNeighbors(n_neighbors=10, metric='cosine').fit(combined)                                                                                                                        
    _, indices = nbrs.kneighbors(combined)    
    ids = []
    for i in indices[0][1:]:
        ids.append(items['id'].iloc[i-1])
    return ids

In [52]:
similar('6076439132127629518')

['7fd4658b-74a5-4978-87f1-33a24fa36c42',
 '9eb6e662-146f-4d5a-859d-40db8cddb4ec',
 'aa29a50c-47c8-4f54-81e2-dca5f7e7a5d6',
 '71daccba-3e8a-471e-a4d2-acfafa4a5b8f',
 'ff6a216f-4957-45b2-bc9f-3b9862d85a1b',
 '271d6ff7-3d09-4a84-993d-2d51a04a66f1',
 '0d98ce4d-fdb1-40a8-967d-dcc5c3db57ac',
 '4fe60e4b-15b0-4fb8-9da6-3cbf78bce866',
 'c0d6f47f-2fdd-4ec4-bb88-6abb9d74b3a4']