In [1]:
# coding: utf-8

# # Assignment 3:  Recommendation systems
#
# Here we'll implement a content-based recommendation algorithm.
# It will use the list of genres for a movie as the content.
# The data come from the MovieLens project: http://grouplens.org/datasets/movielens/
# Note that I have not provided many doctests for this one. I strongly
# recommend that you write your own for each function to ensure your
# implementation is correct.


In [2]:
# Please only use these imports.
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile


In [3]:
def download_data():
    """ DONE. Download and unzip data.
    """
    url = 'https://www.dropbox.com/s/p9wmkvbqt1xr6lc/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()

In [4]:
def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.
    """
    return re.findall('[\w\-]+', my_string.lower())

In [36]:
def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genre' field of each movie. Use the tokenize_string method above.

    Note: you may modify the movies parameter directly; no need to make
    a new copy.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.

    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    ###TODO
    token_list = []
    l = []
    for element in movies['genres']:
        if element != "(no genres listed)":
            token_list.append(tokenize_string(element))
        else:
            token_list.append(l)

    movies['tokens'] = token_list
    return movies

In [37]:
movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])

In [38]:
movies = tokenize(movies)

In [39]:
movies['tokens'].tolist() == [['horror', 'romance'], ['sci-fi']]

True

In [40]:
featurize(movies)

(   movieId          genres             tokens  \
 0      123  Horror|Romance  [horror, romance]   
 1      456          Sci-Fi           [sci-fi]   
 
                                             features  
 0    (0, 0)\t0.3010299956639812\n  (0, 1)\t0.3010...  
 1                         (0, 2)\t0.3010299956639812  ,
 {'horror': 0, 'romance': 1, 'sci-fi': 2})

In [17]:
movies.head()

Unnamed: 0,movieId,title,genres,tokens
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),Comedy,[comedy]


In [14]:
download_data()
path = 'ml-latest-small'
ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
movies = pd.read_csv(path + os.path.sep + 'movies.csv')

In [18]:
len(movies)

9125

In [144]:
def featurize(movies): 
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """ 
    ###TODO
    #number of movies
    N = len(movies)
    #list to store values of csr matrix
    csr_list = []
    token_list_ol = movies['tokens'].tolist()
    #number of unique genres sorted
    uniq_token_list = set()
    #genre and their frequency in entire column
    uniq_token_freq = defaultdict(lambda : 0)

    for element in token_list_ol:
        for value in element:
            if value not in uniq_token_list:
                #add genre to set if not present already 
                uniq_token_list.add(value)
            if value not in uniq_token_freq:
                uniq_token_freq[value] = 1
            else:
                uniq_token_freq[value] += 1
    
    #calculation for df
    df_list=[]
    for x in movies['tokens']:
        for i in set(x):
            df_list.append(i)
    doc_freq_c = Counter(df_list)
    doc_freq_c = dict(doc_freq_c) 
    print (doc_freq_c)

    #sorting the list and getting count of total number of features
    uniq_token_list = sorted(uniq_token_list)
    N = len(movies)

    #assigning index to each genre and forming a vocab
    vocab = {}
    for value in uniq_token_list:
        vocab[value] = len(vocab)
    num_features = len(vocab)

    #calculating tf, df and forming csr matrix
    for sublist in token_list_ol:
        #initilizations for csr calculations
        tfidf_list = []
        row_indices = []
        row_ptr = []
        #calculate tf
        tf_dict = defaultdict(lambda : 0)
        for value in sublist:
            if value not in tf_dict:
                tf_dict[value] = 1
            else:
                tf_dict[value] += 1
        
        for value in sublist:
            #calculate tf-idf values
            tf = tf_dict[value]
            df = doc_freq_c[value]
            max_k = max(tf_dict.values())
            
            print (tf)
            print (df)
            print (max_k)
            
            tfidf = (tf/max_k)* math.log10(N/df)
            tfidf_list.append(tfidf)
        #calculate csr
            index = vocab[value]
            row_indices.append(index)

        row_ptr = [0,len(row_indices)]
        csr_list.append(csr_matrix((tfidf_list,row_indices,row_ptr),shape=(1,num_features)))
        break
    movies['features'] = csr_list
    return(movies,vocab)
        

In [130]:
movies = pd.DataFrame([[123, 'Horror|Romance|Horror'], [456, 'Sci-Fi'],[789, 'Romance'], [100, '(no genres listed)'], [101, 'Comedy|Action'],[103, 'Horror|Romance'], [406, 'Sci-Fi']], columns=['movieId', 'genres'])
movies = tokenize(movies)

In [131]:
movies

Unnamed: 0,movieId,genres,tokens
0,123,Horror|Romance|Horror,"[horror, romance, horror]"
1,456,Sci-Fi,[sci-fi]
2,789,Romance,[romance]
3,100,(no genres listed),[]
4,101,Comedy|Action,"[comedy, action]"
5,103,Horror|Romance,"[horror, romance]"
6,406,Sci-Fi,[sci-fi]


In [137]:
movies1,vocab1 = featurize(movies)
movies1

{'horror': 2, 'romance': 3, 'sci-fi': 2, 'comedy': 1, 'action': 1}


Unnamed: 0,movieId,genres,tokens,features
0,123,Horror|Romance|Horror,"[horror, romance, horror]","(0, 2)\t0.5440680443502757\n (0, 3)\t0.1839..."
1,456,Sci-Fi,[sci-fi],"(0, 4)\t0.5440680443502757"
2,789,Romance,[romance],"(0, 3)\t0.36797678529459443"
3,100,(no genres listed),[],
4,101,Comedy|Action,"[comedy, action]","(0, 1)\t0.8450980400142568\n (0, 0)\t0.8450..."
5,103,Horror|Romance,"[horror, romance]","(0, 2)\t0.5440680443502757\n (0, 3)\t0.3679..."
6,406,Sci-Fi,[sci-fi],"(0, 4)\t0.5440680443502757"


In [138]:
vocab1

{'action': 0, 'comedy': 1, 'horror': 2, 'romance': 3, 'sci-fi': 4}

In [139]:
movies1

Unnamed: 0,movieId,genres,tokens,features
0,123,Horror|Romance|Horror,"[horror, romance, horror]","(0, 2)\t0.5440680443502757\n (0, 3)\t0.1839..."
1,456,Sci-Fi,[sci-fi],"(0, 4)\t0.5440680443502757"
2,789,Romance,[romance],"(0, 3)\t0.36797678529459443"
3,100,(no genres listed),[],
4,101,Comedy|Action,"[comedy, action]","(0, 1)\t0.8450980400142568\n (0, 0)\t0.8450..."
5,103,Horror|Romance,"[horror, romance]","(0, 2)\t0.5440680443502757\n (0, 3)\t0.3679..."
6,406,Sci-Fi,[sci-fi],"(0, 4)\t0.5440680443502757"


In [143]:
movies1['features'][0].toarray()

array([[0.        , 0.        , 1.08813609, 0.18398839, 0.        ]])

In [29]:
def train_test_split(ratings):
    """DONE.
    Returns a random split of the ratings matrix into a training and testing set.
    """
    test = set(range(len(ratings))[::1000])
    train = sorted(set(range(len(ratings))) - test)
    test = sorted(test)
    return ratings.iloc[train], ratings.iloc[test]

In [None]:
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      A float. The cosine similarity, defined as: dot(a, b) / ||a|| * ||b||
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
    """
    ###TODO
    pass