In [1]:
import pandas
from ast import literal_eval
#read data
df_credits = pandas.read_csv('tmdb_5000_credits.csv')
df_movies = pandas.read_csv('tmdb_5000_movies.csv')

In [2]:
#preprocess text
df_credits.pop('title')
df_credits.columns = ['id', 'cast', 'crew']
df = df_credits.merge(df_movies, on='id')
df['overview'] = df['overview'].fillna('')

In [35]:
movie = df_movies.loc[df_movies['id'] == 2255]
literal_eval(movie['genres'].values.tolist()[0])

ValueError: malformed node or string: ['[{"id": 35, "name": "Comedy"}, {"id": 18, "name": "Drama"}, {"id": 10749, "name": "Romance"}]']

In [3]:
#extract first N values of each feature
features = ['keywords', 'genres', 'cast']
df_copy = pandas.DataFrame()

def head(arr, N = 3):
    if len(arr) > N:
        arr = arr[:N]
    return arr

def clean_value(value):
    return str.lower(value.replace(" ", ""))

def to_string_of_values(arr):
    values = [clean_value(obj['name']) for obj in head(literal_eval(arr))]
    return ' '.join(values)

for feature in features:
    df_copy[feature] = df[feature].copy().apply(to_string_of_values)

df_copy[features]

Unnamed: 0,keywords,genres,cast
0,cultureclash future spacewar,action adventure fantasy,samworthington zoesaldana sigourneyweaver
1,ocean drugabuse exoticisland,adventure fantasy action,johnnydepp orlandobloom keiraknightley
2,spy basedonnovel secretagent,action adventure crime,danielcraig christophwaltz léaseydoux
3,dccomics crimefighter terrorist,action crime drama,christianbale michaelcaine garyoldman
4,basedonnovel mars medallion,action adventure sciencefiction,taylorkitsch lynncollins samanthamorton
...,...,...,...
4798,unitedstates–mexicobarrier legs arms,action crime thriller,carlosgallardo jaimedehoyos petermarquardt
4799,,comedy romance,edwardburns kerrybishé marshadietlein
4800,date loveatfirstsight narration,comedy drama romance,ericmabius kristinbooth crystallowe
4801,,,danielhenney elizacoupe billpaxton


In [5]:
#combine features into item profile
def create_metadata_soup(row):
    string = ' '
    for feature in features:
        string += row[feature]
    return string
df_copy['profile'] = df_copy.apply(create_metadata_soup, axis=1)
df_copy['profile']

0        cultureclash future spacewaraction adventure ...
1        ocean drugabuse exoticislandadventure fantasy...
2        spy basedonnovel secretagentaction adventure ...
3        dccomics crimefighter terroristaction crime d...
4        basedonnovel mars medallionaction adventure s...
                              ...                        
4798     unitedstates–mexicobarrier legs armsaction cr...
4799     comedy romanceedwardburns kerrybishé marshadi...
4800     date loveatfirstsight narrationcomedy drama r...
4801                   danielhenney elizacoupe billpaxton
4802     obsession camcorder crushdocumentarydrewbarry...
Name: profile, Length: 4803, dtype: object

In [6]:
#create tfidf matrix on item profiles
from sklearn.feature_extraction.text import TfidfVectorizer
from services.preprocess_text import preprocess_text
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
matrix = vectoriser.fit_transform(df_copy['profile'])
matrix

[nltk_data] Downloading package wordnet to /home/victoria/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/victoria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<4803x14054 sparse matrix of type '<class 'numpy.float64'>'
	with 29503 stored elements in Compressed Sparse Row format>

In [7]:
marr = matrix.toarray().data
marr

<memory at 0x7f2419928ad0>

In [8]:
mlist = marr.tolist()

In [9]:
mseries = pandas.Series(mlist)
mseries

0       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                              ...                        
4798    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4799    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4800    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4801    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4802    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Length: 4803, dtype: object

In [26]:
#extract profiles of individual items
df['profile'] = mseries

KeyError: "None of [Index(['title'], dtype='object')] are in the [index]"

In [11]:
#read ratings
df_ratings = pandas.read_csv('ratings_small.csv')
#select only ratings of existing movies
ids = df['id'].tolist()
df_ratings = df_ratings[df_ratings['movieId'].isin(ids)]

In [12]:
#get user ratings by id
user = df_ratings.loc[df_ratings['userId'] == 6]
user

Unnamed: 0,userId,movieId,rating,timestamp
451,6,111,4.0,1109258212
453,6,173,2.0,1109258228
458,6,1250,4.5,1108134284
459,6,1259,4.5,1109258196
463,6,1639,2.0,1109258179
464,6,1687,2.0,1109258281
467,6,1909,3.0,1108134344
468,6,2001,3.0,1108134289
472,6,2502,3.5,1108134291
494,6,8874,4.5,1108134521


In [13]:
baseline = user['rating'].mean()
baseline

3.3

In [14]:
#normalize ratings by substructing mean value
user['rating'] -= baseline
user[['movieId','rating']]

Unnamed: 0,movieId,rating
451,111,0.7
453,173,-1.3
458,1250,1.2
459,1259,1.2
463,1639,-1.3
464,1687,-1.3
467,1909,-0.3
468,2001,-0.3
472,2502,0.2
494,8874,1.2


In [15]:
#get list with ids of items with rating more than avarage
top_ratings = user.loc[user['rating'] > 0]
top_rated_ids = top_ratings['movieId'].tolist()
top_rated_ids

[111, 1250, 1259, 2502, 8874]

In [16]:
#get profiles of top rated items
top_rated_data = df[df['id'].isin(top_rated_ids)]
top_rated_data[['id', 'title', 'profile']]

Unnamed: 0,id,title,profile
217,1250,Ghost Rider,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
386,2502,The Bourne Supremacy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1031,8874,My Best Friend's Wedding,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1748,1259,Notes on a Scandal,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1850,111,Scarface,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
#add user ratings into items profile
top_rated_data = pandas.merge(top_rated_data, top_ratings, left_on='id', right_on='movieId')
top_rated_data[['id', 'title', 'profile', 'rating']]

Unnamed: 0,id,title,profile,rating
0,1250,Ghost Rider,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.2
1,2502,The Bourne Supremacy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.2
2,8874,My Best Friend's Wedding,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.2
3,1259,Notes on a Scandal,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.2
4,111,Scarface,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.7


In [18]:
#transform profile to numpy array
import numpy
top_rated_data['profile'] = top_rated_data['profile'].apply(lambda v: numpy.array(v))

In [19]:
#calc weighted item profiles = item profile * rating
top_rated_data['weighted_profile'] = top_rated_data['profile'] * top_rated_data['rating']
#calc user profile = sum of weighted item profiles / sum of ratings
user_profile = top_rated_data['weighted_profile'].sum() / top_rated_data['rating'].sum() 
user_profile

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
#calculate simularity of user profile with all items profiles
def calc_sim(movie):
    sim = cosine_similarity([user_profile], [movie])
    return sim[0][0]
df['similarity'] = df['profile'].apply(calc_sim)
df['similarity']

0       0.000000
1       0.000000
2       0.000000
3       0.016072
4       0.000000
          ...   
4798    0.012695
4799    0.000000
4800    0.002284
4801    0.000000
4802    0.000000
Name: similarity, Length: 4803, dtype: float64

In [21]:
#get ids of items that was not rated by user
rated_ids = user['movieId'].tolist()
not_rated = df[~df['id'].isin(rated_ids)]
#sort not rated by user items by similarity (the most similar at the top)
not_rated_sorted = not_rated.sort_values(by=['similarity'], ascending=False)
#get 10 most similar item to user profile
most_similar_items = not_rated_sorted.head(10)
most_similar_items[['id', 'title', 'similarity']]

Unnamed: 0,id,title,similarity
1442,6440,The Shipping News,0.3157
3378,14112,Auto Focus,0.165766
1291,24420,The Time Traveler's Wife,0.137324
1523,182,The Good German,0.135228
674,16577,Astro Boy,0.134731
872,1717,All the King's Men,0.133952
846,1792,Stuck on You,0.133217
500,584,2 Fast 2 Furious,0.130016
958,2116,Out of Time,0.12982
1098,6639,Love in the Time of Cholera,0.126947
