In [1]:
#read data
import pandas as pd
df_credits = pd.read_csv('tmdb_5000_credits.csv')
df_movies = pd.read_csv('tmdb_5000_movies.csv')
df_ratings = pd.read_csv('ratings_small.csv')

#preprocess text
df_credits.pop('title')
df_credits.columns = ['id', 'cast', 'crew']
df = df_credits.merge(df_movies, on='id')
#select only ratings of existing movies
ids = df['id'].tolist()
df_ratings = df_ratings[df_ratings['movieId'].isin(ids)]

#extract first N values of each feature
from ast import literal_eval
features = ['keywords', 'genres', 'cast']
df_copy = pd.DataFrame()

def head(arr, N = 3):
    if len(arr) > N:
        arr = arr[:N]
    return arr

def clean_value(value):
    return str.lower(value.replace(" ", ""))

def to_string_of_values(arr):
    values = [clean_value(obj['name']) for obj in head(literal_eval(arr))]
    return ' '.join(values)

for feature in features:
    df_copy[feature] = df[feature].copy().apply(to_string_of_values)

df_copy[features]

Unnamed: 0,keywords,genres,cast
0,cultureclash future spacewar,action adventure fantasy,samworthington zoesaldana sigourneyweaver
1,ocean drugabuse exoticisland,adventure fantasy action,johnnydepp orlandobloom keiraknightley
2,spy basedonnovel secretagent,action adventure crime,danielcraig christophwaltz léaseydoux
3,dccomics crimefighter terrorist,action crime drama,christianbale michaelcaine garyoldman
4,basedonnovel mars medallion,action adventure sciencefiction,taylorkitsch lynncollins samanthamorton
...,...,...,...
4798,unitedstates–mexicobarrier legs arms,action crime thriller,carlosgallardo jaimedehoyos petermarquardt
4799,,comedy romance,edwardburns kerrybishé marshadietlein
4800,date loveatfirstsight narration,comedy drama romance,ericmabius kristinbooth crystallowe
4801,,,danielhenney elizacoupe billpaxton


In [2]:
#combine features into item profile
def create_metadata_soup(row):
    string = ' '
    for feature in features:
        string += row[feature]
    return string
df['profile'] = df_copy.apply(create_metadata_soup, axis=1)
df['profile']

0        cultureclash future spacewaraction adventure ...
1        ocean drugabuse exoticislandadventure fantasy...
2        spy basedonnovel secretagentaction adventure ...
3        dccomics crimefighter terroristaction crime d...
4        basedonnovel mars medallionaction adventure s...
                              ...                        
4798     unitedstates–mexicobarrier legs armsaction cr...
4799     comedy romanceedwardburns kerrybishé marshadi...
4800     date loveatfirstsight narrationcomedy drama r...
4801                   danielhenney elizacoupe billpaxton
4802     obsession camcorder crushdocumentarydrewbarry...
Name: profile, Length: 4803, dtype: object

In [3]:
user = df_ratings.loc[df_ratings['userId'] == 2]
#calc mean rating for user
baseline = user['rating'].mean()
#normalize ratings by substructing mean value
user['rating'] -= baseline
user[['movieId','rating']]
#get items rated by user
rated = pd.merge(df, user, left_on='id', right_on='movieId')
rated[['id', 'title', 'rating', 'profile']]

Unnamed: 0,id,title,rating,profile
0,296,Terminator 3: Rise of the Machines,0.611111,savingtheworld artificialintelligence manvsma...
1,272,Batman Begins,-0.388889,himalaya martialarts dccomicsaction crime dra...
2,616,The Last Samurai,-0.388889,japan warcrimes senseofguiltdrama action wart...
3,585,"Monsters, Inc.",1.611111,monster infant energysupplyanimation comedy f...
4,314,Catwoman,0.611111,whiterussian sex dccomicsaction crimehalleber...
5,161,Ocean's Eleven,-0.388889,prison pickpocket stripclubthriller crimegeor...
6,364,Batman Returns,-0.388889,holiday corruption doublelifeaction fantasymi...
7,587,Big Fish,-0.388889,circus fathersonrelationship witchadventure f...
8,550,Fight Club,-0.388889,supportgroup dualidentity nihilismdramaedward...
9,497,The Green Mile,-0.388889,southernusa blackpeople mentallydisabledfanta...


In [4]:
#partion data on test and train
from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(rated[['id', 'title', 'profile']], rated['rating'], 
#        test_size=0.2, random_state=123)
x_train = rated[['id', 'title', 'profile']].head(28)
y_train = rated['rating'].head(28)
x_test = rated[['id', 'title', 'profile']].tail(8)
y_test = rated['rating'].tail(8)

In [5]:
#create tfidf matrix on item profiles
from sklearn.feature_extraction.text import TfidfVectorizer
from services.preprocess_text import preprocess_text
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
x_train_profile = vectoriser.fit_transform(x_train['profile'])
x_test_profile = vectoriser.transform(x_test['profile'])

[nltk_data] Downloading package wordnet to /home/victoria/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/victoria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
test_profile_arr = x_test_profile.toarray().data
test_list = test_profile_arr.tolist()
test_series = pd.Series(test_list)
print(test_series[2])
x_test['profile'] = test_series
x_test[['title', 'profile']]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


Unnamed: 0,title,profile
0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
train_profile_arr = x_train_profile.toarray().data
train_list = train_profile_arr.tolist()
train_series = pd.Series(train_list)
x_train['profile'] = train_series
x_train[['title', 'profile']]

Unnamed: 0,title,profile
0,Terminator 3: Rise of the Machines,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Batman Begins,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,The Last Samurai,"[0.3413771003742664, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"Monsters, Inc.","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Catwoman,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,Ocean's Eleven,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,Batman Returns,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,Big Fish,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.38372335419762543,..."
8,Fight Club,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,The Green Mile,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
#get list with ids of items with rating more than avarage
top_ratings = user.loc[user['rating'] > 0]
top_rated_ids = top_ratings['movieId'].tolist()
top_rated_ids

[153, 253, 261, 296, 314, 350, 454, 468, 480, 500, 508, 509, 585, 590, 592]

In [10]:
#get profiles of top rated items
top_rated_data = x_train[x_train['id'].isin(top_rated_ids)]
top_rated_data[['id', 'title', 'profile']]

Unnamed: 0,id,title,profile
0,296,Terminator 3: Rise of the Machines,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,585,"Monsters, Inc.","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,314,Catwoman,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12,508,Love Actually,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13,509,Notting Hill,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
15,350,The Devil Wears Prada,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38372335..."
18,590,The Hours,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
22,454,Romeo + Juliet,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
26,253,Live and Let Die,"[0.35290869112496365, 0.0, 0.0, 0.0, 0.0, 0.0,..."
27,153,Lost in Translation,"[0.0, 0.0, 0.0, 0.0, 0.4472135954999579, 0.0, ..."


In [11]:
#add user ratings into items profile
top_rated_data = pd.merge(top_rated_data, top_ratings, left_on='id', right_on='movieId')
top_rated_data[['id', 'title', 'profile', 'rating']]

Unnamed: 0,id,title,profile,rating
0,296,Terminator 3: Rise of the Machines,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.611111
1,585,"Monsters, Inc.","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.611111
2,314,Catwoman,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.611111
3,508,Love Actually,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.611111
4,509,Notting Hill,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.611111
5,350,The Devil Wears Prada,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38372335...",0.611111
6,590,The Hours,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.611111
7,454,Romeo + Juliet,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.611111
8,253,Live and Let Die,"[0.35290869112496365, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.611111
9,153,Lost in Translation,"[0.0, 0.0, 0.0, 0.0, 0.4472135954999579, 0.0, ...",0.611111


In [12]:
#transform profile to numpy array
import numpy
top_rated_data['profile'] = top_rated_data['profile'].apply(lambda v: numpy.array(v))

In [13]:
#calc weighted item profiles = item profile * rating
top_rated_data['weighted_profile'] = top_rated_data['profile'] * top_rated_data['rating']
#calc user profile = sum of weighted item profiles / sum of ratings
user_profile = top_rated_data['weighted_profile'].sum() / top_rated_data['rating'].sum() 
user_profile

array([0.02658901, 0.        , 0.        , 0.        , 0.03369418,
       0.        , 0.03369418, 0.02891066, 0.        , 0.        ,
       0.03015819, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0313073 , 0.        , 0.07821208, 0.03020545,
       0.        , 0.02988726, 0.03020545, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05468239, 0.        , 0.087964  ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.0313073 , 0.        , 0.        , 0.        ,
       0.        , 0.02785234, 0.        , 0.        , 0.02572019,
       0.        , 0.        , 0.        , 0.03020545, 0.02940863,
       0.        , 0.        , 0.02891066, 0.02940863, 0.07821208,
       0.        , 0.07821208, 0.        , 0.        , 0.        ,
       0.        , 0.02940863, 0.        , 0.        , 0.0313073 ,
       0.        , 0.        , 0.        , 0.09298225, 0.03020

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
#calculate simularity of user profile with all items profiles
def calc_sim(movie):
    sim = cosine_similarity([user_profile], [movie])
    return sim[0][0]
#x_train = x_train.loc[x_train['profile'] == x_train['profile']]
x_train['similarity'] = x_train['profile'].apply(calc_sim)
x_train[['id', 'title', 'similarity']], y_train

(     id                               title  similarity
 0   296  Terminator 3: Rise of the Machines    0.226810
 1   272                       Batman Begins    0.025293
 2   616                    The Last Samurai    0.024146
 3   585                      Monsters, Inc.    0.549221
 4   314                            Catwoman    0.200425
 5   161                      Ocean's Eleven    0.000000
 6   364                      Batman Returns    0.000000
 7   587                            Big Fish    0.000000
 8   550                          Fight Club    0.000000
 9   497                      The Green Mile    0.023357
 10  588                         Silent Hill    0.068426
 11  593                             Solaris    0.000000
 12  508                       Love Actually    0.289978
 13  509                        Notting Hill    0.346835
 14  165          Back to the Future Part II    0.068605
 15  350               The Devil Wears Prada    0.200425
 16  377           A Nightmare 

In [75]:
import numpy as np
from sklearn.linear_model import LinearRegression
model = LinearRegression()
X = x_train['similarity'].values.reshape((-1, 1))
y = np.array(y_train.values)
model.fit(X, y)
#y_pred = model.predict(x)
#x_train['similarity'] = x_train['similarity'].apply(lambda x: np.array([x]))
#y_train = y_train.apply(lambda x: np.array([x]))
#print(y_train)
#X = np.array(x_train['arr_similarity'])
#y = y_train.values

LinearRegression()