In [1]:
import pandas as pd
from ast import literal_eval

#read data
df_credits = pd.read_csv('tmdb_5000_credits.csv')
df_movies = pd.read_csv('tmdb_5000_movies.csv')

df_credits.pop('title')
df_credits.columns = ['id', 'cast', 'crew']
df = df_credits.merge(df_movies, on='id')

features = ['keywords', 'genres', 'cast']

import re
charRe = re.compile(r'[^a-zA-Z0-9.]')

def clean_value(value):
    without_spaces = value.replace(" ", "")
    string = charRe.search(without_spaces)
    if not bool(string):
        return str.lower(without_spaces)
    else:
        return ""

def to_string_of_values(arr):
    values = [clean_value(obj['name']) for obj in literal_eval(arr)]
    return ' '.join(values)

for feature in features:
    key = f'{feature}_vector'
    df[key] = df[feature].copy().apply(to_string_of_values)

df[['title', 'keywords_vector', 'genres_vector', 'cast_vector']]

Unnamed: 0,title,keywords_vector,genres_vector,cast_vector
0,Avatar,cultureclash future spacewar spacecolony socie...,action adventure fantasy sciencefiction,samworthington zoesaldana sigourneyweaver step...
1,Pirates of the Caribbean: At World's End,ocean drugabuse exoticisland eastindiatradingc...,adventure fantasy action,johnnydepp orlandobloom keiraknightley billn...
2,Spectre,spy basedonnovel secretagent sequel mi6 britis...,action adventure crime,danielcraig christophwaltz ralphfiennes monic...
3,The Dark Knight Rises,dccomics crimefighter terrorist secretidentity...,action crime drama thriller,christianbale michaelcaine garyoldman annehath...
4,John Carter,basedonnovel mars medallion spacetravel prince...,action adventure sciencefiction,taylorkitsch lynncollins samanthamorton willem...
...,...,...,...,...
4798,El Mariachi,legs arms paperknife guitarcase,action crime thriller,carlosgallardo jaimedehoyos petermarquardt rei...
4799,Newlyweds,,comedy romance,edwardburns marshadietlein caitlinfitzgerald ...
4800,"Signed, Sealed, Delivered",date loveatfirstsight narration investigation ...,comedy drama romance tvmovie,ericmabius kristinbooth crystallowe geoffgusta...
4801,Shanghai Calling,,,danielhenney elizacoupe billpaxton alanruck zh...


In [2]:
ids = df['id'].to_list()

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(feature_name):
    feature = df[feature_name].to_list()
    vectorizer = CountVectorizer(feature)
    vector = vectorizer.fit_transform(feature)
    #df[feature_name] = pd.Series(vector.toarray().data.tolist())
    return vectorizer, vector

def visualize_vector(vectorizer, vector):
    features = vectorizer.get_feature_names()
    table = pd.DataFrame(vector.toarray(), columns=features)
    table = pd.DataFrame(df['title']).join(table)
    return table

In [4]:
genres_vectorizer, genres_vector = vectorize('genres_vector')
visualize_vector(genres_vectorizer, genres_vector)

Unnamed: 0,title,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,...,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,Avatar,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,Pirates of the Caribbean: At World's End,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,John Carter,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,El Mariachi,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4799,Newlyweds,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4800,"Signed, Sealed, Delivered",0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,0
4801,Shanghai Calling,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
keywords_vectorizer, keywords_vector = vectorize('keywords_vector')
visualize_vector(keywords_vectorizer, keywords_vector)

Unnamed: 0,title,15thcentury,16thcentury,17thcentury,18thcentury,1910s,1920s,1930s,1940s,1950s,...,zeppelin,zerogravity,zeus,zipline,zombie,zombieapocalypse,zombification,zoo,zookeeper,zurich
0,Avatar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,John Carter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,El Mariachi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4799,Newlyweds,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4800,"Signed, Sealed, Delivered",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4801,Shanghai Calling,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
cast_vectorizer, cast_vector = vectorize('cast_vector')
visualize_vector(cast_vectorizer, cast_vector)

Unnamed: 0,title,50cent,aadukalamnaren,aakomonjones,aaliyah,aamirkhan,aaranthomas,aaronabrams,aaronashmore,aaronau,...,zuleykasilver,zully,zullymontero,zumajay,zumelmichel,zuoxiaoqing,zupancic,zurijames,zveescooler,zydrunasilgauskas
0,Avatar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,John Carter,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,El Mariachi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4799,Newlyweds,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4800,"Signed, Sealed, Delivered",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4801,Shanghai Calling,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
genres_similarity = cosine_similarity(genres_vector, genres_vector)
keywords_similarity = cosine_similarity(keywords_vector, keywords_vector)
cast_similarity = cosine_similarity(cast_vector, cast_vector)

In [8]:
similarity = genres_similarity + keywords_similarity + cast_similarity

In [9]:
#read ratings
df_ratings = pd.read_csv('ratings_small.csv')
#select only ratings of existing movies
ids = df['id'].tolist()
df_ratings = df_ratings[df_ratings['movieId'].isin(ids)]
#get user ratings by id
user = df_ratings.loc[df_ratings['userId'] == 2]
user

Unnamed: 0,userId,movieId,rating,timestamp
26,2,62,3.0,835355749
30,2,153,4.0,835355441
31,2,161,3.0,835355493
32,2,165,3.0,835355441
33,2,168,3.0,835355710
35,2,186,3.0,835355664
38,2,223,1.0,835355749
40,2,235,3.0,835355664
41,2,248,3.0,835355896
42,2,253,4.0,835355511


In [10]:
from sklearn.model_selection import train_test_split

X = user[['userId', 'movieId']]
y = user['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [11]:
def knn(similarity, X_train, y_train, X_test, k=15):
    predictions = []
    for i in range(len(X_test)):
        neighbours = []
        test_id = X_test.iat[i, 1]
        test_index = ids.index(test_id)
        for j in range(len(X_train)):
            train_id = X_train.iat[j, 1]
            train_index = ids.index(train_id)
            distance = similarity[test_index][train_index]
            neighbours.append((distance, j))
        neighbours.sort(key = lambda value: value[0], reverse=True)
        neighbours = neighbours[0:k]
        rating = 0
        sim_sum = 0
        for n in range(k):
            sim, neighbour_id = neighbours[n]
            rating += sim * y_train.iat[neighbour_id]
            sim_sum += sim
        y_pred = rating / sim_sum
        predictions.append(y_pred)
    return predictions

y_pred_content = knn(similarity, X_train, y_train, X_test)

In [12]:
from sklearn import metrics

def calc_metrics(y_test, y_pred):
    rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
    mse = metrics.mean_squared_error(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)

    print('RMSE:', rmse)
    print('MSE:', mse)
    print('MAE:', mae)

In [13]:
calc_metrics(y_test, y_pred_content)

RMSE: 1.093457368597129
MSE: 1.1956490169393574
MAE: 0.826017271536759


In [14]:
df = df_ratings.rename(columns={'userId': 'user', 'movieId': 'item'})
df

Unnamed: 0,user,item,rating,timestamp
13,1,2105,4.0,1260759139
16,1,2294,2.0,1260759108
26,2,62,3.0,835355749
30,2,153,4.0,835355441
31,2,161,3.0,835355493
...,...,...,...,...
99983,671,4995,4.0,1064891537
99993,671,5902,3.5,1064245507
100004,1,111,5.0,1609284358
100005,1,111,5.0,1609286487


In [15]:
from surprise import Dataset
from surprise import Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7f70083a0190>

In [16]:
from surprise import KNNBasic
sim_options = {
    "name": "pearson",
    "user_based": False
}
algo = KNNBasic(sim_options=sim_options)

In [17]:
algo.fit(data.build_full_trainset())

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f700812d490>

In [18]:
id_pairs = zip(X_test['userId'], X_test['movieId'])
y_pred_collaborative = [algo.predict(uid = userId, iid = movieId)[3] for (userId, movieId) in id_pairs]

In [20]:
calc_metrics(y_test, y_pred_collaborative)

RMSE: 0.7371309061291823
MSE: 0.5433619727708294
MAE: 0.5852474097225288


In [23]:
results = pd.DataFrame(data={'content_based': y_pred_content, 'collaborative': y_pred_collaborative, 'actual': y_test})
results

Unnamed: 0,content_based,collaborative,actual
88,2.646901,3.368536,3.0
70,3.458766,3.581842,4.0
56,3.46957,3.451011,4.0
87,3.252984,3.417277,3.0
74,3.290616,3.327197,3.0
53,3.68247,2.49057,1.0
43,3.444521,3.660838,4.0
31,3.593391,3.247801,3.0
46,3.439054,2.901395,3.0
86,3.362098,3.3998,3.0


In [24]:
from sklearn import linear_model

X = results[['content_based', 'collaborative']]
y = results['actual']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

regression = linear_model.LinearRegression()
regression.fit(X_train, y_train)

y_pred = regression.predict(X_test)

In [25]:
calc_metrics(y_test, y_pred)

RMSE: 0.5669811612430354
MSE: 0.3214676372045009
MAE: 0.5131394675453876
