In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load data

In [24]:
# Load the movie metadata table from a CSV file, resolved relative to the
# current working directory.
data = pd.read_csv('./movie_meta.csv')

In [25]:
# Restrict the frame to the identifier, rating/popularity and text columns
# that the rest of this notebook works with.
data = data[[
    'id',
    'genres',
    'vote_average',
    'vote_count',
    'popularity',
    'title',
    'overview',
    "tagline",
    "keywords",
]]

In [32]:
# Show the (rows, columns) dimensions of `data`.
# NOTE(review): this cell's execution count (32) is higher than that of the
# vote_count filtering cell (30), so the recorded output most likely describes
# the already-filtered frame, not the freshly loaded one — confirm with
# Restart Kernel -> Run All.
data.shape

(2001, 9)

In [27]:
# Preview the first five rows of the selected columns.
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,tagline,keywords
0,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
1,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",6.5,92.0,11.7129,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
2,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",6.1,34.0,3.859495,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
3,11862,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,8.387519,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
4,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."


## Create weighted score and keep the top 20% of data by vote count

In [28]:
# Trial computation of the vote-count threshold: the 0.8 quantile of
# vote_count, i.e. the value that roughly the top 20% most-voted rows reach.
tmp_m = data['vote_count'].quantile(0.8)
tmp_m

262.0

In [29]:
# Dry run of the vote-count filter: keep the rows at or above the trial
# threshold as an independent frame and check how many remain.
tmp_data = data.loc[data['vote_count'] >= tmp_m].copy()
tmp_data.shape

(2001, 9)

In [30]:
# The dry-run frame is no longer needed; release it.
del tmp_data

# Vote-count threshold: the 0.8 quantile of vote_count over the current `data`.
m = data['vote_count'].quantile(0.8)
# Keep only the rows with at least `m` votes. The explicit .copy() makes
# `data` an independent frame rather than the result of slicing the previous
# one, so the later `data.loc[:, ...] = ...` column assignments write to
# `data` itself and cannot be flagged by pandas as writes to a slice
# (SettingWithCopyWarning).
# NOTE: this cell overwrites `data` and derives `m` from it, so it is not
# idempotent: running the filter again on already-filtered rows would
# recompute the threshold on the smaller frame and drop more rows.
data = data.loc[data['vote_count'] >= m].copy()

In [31]:
# Preview the filtered frame. The original row labels are kept (the index is
# not reset), so the labels are no longer consecutive.
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,tagline,keywords
0,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
4,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
8,710,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0,14.686036,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."
10,21032,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",7.1,423.0,12.140733,Balto,An outcast half-wolf risks his life to prevent...,Part Dog. Part Wolf. All Hero.,"[{'id': 1994, 'name': 'wolf'}, {'id': 6411, 'n..."
13,524,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,Casino,The life of the gambling paradise – Las Vegas ...,No one stays at the top forever.,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'na..."


In [33]:
# C: mean vote_average over the rows currently in `data`. It becomes the
# default `C` argument of weighted_rating defined further down.
C = data['vote_average'].mean()

In [34]:
# Show the two scoring constants, one per line: mean rating C, then the
# vote-count threshold m.
print(C, m, sep='\n')

6.585607196401798
262.0


In [35]:
def weighted_rating(x, m=m, C=C):
    """Blend an item's own average rating with a baseline rating.

    Computes ``v / (v + m) * R + m / (v + m) * C`` where ``v`` is
    ``x['vote_count']`` and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like
        Anything supporting ``x['vote_count']`` and ``x['vote_average']``
        (for example one row of the movie frame).
    m : numeric, optional
        Weight given to ``C``. Defaults to the value the global ``m`` had
        when this function was defined.
    C : numeric, optional
        Baseline rating that is blended in. Defaults to the value the global
        ``C`` had when this function was defined.

    Returns
    -------
    The weighted score. The larger ``v`` is relative to ``m``, the closer the
    result is to ``R``; the smaller, the closer it is to ``C``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    denom = votes + m

    # The two weights sum to 1: one for the item's own rating, one for C.
    own_weight = votes / denom
    baseline_weight = m / denom
    return (own_weight * rating) + (baseline_weight * C)

In [36]:
# weighted_rating only performs column lookups and arithmetic, so it can be
# evaluated on the whole frame at once: x['vote_count'] and x['vote_average']
# are then full columns and the result is one score per row. This gives the
# same values as calling it row by row via apply(axis=1), without the
# per-row Python overhead.
data.loc[:, 'score'] = weighted_rating(data)

In [37]:
# Preview the frame with the newly added `score` column.
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,tagline,keywords,score
0,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.015539,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",6.869207
4,949,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",7.7,1886.0,17.924927,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...",7.564073
8,710,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",6.6,1194.0,14.686036,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...",6.59741
10,21032,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",7.1,423.0,12.140733,Balto,An outcast half-wolf risks his life to prevent...,Part Dog. Part Wolf. All Hero.,"[{'id': 1994, 'name': 'wolf'}, {'id': 6411, 'n...",6.903254
13,524,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",7.8,1343.0,10.137389,Casino,The life of the gambling paradise – Las Vegas ...,No one stays at the top forever.,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'na...",7.601763


In [39]:
# Check the dimensions again; the column count now includes `score`.
data.shape

(2001, 10)

## Extract genres and keywords as string (dict to str)

In [40]:
# Cast the `genres` and `keywords` columns to pandas' "string" dtype and write
# the result back through .loc.
# NOTE(review): whether a `.loc[:, cols] = ...` assignment changes the column
# dtype or only overwrites the values in place depends on the pandas version —
# confirm with `data.dtypes` that this cell has the intended effect.
data.loc[:,('genres', 'keywords')] = data[['genres', 'keywords']].astype("string")

In [41]:
# Drop every row that has a missing value in any column. This covers all
# columns of `data`, so rows lacking e.g. an overview or a tagline are removed
# as well, not only rows lacking genres/keywords.
data = data.dropna(axis=0, how='any')

In [42]:
# Each cell holds the textual repr of a Python list of dicts; parse it back
# into real Python objects, element by element. literal_eval only accepts
# Python literals, so no arbitrary code in the CSV is executed.
data.loc[:,'genres'] = data['genres'].map(literal_eval)
data.loc[:,'keywords'] = data['keywords'].map(literal_eval)

In [43]:
# Collapse each list of {'id': ..., 'name': ...} dicts into a single
# space-separated string of the 'name' values, in one pass per column.
data.loc[:,'genres'] = data['genres'].apply(
    lambda entries: " ".join(entry['name'] for entry in entries)
)
data.loc[:,'keywords'] = data['keywords'].apply(
    lambda entries: " ".join(entry['name'] for entry in entries)
)

In [45]:
# Preview the frame after `genres` and `keywords` were flattened to plain text.
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,tagline,keywords,score
0,8844,Adventure Fantasy Family,6.9,2413.0,17.015539,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,board game disappearance based on children's b...,6.869207
4,949,Action Crime Drama Thriller,7.7,1886.0,17.924927,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,robbery detective bank obsession chase shootin...,7.564073
8,710,Adventure Action Thriller,6.6,1194.0,14.686036,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,cuba falsely accused secret identity computer ...,6.59741
10,21032,Family Animation Adventure,7.1,423.0,12.140733,Balto,An outcast half-wolf risks his life to prevent...,Part Dog. Part Wolf. All Hero.,wolf dog-sledding race alaska dog goose bear a...,6.903254
13,524,Drama Crime,7.8,1343.0,10.137389,Casino,The life of the gambling paradise – Las Vegas ...,No one stays at the top forever.,poker drug abuse 1970s overdose illegal prosti...,7.601763


In [46]:
# Write the preprocessed frame to a CSV file relative to the current working
# directory; index=False leaves the row labels out of the file.
data.to_csv('./movie_preprocessed.csv', index = False)