In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Display limits for this notebook: print up to 20000 rows and 50 columns
# of a frame before pandas starts truncating the output.
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 50)

In [2]:
# Directory holding the JSON exports read below. It is defined once so the
# data can be relocated by editing a single line.
DATA_DIR = Path(r"C:\job\projects\mysite\movies\resources\df")

# Path objects are passed (instead of plain strings) so the argument is always
# treated as a file location; some pandas versions may interpret a str that is
# not an existing file as literal JSON text.
rated = pd.read_json(DATA_DIR / "rated.json")
wts = pd.read_json(DATA_DIR / "wts.json")
role = pd.read_json(DATA_DIR / "role.json")
actor = pd.read_json(DATA_DIR / "actor.json")
country = pd.read_json(DATA_DIR / "country.json")
genre = pd.read_json(DATA_DIR / "genre.json")
genres = pd.read_json(DATA_DIR / "genres.json")

In [3]:
# Name the index of each frame that is later joined or grouped by that key.
for frame, index_label in (
    (rated, 'movie_id'),
    (wts, 'movie_id'),
    (role, 'role_id'),
    (actor, 'actor_id'),
    (genre, 'genre_id'),
):
    frame.index.name = index_label

# Trailing column names that `rated` and `wts` have in common.
shared_movie_columns = [
    'rate_count', 'critic_rate', 'year', 'award_oscar', 'award', 'nomination',
    'bo', 'bo_usa', 'bo_no_usa', 'budget', 'director',
]
# `rated` carries 'my_rate' and 'rate_var'; `wts` carries 'wts_rate' instead.
rated.columns = ['movie_id', 'title', 'rate', 'my_rate', 'rate_var'] + shared_movie_columns
wts.columns = ['movie_id', 'title', 'rate', 'wts_rate'] + shared_movie_columns
genres.columns = ['movie_id', 'genre', 'short', 'like']

In [4]:
# Add 'genre_avg' right after 'rate_var' (position 5): the per-movie mean of
# the 'like' column of `genres`.
# NOTE(review): 'like' is assumed to be the only numeric column of `genres`
# ('genre' and 'short' look like labels) -- confirm against the data.
#  * The column is selected explicitly so the mean is not attempted on string
#    columns and a 1-D Series (not a DataFrame) is handed to insert().
#  * reindex() leaves NaN for rated movies without genre rows, where .loc
#    would raise KeyError.
#  * The guard keeps the cell re-runnable; insert() raises if the column
#    already exists.
if 'genre_avg' not in rated.columns:
    rated.insert(5, 'genre_avg', genres.groupby('movie_id')['like'].mean().reindex(rated.index))

In [5]:
# Overall means used below as the value that per-group means are pulled towards.
rate_var_mean = rated['rate_var'].mean()
rate_role_mean = role['rate'].mean()
# Percent thresholds applied to the cumulative share of observations (largest
# groups first). A lower value selects a larger reference count, which pulls
# the adjusted group means more strongly towards the overall mean.
confidence_lvl_country = 75
confidence_lvl_genres = 75
confidence_lvl_role = 99
confidence_lvl_director = 70

In [6]:
# One row per distinct tuple of country codes: mean and count of 'rate_var'
# over the rated movies sharing that tuple, largest groups first.
country_mean = (
    pd.DataFrame({
        'country': country.groupby('movie_id')['code'].apply(tuple)[rated.movie_id],
        'rate_var': rated.rate_var,
    })
    .groupby('country')
    .agg(rate_var_mean=('rate_var', 'mean'), rate_var_count=('rate_var', 'count'))
    .sort_values('rate_var_count', ascending=False)
)

# Reference count C: the group size at which the cumulative share of
# observations first exceeds confidence_lvl_country percent.
country_mean_count_confidence = country_mean.rate_var_count[
    country_mean.rate_var_count.cumsum()
    .div(country_mean.rate_var_count.sum())
    .mul(100)
    .gt(confidence_lvl_country)
].max()

# Weight n / (n + C): close to 1 for large groups, close to 0 for small ones.
country_mean_confidence = country_mean.rate_var_count / (
    country_mean.rate_var_count + country_mean_count_confidence
)

# Blend each group mean with the overall mean according to that weight.
country_mean['rate_var_adj'] = (
    country_mean.rate_var_mean * country_mean_confidence
    + rate_var_mean * (1 - country_mean_confidence)
)

In [7]:
# One row per distinct tuple of genre short codes: mean and count of
# 'rate_var' over the rated movies sharing that tuple, largest groups first.
genres_mean = (
    pd.DataFrame({
        'genres': genres.groupby('movie_id')['short'].apply(tuple)[rated.movie_id],
        'rate_var': rated.rate_var,
    })
    .groupby('genres')
    .agg(rate_var_mean=('rate_var', 'mean'), rate_var_count=('rate_var', 'count'))
    .sort_values('rate_var_count', ascending=False)
)

# Reference count C: the group size at which the cumulative share of
# observations first exceeds confidence_lvl_genres percent.
genres_mean_count_confidence = genres_mean.rate_var_count[
    genres_mean.rate_var_count.cumsum()
    .div(genres_mean.rate_var_count.sum())
    .mul(100)
    .gt(confidence_lvl_genres)
].max()

# Weight n / (n + C) given to each group's own mean.
genres_mean_confidence = genres_mean.rate_var_count / (
    genres_mean.rate_var_count + genres_mean_count_confidence
)

# Blend each group mean with the overall mean according to that weight.
genres_mean['rate_var_adj'] = (
    genres_mean.rate_var_mean * genres_mean_confidence
    + rate_var_mean * (1 - genres_mean_confidence)
)

In [8]:
# Order roles by how many ratings they have (most first) so the cumulative
# share below runs from the largest rows downwards.
role.sort_values(by='rate_count', ascending=False, inplace=True)

# Reference count C: the rate_count at which the cumulative share of ratings
# first exceeds confidence_lvl_role percent.
role_mean_count_confidence = role.rate_count[
    role.rate_count.cumsum()
    .div(role.rate_count.sum())
    .mul(100)
    .gt(confidence_lvl_role)
].max()

# Weight n / (n + C) given to each role's own rate.
role_mean_confidence = role.rate_count / (role.rate_count + role_mean_count_confidence)

# Blend each role's rate with the overall role mean according to that weight.
role['rate_adj'] = (
    role.rate * role_mean_confidence
    + rate_role_mean * (1 - role_mean_confidence)
)

# Roles ordered per movie from highest to lowest adjusted rate. Sorted once
# and reused for every "top n" mean below.
role_by_movie = role[['movie_id', 'rate_adj']].sort_values(
    ['movie_id', 'rate_adj'], ascending=[True, False])

# mean_1 / mean_2 / mean_3 / mean_5: per-movie mean of the n best adjusted
# role rates. head(n) keeps the first n rows of each group (with 'movie_id'
# still available as a column), so the second groupby works the same way
# regardless of the pandas version.
role_top_means = pd.concat(
    [
        role_by_movie.groupby('movie_id').head(top_n)
        .groupby('movie_id')['rate_adj'].mean()
        .rename(f'mean_{top_n}')
        for top_n in (1, 2, 3, 5)
    ],
    axis=1, join='inner',
)

# Attach the role means to the 'rate' of every rated and want-to-see movie
# (joined on the movie index; movies without roles are dropped).
role_rates = pd.concat([rated[['rate']], wts[['rate']]]).merge(
    role_top_means, left_index=True, right_index=True)

# NOTE(review): -0.66 and 5.55 look like the slope/intercept of a linear fit
# of mean_5 against rate -- their origin is not shown here, confirm.
role_rates['mean_5_adj'] = role_rates.mean_5 - (-0.66 * role_rates.rate + 5.55) + rate_var_mean
role_rates['mean_5_adj_var'] = role_rates.mean_5_adj - role_rates.rate

In [9]:
# Per-actor mean and count of the adjusted role rate. 'rate_adj' is selected
# before aggregating so that no other column of `role` is aggregated (and
# non-numeric columns cannot make the aggregation fail).
actor_role_stats = role.groupby('actor_id')['rate_adj'].agg(['mean', 'count'])

# Left-join the stats onto `actor` by index, then drop every row that has a
# missing value in any column (this includes actors without roles).
# NOTE: `actor` is rebound here, so running this cell a second time would
# merge the stats again and produce suffixed duplicate columns.
actor = actor.merge(actor_role_stats, how='left', left_index=True, right_index=True).dropna()

In [10]:
# One row per director: mean and count of 'rate_var' over that director's
# rated movies, largest groups first.
director_mean = (
    rated[['director', 'rate_var']]
    .groupby('director')
    .agg(rate_var_mean=('rate_var', 'mean'), rate_var_count=('rate_var', 'count'))
    .sort_values('rate_var_count', ascending=False)
)

# Reference count C: the group size at which the cumulative share of
# observations first exceeds confidence_lvl_director percent.
director_mean_count_confidence = director_mean.rate_var_count[
    director_mean.rate_var_count.cumsum()
    .div(director_mean.rate_var_count.sum())
    .mul(100)
    .gt(confidence_lvl_director)
].max()

# Weight n / (n + C) given to each director's own mean.
director_mean_confidence = director_mean.rate_var_count / (
    director_mean.rate_var_count + director_mean_count_confidence
)

# Blend each director mean with the overall mean according to that weight.
director_mean['rate_var_adj'] = (
    director_mean.rate_var_mean * director_mean_confidence
    + rate_var_mean * (1 - director_mean_confidence)
)

In [11]:
# For every rated movie, look up the adjusted 'rate_var' of its director, of
# its tuple of genres and of its tuple of countries. All joins are inner
# joins, so a movie missing from any lookup table is dropped.
rated_recommend = (
    rated[['title', 'rate', 'my_rate', 'rate_var', 'director']]
    # director -> director_var
    .merge(director_mean['rate_var_adj'].to_frame('director_var'),
           left_on='director', right_index=True)
    # movie index -> tuple of genre short codes -> genres_var
    .merge(genres.groupby('movie_id')['short'].apply(tuple).to_frame('genre'),
           left_index=True, right_index=True)
    .merge(genres_mean['rate_var_adj'].to_frame('genres_var'),
           left_on='genre', right_index=True)
    # movie index -> tuple of country codes -> country_var
    .merge(country.groupby('movie_id')['code'].apply(tuple).to_frame('country'),
           left_index=True, right_index=True)
    .merge(country_mean['rate_var_adj'].to_frame('country_var'),
           left_on='country', right_index=True)
)

In [12]:
# Correlation of 'rate_var' with each adjusted feature. It is computed once,
# on the numeric columns only, because `rated_recommend` also holds string and
# tuple columns ('title', 'genre', 'country'), which DataFrame.corr() rejects
# in recent pandas versions.
rate_var_corr = rated_recommend[
    ['rate_var', 'country_var', 'genres_var', 'director_var']
].corr()['rate_var']

# Each weight is twice the corresponding correlation.
# NOTE(review): the reason for the factor 2 is not shown here -- confirm.
country_weight = rate_var_corr['country_var'] * 2
genres_weight = rate_var_corr['genres_var'] * 2
director_weight = rate_var_corr['director_var'] * 2

In [13]:
# Predicted 'rate_var' for the rated movies: weighted sum of the three
# adjusted features minus a constant 0.4.
# NOTE(review): the origin of the 0.4 offset is not shown here -- confirm.
rate_var_recommend_rated = (
    rated_recommend.director_var * director_weight
    + rated_recommend.genres_var * genres_weight
    + rated_recommend.country_var * country_weight
    - 0.4
).to_frame('rate_var_recommend')

# Side-by-side view: actual values, prediction, prediction error
# (predicted minus actual), and the three inputs of the prediction.
rated_recommend_var = pd.concat(
    [
        rated_recommend[['title', 'rate', 'rate_var']],
        rate_var_recommend_rated,
        (rate_var_recommend_rated.rate_var_recommend - rated_recommend.rate_var).to_frame('rate_var_diff'),
        rated_recommend[['director_var', 'genres_var', 'country_var']],
    ],
    axis=1,
)

In [14]:
# Same feature lookup as for the rated movies, applied to `wts`. The lookups
# of the adjusted means are left joins; a director / genre tuple / country
# tuple without a match falls back to the overall rate_var_mean. The joins
# that attach the genre and country tuples themselves are inner joins.
wts_recommend = (
    wts[['title', 'rate', 'director']]
    .merge(director_mean['rate_var_adj'].to_frame('director_var'),
           left_on='director', right_index=True, how='left')
    .merge(genres.groupby('movie_id')['short'].apply(tuple).to_frame('genre'),
           left_index=True, right_index=True)
    .merge(genres_mean['rate_var_adj'].to_frame('genres_var'),
           left_on='genre', right_index=True, how='left')
    .merge(country.groupby('movie_id')['code'].apply(tuple).to_frame('country'),
           left_index=True, right_index=True)
    .merge(country_mean['rate_var_adj'].to_frame('country_var'),
           left_on='country', right_index=True, how='left')
    .fillna({
        'director_var': rate_var_mean,
        'genres_var': rate_var_mean,
        'country_var': rate_var_mean,
    })
)

# Predicted rate_var: weighted sum of the three features minus a constant 0.4.
# NOTE(review): the origin of the 0.4 offset is not shown here -- confirm.
wts_recommend.insert(
    2, 'rate_var_recommend',
    wts_recommend.director_var * director_weight
    + wts_recommend.genres_var * genres_weight
    + wts_recommend.country_var * country_weight
    - 0.4,
)
# Inserted at the same position, so 'rate_recommended' ends up directly before
# 'rate_var_recommend'.
wts_recommend.insert(2, 'rate_recommended', wts_recommend.rate + wts_recommend.rate_var_recommend)