# Non-personalised Recommendations

## Import packages

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Get data

In [7]:
ratings = pd.read_csv('https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/ratings.dat',sep='::',names=['user_id', 'movie_id', 'rating', 'rating_timestamp'],dtype={'user_id':'Int64', 'movie_id':'str', 'rating':'Int64', 'rating_timestamp':'Int64'})

  """Entry point for launching an IPython kernel.


In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,1381006850
1,2,499549,9,1376753198
2,2,1305591,8,1376742507
3,2,1428538,1,1371307089
4,3,75314,1,1595468524


In [10]:
genres = pd.read_csv('https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/movies.dat',sep='::',names=['movie_id', 'title', 'genre'],dtype={'movie_id':'str', 'title':'str', 'genre':'str'},encoding='utf-8')

  """Entry point for launching an IPython kernel.


In [11]:
genres.head()

Unnamed: 0,movie_id,title,genre
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,91,Le manoir du diable (1896),Short|Horror


## Clean data

In [12]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898623 entries, 0 to 898622
Data columns (total 4 columns):
user_id             898623 non-null int64
movie_id            898623 non-null object
rating              898623 non-null int64
rating_timestamp    898623 non-null int64
dtypes: int64(3), object(1)
memory usage: 27.4+ MB


In [13]:
ratings.describe()

Unnamed: 0,user_id,rating,rating_timestamp
count,898623.0,898623.0,898623.0
mean,35304.105713,7.31676,1461045000.0
std,20351.697825,1.85298,70544750.0
min,1.0,0.0,1362062000.0
25%,17998.0,6.0,1396694000.0
50%,35136.0,8.0,1451326000.0
75%,52537.0,9.0,1516156000.0
max,70235.0,10.0,1617055000.0


In [19]:
ratings.groupby('rating').count()['user_id']

rating
0        276
1      10570
2       8950
3      15085
4      27516
5      67607
6     117059
7     201249
8     216821
9     127380
10    106110
Name: user_id, dtype: int64

In [14]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36991 entries, 0 to 36990
Data columns (total 3 columns):
movie_id    36991 non-null object
title       36991 non-null object
genre       36991 non-null object
dtypes: object(3)
memory usage: 867.1+ KB


In [15]:
genres.describe()

Unnamed: 0,movie_id,title,genre
count,36991,36991,36991
unique,36991,36923,2799
top,5507860,Home (2016),Drama
freq,1,4,3836


In [16]:
genres[genres['title']=='Home (2016)']

Unnamed: 0,movie_id,title,genre
25057,2545384,Home (2016),Drama|Horror|Thriller
29205,4047846,Home (2016),Drama
32064,5593606,Home (2016),Short|Drama
32731,5969228,Home (2016),Short|Drama


There are multiple films in 2016 titled Home

In [23]:
genres.groupby('genre').count().sort_values('movie_id', ascending=False)['movie_id'].head(10)

genre
Drama                   3836
Comedy                  2234
Documentary             1563
Comedy|Drama            1426
Drama|Romance           1247
Comedy|Drama|Romance    1014
Comedy|Romance           878
Horror                   872
Horror|Thriller          592
Drama|Thriller           524
Name: movie_id, dtype: int64

In [31]:
def count_genres(list):
    genre_cnt = {}
    
    for item in genres['genre']:
        item_list = item.split('|')
        for i in item_list:
            if (i in genre_cnt):
                genre_cnt[i] += 1
            else:
                genre_cnt[i] = 1
                
    return genre_cnt

In [32]:
genre_count = count_genres(genres['genre'])

In [57]:
genre_count_df = pd.DataFrame.from_dict(genre_count, orient='index', columns=['count']).sort_values('count', ascending=False).reset_index()

In [58]:
genre_count_df.head()

Unnamed: 0,index,count
0,Drama,18821
1,Comedy,11526
2,Thriller,7782
3,Romance,6341
4,Action,5542


nan values are not filtered as these films can appear in general category

## Create charts

TOP10 charts for general category and all genres with at least a hundred films.
TOP10 is defined by number of ratings.

In [41]:
ratings_aggr = ratings.groupby('movie_id').count()['rating']

In [45]:
ratings_aggr = ratings_aggr.reset_index().sort_values('rating', ascending=False)

In [48]:
#genres[genres['genre'].str.contains('Short')]

In [53]:
#pd.merge(ratings_aggr, genres, on='movie_id')

In [54]:
def top_n_chart(movie_list,ratings_df, n=10):
    '''
    Returns a df: index, movie_id, movie_title, number_ratings
    '''
    
    return pd.merge(ratings_df, movie_list, on='movie_id').iloc[0:n]
    

In [56]:
top_n_chart(genres[genres['genre'].str.contains('Short')], ratings_aggr,10)

Unnamed: 0,movie_id,rating,title,genre
0,10367276,96,The Rat (2019),Short|Drama
1,3472226,90,Kung Fury (2015),Short|Action|Comedy|Fantasy|Sci-Fi
2,6073176,36,The White Helmets (2016),Documentary|Short|Crime|War
3,6620846,32,Wasati (2016),Short|Adventure|Comedy
4,5613056,31,Piper (2016),Animation|Short|Family
5,5262972,28,Avengers: Age of Ultron Parody (2015),Short|Comedy
6,56119,27,La jetée (1962),Short|Drama|Romance|Sci-Fi
7,2388725,27,Paperman (2012),Animation|Short|Comedy|Family|Romance
8,10516984,24,Anima (2019),Short|Music
9,8075496,23,Bao (2018),Animation|Short|Family|Fantasy


In [72]:
def create_chart_dict(genre_df, movie_list, ratings_df, n=10):
    chart_dict = {}
    
    # global list
    chart_dict['Global'] = top_n_chart(movie_list,ratings_df, n)
    
    # genre charts
    for gen in genre_df['index']:
        chart_dict[gen] = top_n_chart(movie_list[movie_list['genre'].str.contains(gen)],ratings_df, n)
    
    return chart_dict

In [73]:
chart_dict = create_chart_dict(genre_count_df, genres, ratings_aggr, n=10)

In [79]:
chart_dict['nan']

Unnamed: 0,movie_id,rating,title,genre
0,1199456,4,Eid Mubarak (1965),
1,166486,4,Arak el-balah (1998),
2,2424752,4,Belenggu (2012),
3,2857942,4,Tang Wong (2013),
4,2915232,3,Shotgun Garfunkel (2013),
5,8079546,2,Artist at Work: Annie Wood (2018),
6,210156,2,The Lion's Den (1998),
7,7064776,2,Khamis wa Jumah: Huroob Ijbari (2017),
8,1654082,2,Un mundo cuadrado (2011),
9,233905,2,Hysteria (1996),


Filtered dictionary for categories with at least a hundred films

In [77]:
popular_categories = genre_count_df[genre_count_df['count'] >= 100]['index']

In [80]:
popular_categories_dict = { pop_key: chart_dict[pop_key] for pop_key in popular_categories }

In [83]:
popular_categories_dict.pop('nan',None)

Unnamed: 0,movie_id,rating,title,genre
0,1199456,4,Eid Mubarak (1965),
1,166486,4,Arak el-balah (1998),
2,2424752,4,Belenggu (2012),
3,2857942,4,Tang Wong (2013),
4,2915232,3,Shotgun Garfunkel (2013),
5,8079546,2,Artist at Work: Annie Wood (2018),
6,210156,2,The Lion's Den (1998),
7,7064776,2,Khamis wa Jumah: Huroob Ijbari (2017),
8,1654082,2,Un mundo cuadrado (2011),
9,233905,2,Hysteria (1996),


In [84]:
popular_categories_dict.keys()

dict_keys(['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Crime', 'Horror', 'Documentary', 'Adventure', 'Mystery', 'Sci-Fi', 'Fantasy', 'Family', 'Biography', 'Short', 'History', 'Animation', 'War', 'Music', 'Sport', 'Musical', 'Western', 'Film-Noir', 'News'])

## Export charts

In [85]:
pwd

'C:\\Users\\Rendszergazda\\Documents\\GitHub\\recommander'

In [87]:
ls

 A meghajt˘ban (C) l‚v‹ k”tetnek nincs cˇmk‚je.
 A k”tet sorozatsz ma: 582B-63DE

 C:\Users\Rendszergazda\Documents\GitHub\recommander tartalma:

2021.03.30.  14:27    <DIR>          .
2021.03.30.  14:27    <DIR>          ..
2021.03.30.  11:06    <DIR>          .ipynb_checkpoints
2021.03.30.  11:52    <DIR>          charts
2021.03.20.  21:02    <DIR>          data
2021.03.30.  11:24            79˙927 Data Analysis.ipynb
2021.03.25.  20:30    <DIR>          moviegeek
2021.03.30.  14:27            39˙255 Non-personalised Recommendations.ipynb
2021.03.30.  11:11    <DIR>          solutions
               2 f jl             119˙182 b jt
               7 k”nyvt r  37˙830˙103˙040 b jt szabad


In [90]:
for chart_key in popular_categories_dict.keys():
    popular_categories_dict[chart_key].to_csv("{}/{}.csv".format('charts', chart_key), index=False)