# MovieLens dataset report
## Prepare

In [None]:
from movielens_analysis import Movies, Links, Ratings, Tags, Statistics

: 

In [None]:
%ls
%ls ml-latest-small

: 

In [None]:
# Paths of the four CSV files inside the ml-latest-small directory,
# relative to the notebook's working directory.
MOVIES_CSV, LINKS_CSV, RATINGS_CSV, TAGS_CSV = (
    f'ml-latest-small/{table}.csv'
    for table in ('movies', 'links', 'ratings', 'tags')
)

: 

## Movies analysis

### Distribution by release year

In [None]:
# Build the Movies object from the movies CSV path. Later cells pass this
# object to Links, Ratings.Movies and Ratings.Users.
movies = Movies(MOVIES_CSV)

: 

In [None]:
%timeit movies.dist_by_release()

: 

In [None]:
dist_by_release = movies.dist_by_release()

# Print the key/value pairs (presumably release year -> movie count) as a
# tab-separated grid with five entries per row. Counting from 1 makes every
# fifth entry end its row.
for position, (year, count) in enumerate(dist_by_release.items(), start=1):
    print(f'{year} : {count}', end='\t')
    if position % 5 == 0:
        print()

# Finish the last (possibly partial) row and leave a blank line after it.
print('\n')

: 

### Distribution of genres

In [None]:
%timeit movies.dist_by_genres()

: 

In [None]:
# Rebuild the Movies object so this cell can run without the earlier ones.
movies = Movies(MOVIES_CSV)

# One "<key> : <value>" line per entry returned by dist_by_genres().
for genre, count in movies.dist_by_genres().items():
    print('{} : {}'.format(genre, count))

print()

: 

### Top 30 films by number of genres

In [None]:
%timeit movies.most_genres(30)

: 

In [None]:
# Rebuild the Movies object so this cell can run without the earlier ones.
movies = Movies(MOVIES_CSV)

# Two-column table: the key left-aligned in 70 characters, then the value.
# The header previously read "Ganres number" (typo).
print(f'{"Film":<70}Genres number')
for film, genres_number in movies.most_genres(30).items():
    print(f'{film:<70}{genres_number}')

print()

: 

## Links analysis

### Get imdb information

In [None]:
# Build the Links object from the links CSV path and the `movies` object
# created earlier in the notebook.
links = Links(LINKS_CSV, movies)

: 

In [None]:
%timeit links.get_imdb([1, 3, 5, 7, 15], ['Director', 'Budget', 'Gross worldwide', 'Runtime'])

: 

In [None]:
# Request four fields for five sample movie ids.
fields = ['Director', 'Budget', 'Gross worldwide', 'Runtime']
imdb_info = links.get_imdb([1, 3, 5, 7, 15], fields)

# Header row first, then one row per returned entry; columns are separated
# by two tabs.
column_sep = '\t\t'
print('MovieId', *fields, sep=column_sep)
for row in imdb_info:
    print(*row, sep=column_sep)

: 

### Top directors

In [None]:
%timeit -r 1 -n 1 links.top_directors(20)

: 

In [None]:
top_directors = links.top_directors(20)

# Two-column table; a None key is shown as the text 'Null'.
print(f'{"Director":<20}Films count')
for director, films_count in top_directors.items():
    label = 'Null' if director is None else director
    print(f'{label:<20}{films_count}')

: 

### Most expensive films

In [None]:
%timeit links.most_expensive(20)

: 

In [None]:
most_expensive = links.most_expensive(20)

# Table labelled Film / Budget, first column padded to 40 characters.
print(f'{"Film":<40}Budget')
for film, budget in most_expensive.items():
    print('{:<40}{}'.format(film, budget))

: 

### Most profitable films

In [None]:
%timeit links.most_profitable(20)

: 

In [None]:
most_profitable = links.most_profitable(20)

# Table labelled Film / Profit, first column padded to 40 characters.
print(f'{"Film":<40}Profit')
for film, profit in most_profitable.items():
    print('{:<40}{}'.format(film, profit))

: 

### Longest films

In [None]:
%timeit links.longest(20)

: 

In [None]:
longest = links.longest(20)

# Table labelled Film / Runtime, first column padded to 40 characters.
print(f'{"Film":<40}Runtime')
for film, runtime in longest.items():
    print('{:<40}{}'.format(film, runtime))

: 

### Top films by cost per minute

In [None]:
%timeit links.top_cost_per_minute(20)

: 

In [None]:
top_cost_per_minute = links.top_cost_per_minute(20)

# Table labelled Film / Cost per minute, first column padded to 40 characters.
print(f'{"Film":<40}Cost per minute')
for film, cost_per_minute in top_cost_per_minute.items():
    print('{:<40}{}'.format(film, cost_per_minute))

: 

## Ratings.Movies analysis
### Distribution of ratings count by year

In [None]:
# Build the Ratings object from the ratings CSV path, then the
# Ratings.Movies view from it together with the `movies` object created earlier.
ratings = Ratings(RATINGS_CSV)
movies_ratings = Ratings.Movies(ratings, movies)

: 

In [None]:
%timeit movies_ratings.dist_by_year()

: 

In [None]:
dist_by_year = movies_ratings.dist_by_year()

# Table labelled Year / Ratings count, first column padded to 6 characters.
print(f'{"Year":<6}Ratings count')
for year, ratings_count in dist_by_year.items():
    print('{:<6}{}'.format(year, ratings_count))

: 

### Distribution of ratings count by rating value

In [None]:
%timeit movies_ratings.dist_by_rating()

: 

In [None]:
dist_by_rating = movies_ratings.dist_by_rating()

# Table labelled Rating value / Ratings count, first column padded to 15 characters.
print(f'{"Rating value":<15}Ratings count')
for rating_value, ratings_count in dist_by_rating.items():
    print('{:<15}{}'.format(rating_value, ratings_count))

: 

### Top movies by rating (average)

In [None]:
%timeit movies_ratings.top_by_ratings(30)

: 

In [None]:
# No metric argument is passed here; the heading and table label call this
# the average rating.
top_by_ratings = movies_ratings.top_by_ratings(30)

# Table labelled Movie / Average rating, first column padded to 75 characters.
print(f'{"Movie":<75}Average rating')
for movie, average_rating in top_by_ratings.items():
    print('{:<75}{}'.format(movie, average_rating))

: 

### Top movies by rating (median)

In [None]:
%timeit movies_ratings.top_by_ratings(30, metric=Statistics.median)

: 

In [None]:
# Same call as the average-rating cell, but with Statistics.median as the metric.
top_by_ratings = movies_ratings.top_by_ratings(30, metric=Statistics.median)

# Table labelled Movie / Median rating, first column padded to 75 characters.
print(f'{"Movie":<75}Median rating')
for movie, median_rating in top_by_ratings.items():
    print('{:<75}{}'.format(movie, median_rating))

: 

### Top controversial movies

In [None]:
%timeit movies_ratings.top_controversial(30)

: 

In [None]:
top_controversial = movies_ratings.top_controversial(30)

# Table labelled Movie / Rating variance, first column padded to 75 characters.
print(f'{"Movie":<75}Rating variance')
for movie, rating_variance in top_controversial.items():
    print('{:<75}{}'.format(movie, rating_variance))

: 

## Ratings.Users analysis
### Distribution of users by ratings count

In [None]:
# Build the Ratings.Users view from the `ratings` and `movies` objects
# created earlier in the notebook.
users_ratings = Ratings.Users(ratings, movies)

: 

In [None]:
%timeit users_ratings.dist_by_ratings_number()

: 

In [None]:
dist_by_ratings_number = users_ratings.dist_by_ratings_number()

# Table labelled User / Number of ratings, first column padded to 8 characters.
print(f'{"User":<8}Number of ratings')
for user, ratings_number in dist_by_ratings_number.items():
    print('{:<8}{}'.format(user, ratings_number))

: 

### Distribution of users by ratings values (average)

In [None]:
%timeit users_ratings.dist_by_ratings_values()

: 

In [None]:
# No metric argument is passed here; the heading and table label call this
# the average rating value.
dist_by_ratings_values = users_ratings.dist_by_ratings_values()

# Table labelled User / Average rating value, first column padded to 8 characters.
print(f'{"User":<8}Average rating value')
for user, average_value in dist_by_ratings_values.items():
    print('{:<8}{}'.format(user, average_value))

: 

### Distribution of users by ratings values (median)

In [None]:
%timeit users_ratings.dist_by_ratings_values(metric=Statistics.median)

: 

In [None]:
# Same call as the average cell, but with Statistics.median as the metric.
dist_by_ratings_values = users_ratings.dist_by_ratings_values(metric=Statistics.median)

# Table labelled User / Median of rating value, first column padded to 8 characters.
print(f'{"User":<8}Median of rating value')
for user, median_value in dist_by_ratings_values.items():
    print('{:<8}{}'.format(user, median_value))

: 

### Top users by variance of their ratings

In [None]:
%timeit users_ratings.top_by_variance(30)

: 

In [None]:
top_by_variance = users_ratings.top_by_variance(30)

# Table labelled User / Variance of ratings, first column padded to 8 characters.
print(f'{"User":<8}Variance of ratings')
for user, variance in top_by_variance.items():
    print('{:<8}{}'.format(user, variance))

: 

## Tags analysis

### Tags with the most words

In [None]:
# Build the Tags object from the tags CSV path; every cell in the tags
# section calls methods on it.
tags = Tags(TAGS_CSV)

: 

In [None]:
%timeit tags.most_words(30)

: 

In [None]:
# Stored under its own name: this cell previously assigned the result to
# `dist_by_release`, overwriting the release-year distribution computed in
# the movies section.
most_words = tags.most_words(30)

# Table labelled Tag / Number of words, first column padded to 90 characters.
print(f'{"Tag":<90}Number of words')
for tag, words_number in most_words.items():
    print(f'{tag:<90}{words_number}')

: 

### Longest tags

In [None]:
%timeit tags.longest(30)

: 

In [None]:
# NOTE: this reuses the name `longest` from the links section.
longest = tags.longest(30)

# Header with an underline, then one returned item per line.
print('Tag\n---------')
for longest_tag in longest:
    print(longest_tag)

: 

### Most words and longest

In [None]:
%timeit tags.most_words_and_longest(30)

: 

In [None]:
most_words_and_longest = tags.most_words_and_longest(30)

# Header with an underline, then one returned item per line.
print('Tag\n---------')
for common_tag in most_words_and_longest:
    print(common_tag)

: 

### Most popular

In [None]:
%timeit tags.most_popular(20)

: 

In [None]:
most_popular = tags.most_popular(20)

# Table labelled Tag / Usage number, first column padded to 30 characters.
print(f'{"Tag":<30}Usage number')
for tag, usage_number in most_popular.items():
    print('{:<30}{}'.format(tag, usage_number))

: 

### Tags with (some word)

In [None]:
# Word passed to tags.tags_with() in the following cells; change it here to
# query a different word.
word_for_tag = 'history'

: 

In [None]:
%timeit tags.tags_with(word_for_tag)

: 

In [None]:
tags_with = tags.tags_with(word_for_tag)

# Header naming the queried word, an underline, then one returned item per line.
print(f'Tags with {word_for_tag}\n---------')
for matching_tag in tags_with:
    print(matching_tag)

: 