# Data analysis

> TODO

## Notebook config

In [1]:
# Version tag of this experiment; it is embedded in PREFIX below.
VERSION = "v2"  # TODO
# Shared prefix used to build the bucket name in the next cell.
PREFIX = f"rec-bandits-{VERSION}"  # TODO

print(f"PREFIX: {PREFIX}")

PREFIX: rec-bandits-v2


In [2]:
# staging GCS
# `!cmd` captures the command's stdout as a list of lines; the first line is
# used as the active gcloud project ID.
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'

# Download the shared environment file from the bucket, print it, and execute
# it in this namespace (`config.n` is the captured output joined by newlines).
# NOTE(review): exec() runs whatever is stored at that GCS path with the
# kernel's privileges -- only point this at a bucket you control. As the
# printed output shows, the executed file re-assigns PROJECT_ID, PREFIX,
# BUCKET_NAME, BUCKET_URI and defines further constants such as DATA_PATH.
config = !gsutil cat {BUCKET_URI}/config/notebook_env.py
print(config.n)
exec(config.n)


PROJECT_ID               = "hybrid-vertex"
PROJECT_NUM              = "934903580331"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "934903580331-compute@developer.gserviceaccount.com"

PREFIX                   = "rec-bandits-v2"
VERSION                  = "v2"

BUCKET_NAME              = "rec-bandits-v2-hybrid-vertex-bucket"
BUCKET_URI               = "gs://rec-bandits-v2-hybrid-vertex-bucket"
DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://rec-bandits-v2-hybrid-vertex-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"
DATA_PATH_KFP_DEMO       = "gs://rec-bandits-v2-hybrid-vertex-bucket/data/kfp_demo_data/u.data"

VPC_NETWORK_FULL         = "projects/934903580331/global/networks/ucaip-haystack-vpc-network"

BIGQUERY_DATASET_NAME    = "mvlens_rec_bandits_v2"
BIGQUERY_TABLE_NA

In [3]:
# stdlib
import logging
import sys
import warnings

# Suppress log records at WARNING level and below, and every Python warning.
logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')

# Add the parent directory to the import path so the `src` package resolves.
sys.path.append("..")

# this repo
from src.data import data_config

In [4]:
# google cloud
from google.cloud import aiplatform, storage

# Vertex AI SDK: set the default project and location for later SDK calls.
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Cloud Storage client bound to the same project.
storage_client = storage.Client(project=PROJECT_ID)
# bucket = storage_client.bucket(BUCKET_NAME)

In [5]:
# Dataset family plus a size tag; the tag is the last path component of the
# example-gen GCS path configured in data_config.
DATA_SET = "movielens"
TAG      = data_config.EXAMPLE_GEN_GCS_PATH.split("/")[-1]
DATA_TAG = f"{DATA_SET}/{TAG}"

print(f"TAG     : {TAG}")
print(f"DATA_TAG: {DATA_TAG}")

# List what is stored under the tagged data prefix in the bucket.
! gsutil ls $DATA_PATH/$DATA_TAG

TAG     : m1m
DATA_TAG: movielens/m1m
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/train/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/val/
gs://rec-bandits-v2-hybrid-vertex-bucket/data/movielens/m1m/vocabs/


## imports 

In [6]:
import csv
import os
import re
import datetime
import collections
import statistics

from matplotlib import pyplot as plt
from matplotlib import dates
import pandas
import seaborn as sns

## Set data path 

```
├── __init__.py
├── data
│   ├── examples
│   └── raw
│       └── datasets
│           ├── ml-1m
│           │   ├── README
│           │   ├── movies.dat
│           │   ├── ratings.dat
│           │   └── users.dat
```

In [19]:
# Directory holding the raw MovieLens files, relative to this notebook.
LOCAL_MV_PATH = "../src/data/data/raw/datasets/ml-1m"

# users.dat serves as a sample file for checking the record layout below.
test_path = os.path.join(LOCAL_MV_PATH, "users.dat")
test_path

'../src/data/data/raw/datasets/ml-1m/users.dat'

In [22]:
# Print only the first record of the sample file to inspect its delimiter
# and field order.
with open(test_path, encoding='ISO-8859-1') as sample_file:
    for line in sample_file:
        print(line)
        break

1::F::1::10::48067



## parser function

In [23]:
def read_data_100k(dir_path):
    """
    Read the MovieLens 100k dataset.

    The files are parsed in the 100k row layout: '|'-delimited movie and user
    rows and tab-delimited rating rows. Rows in any other layout (for example
    '::'-delimited ml-1m rows) raise a ValueError when the split fields are
    unpacked.

    Args:
    dir_path: a string, which is the path to the directory that holds
        'movies.dat', 'users.dat' and 'ratings.dat'.

    Returns:
    A tuple of three elements, in this order:
      a dict of movies data
      a dict of ratings data
      a dict of users data

    The movies data dict maps a movie_id string to a dict with keys: "title",
        "genres", "release_year", "release_date". "release_year" is None when
        the title does not contain exactly one "(YYYY)" group (the row is also
        printed), and "release_date" is None when the release date field is
        not 11 characters long.
    The ratings data dict contains keys "user_id_list", "movie_id_list",
        "rating_list", "timestamp_list". Each entry is a list of corresponding
        values. For instance, user_id_list[i] corresponds to the user_id value
        of the ith record. Timestamps are naive datetimes built with
        datetime.datetime.fromtimestamp.
    The users data dict maps a user_id string to a dict with keys: "age",
        "gender", "occupation", "zip_code".

    Raises:
    ValueError: if a row does not split into the expected number of fields
        or a numeric field cannot be parsed.
    KeyError: if a release date uses a month abbreviation that is not in
        the three-letter month table below.
    """
    movies_file_path = os.path.join(dir_path, 'movies.dat')
    users_file_path = os.path.join(dir_path, 'users.dat')
    ratings_file_path = os.path.join(dir_path, 'ratings.dat')

    # A list for converting genre index to genre label for 100K dataset.
    # "Children's" changed to "Children" in the list for consistency.
    all_genre_list = [
        'Unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]
    # A dict for converting month string to month integer
    month_to_int = {
        'Jan': 1,
        'Feb': 2,
        'Mar': 3,
        'Apr': 4,
        'May': 5,
        'Jun': 6,
        'Jul': 7,
        'Aug': 8,
        'Sep': 9,
        'Oct': 10,
        'Nov': 11,
        'Dec': 12,
    }
    movie_data_map = {}
    with open(movies_file_path, encoding='ISO-8859-1') as movies_file:
        for line in movies_file:
            # Row format: <movie id>|<movie title>|<release date>|\
            # <video release date>|<IMDb URL>|<19 fields for each genre>|.
            (
                movie_id,
                movie_title,
                release_date_str,
                _,
                _,
                *genre_bools,
            ) = line.strip().split('|')
            genre_list = [
                all_genre_list[index]
                for index, genre_indicator in enumerate(genre_bools)
                if genre_indicator == '1'
            ]
            # Search for release year in title. The pattern is a raw string so
            # that '\(' is not an invalid string escape; re.findall always
            # returns a list, so only its length needs checking.
            search_result = re.findall(r'\([12][09][0-9][0-9]\)', movie_title)
            if len(search_result) != 1:
                print('Found released title anomaly')
                print(line)
                release_year = None
            else:
                # Strip the surrounding parentheses: "(1995)" -> 1995.
                release_year = int(search_result[0][1:5])
            # Dates look like "01-Jan-1995" (11 characters); anything else is
            # treated as missing.
            if len(release_date_str) == 11:
                day_str, month_str, year_str = release_date_str.split('-')
                release_date = datetime.date(
                    int(year_str),
                    month_to_int[month_str],
                    int(day_str)
                )
            else:
                release_date = None

            movie_data_map[movie_id] = {
                'title': movie_title,
                'genres': genre_list,
                'release_year': release_year,
                'release_date': release_date,
            }

    user_data_map = {}
    with open(users_file_path, mode='r', encoding='ISO-8859-1') as users_file:
        for line in users_file:
            # Row format: <user id>|<age>|<gender>|<occupation>|<zip code>
            (
                user_id,
                age,
                gender_str,
                occupation_str,
                zip_code,
            ) = line.strip().split('|')

            user_data_map[user_id] = {
                'gender': gender_str,
                'age': int(age),
                'occupation': occupation_str,
                'zip_code': zip_code,
            }

    user_id_list = []
    movie_id_list = []
    rating_list = []
    timestamp_list = []
    with open(ratings_file_path, mode='r', encoding='ISO-8859-1') as ratings_file:
        for line in ratings_file:
            # Row format: <user id>\t<movie id>\t<rating>\t<unix timestamp>
            user_id, movie_id, rating, timestamp = line.strip().split('\t')
            user_id_list.append(user_id)
            movie_id_list.append(movie_id)
            rating_list.append(float(rating))
            timestamp_list.append(datetime.datetime.fromtimestamp(int(timestamp)))

    rating_dict = {
        'user_id_list': user_id_list,
        'movie_id_list': movie_id_list,
        'rating_list': rating_list,
        'timestamp_list': timestamp_list,
    }

    return movie_data_map, rating_dict, user_data_map

## Utility functions

In [24]:
def gen_histogram(
    value_list,
    title=None,
    x_label=None,
    y_label=None,
    dpi=120,
    **kwargs,
):
    """Draw a histogram of `value_list` with mean and +/- one stdev guide lines.

    Args:
      value_list: numeric values to plot; `statistics.stdev` needs at least
        two of them.
      title, x_label, y_label: figure title and axis labels.
      dpi: resolution of the new figure.
      **kwargs: forwarded unchanged to `plt.hist` (e.g. `bins`).
    """
    center = statistics.mean(value_list)
    spread = statistics.stdev(value_list)

    plt.figure(dpi=dpi)
    plt.hist(value_list, **kwargs)
    # Dashed line at the mean, dotted lines one standard deviation either side.
    for position, line_style in (
        (center, 'dashed'),
        (center - spread, 'dotted'),
        (center + spread, 'dotted'),
    ):
        plt.axvline(position, ls=line_style)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

def gen_bar_graph(
    item_list,
    title=None,
    x_label=None,
    y_label=None,
    figsize=None,
    dpi=120,
    **kwargs,
):
    """Draw a bar chart of how often each distinct item occurs in `item_list`.

    Bars are ordered from the most to the least frequent item; items with
    equal counts keep their first-seen order.

    Args:
      item_list: iterable of hashable items to count. An empty input yields
        an empty chart.
      title, x_label, y_label: figure title and axis labels.
      figsize: figure size passed to `plt.figure`.
      dpi: resolution passed to `plt.figure`.
      **kwargs: forwarded unchanged to `plt.bar` (e.g. `width`).
    """
    ranked = sorted(
        collections.Counter(item_list).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
    labels, counts = zip(*ranked) if ranked else ((), ())
    # Use the caller's dpi rather than a hard-coded value.
    plt.figure(dpi=dpi, figsize=figsize)
    plt.bar(labels, counts, **kwargs)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

def gen_count_distribution_graph(item_list, **kwargs):
    """Plot the distribution of per-item occurrence counts and return the counts.

    Counts how often each distinct item appears in `item_list`, draws a
    histogram of those counts via `gen_histogram` (which receives `**kwargs`),
    and returns the list of counts in first-seen item order.
    """
    count_list = list(collections.Counter(item_list).values())
    gen_histogram(count_list, **kwargs)
    return count_list

def dict_to_list(input_dict, keys, none_check=None):
    """Turn a dict of records into a dict of per-key value lists.

    Args:
      input_dict: maps any id to a record dict that contains every key in `keys`.
      keys: record keys to extract; the result uses `<key>_list` as key.
      none_check: optional collection of keys whose None values are dropped.
        Dropping is done per column, so a filtered column can be shorter
        than the others and is then no longer row-aligned with them.

    Returns:
      A dict mapping `<key>_list` to the values of that key in record order.
    """
    filtered_keys = () if none_check is None else none_check
    final_dict = {}
    for key in keys:
        drop_none = key in filtered_keys
        final_dict[key + '_list'] = [
            item[key]
            for item in input_dict.values()
            if not (drop_none and item[key] is None)
        ]
    return final_dict

def categorical_boxplot(
    num_col,
    cat_col,
    num_name,
    cat_name,
    title=None,
    figsize=None,
    showmeans=True,
    order=None,
):
    """Draw one box per category showing the distribution of a numeric column.

    Args:
      num_col: numeric values, row-aligned with `cat_col`.
      cat_col: category label of each row.
      num_name, cat_name: column names, also used as the y / x variables.
      title: figure title.
      figsize: size of the new figure.
      showmeans, order: forwarded to `sns.boxplot`.
    """
    # Two rows (categories, values) transposed into two columns.
    frame = pandas.DataFrame([cat_col, num_col]).T
    frame.columns = [cat_name, num_name]

    plt.figure(figsize=figsize)
    sns.boxplot(
        data=frame,
        x=cat_name,
        y=num_name,
        order=order,
        showmeans=showmeans,
    )
    plt.title(title)

def get_denormalized_col_dict(rating_col_dict, movie_dict, user_dict):
    """Join user and movie attributes onto every rating record.

    Args:
      rating_col_dict: column dict with row-aligned 'user_id_list' and
        'movie_id_list' (plus any other columns, which are passed through).
      movie_dict: maps movie_id to a dict with 'genres', 'release_year',
        'release_date'.
      user_dict: maps user_id to a dict with 'gender', 'age', 'occupation',
        'zip_code'.

    Returns:
      A new dict with every entry of `rating_col_dict` plus one row-aligned
      `movie_*_list` / `user_*_list` column per joined attribute.

    Raises:
      KeyError: if a rating references an unknown user or movie id.
    """
    user_fields = ('gender', 'age', 'occupation', 'zip_code')
    movie_fields = ('genres', 'release_year', 'release_date')
    user_columns = {field: [] for field in user_fields}
    movie_columns = {field: [] for field in movie_fields}

    for row, user_id in enumerate(rating_col_dict['user_id_list']):
        user_info = user_dict[user_id]
        for field in user_fields:
            user_columns[field].append(user_info[field])

        movie_info = movie_dict[rating_col_dict['movie_id_list'][row]]
        for field in movie_fields:
            movie_columns[field].append(movie_info[field])

    return {
        **rating_col_dict,
        'movie_genres_list': movie_columns['genres'],
        'movie_release_year_list': movie_columns['release_year'],
        'movie_release_date_list': movie_columns['release_date'],
        'user_gender_list': user_columns['gender'],
        'user_age_list': user_columns['age'],
        'user_occupation_list': user_columns['occupation'],
        'user_zip_code_list': user_columns['zip_code'],
    }

### Anomalies

> 100k dataset

*   The movie with id 267 appears to be an empty movie. It has an "unknown" title, no release date, and no genre. However, it is referenced by 9 rating records.
*   The release years from the titles are often different from the release dates field. The release years from the titles are the years in which movies were initially released. The dates from the release date field are the dates when movies were released in US theatres. A movie may have a release year that differs from its release date if it
  *   had initial limited release
  *   was first released in a country other than US
  *   was released multiple times (old movies)

## Read data

In [25]:
# NOTE(review): LOCAL_MV_PATH points at the ml-1m files, whose rows are
# '::'-delimited (e.g. '1::F::1::10::48067'), while read_data_100k splits on
# '|' and tabs -- hence the ValueError recorded below. Point LOCAL_MV_PATH at
# files in the 100k layout before running the remaining cells.
movie_dict, rating_col_dict, user_dict = read_data_100k(LOCAL_MV_PATH)

ValueError: not enough values to unpack (expected at least 5, got 3)

# Statistical plots of ratings data:

## Distribution of Rating Scores

Below is the plot of the distribution of rating scores. Each bar is centered on a rating score i and shows the number of ratings with that score.

The plot shows that the distribution is left skewed. There are multiple potential reasons for this:

*   Users who would watch and rate a movie are likely interested in the movie already and are likely to give high scores (selection bias).
*   Users do not tend to give low scores (1-2) unless they are really disappointed.

Such skewness might make a model likely to predict high rating scores. However, when recommending a movie to a user who did not choose the movie him/herself, the probability distribution of the user's rating of the movie can be different from that of the users who voluntarily watched and rated the movie. Therefore the model probably will need adjusting for such bias.

## Plot of the Distribution of Rating Scores

In [None]:
# Bin edges at half-integers (0.5, 1.5, ..., 5.5) so that every bar is centered
# on an integer rating score.
ratings_bins = [score - 0.5 for score in range(1, 7)]

# Pass the ratings unshifted: integer scores land in the same bins as before,
# and the mean / standard-deviation guide lines drawn by gen_histogram sit at
# their true positions on the 'Rating Score' axis instead of 0.5 too low.
gen_histogram(
    rating_col_dict['rating_list'],
    title='Distribution of Rating Scores',
    x_label='Rating Score',
    y_label='Count',
    bins=ratings_bins,
)

## Number of Ratings over Time

> Below is the plot of number of ratings over time. The dates are formatted as `yy.mm.dd`. The plot shows a high spike just around 97.11.22. The cause of the spike is not immediately clear. However, such imbalance might cause a model to learn more of the data distribution around 97.11.22 and overlook the data distribution at other times.

### Plot of the Number of Ratings over Time

In [None]:
plt.figure(dpi=120)
plt.hist([dates.date2num(v) for v in rating_col_dict['timestamp_list']], bins=20)
# Format the ticks of the histogram's own axes. plt.gca() returns the current
# Axes, whereas plt.axes() would add a new, empty Axes on top of the plot.
plt.gca().xaxis.set_major_formatter(dates.DateFormatter('%y.%m.%d'))
plt.title('Number of Ratings over Time')
plt.xlabel('Date')
plt.ylabel('Number of Ratings')

## Distribution of Rating Frequency of Each User

Below is a plot of the rating frequency of each user. Note that the dataset only contains users who have rated at least 20 movies. The plot shows a heavy tailed distribution with most users rating only around 30 to 40 movies, but there are also outlier users who have rated more than 400 movies.

This is a natural distribution as most users would not spend too much time on rating movies, but there exist some movie fans who might be especially willing to rate many movies.

As shown in the cell, 0.16 of all users contributed 0.44 of all ratings. As a result, a model may be heavily influenced by the small group of users who contributed many ratings. The information from users who rated fewer movies might be covered up.

In [None]:
# Plot of the Distribution of Rating Frequency of Each User

user_count_list = gen_count_distribution_graph(
    rating_col_dict['user_id_list'],
    bins=20,
    title='Distribution of Number of Ratings of Each User',
    x_label='Number of Ratings',
    y_label='Number of Users',
)

# Share of "heavy" users (more than 200 ratings) and of the ratings they made.
heavy_user_count_list = [count for count in user_count_list if count > 200]
total_sum = sum(user_count_list)
heavy_user_count_sum = sum(heavy_user_count_list)
heavy_user_share = float(len(heavy_user_count_list)) / len(user_count_list)
heavy_user_rating_share = float(heavy_user_count_sum) / total_sum
print('%.2f of all users rated over 200 movies' % heavy_user_share)
print('These users contributed %.2f of the total ratings' % heavy_user_rating_share)

## Distribution of Rating Frequency of Each Movie

Below is the plot of the distribution of rating frequency of each movie. Similar to the previous plot, it also shows a heavy tailed distribution with most movies having fewer than 50 ratings. However, some movies have over 300 ratings.

This is a natural distribution, as most movies would not receive too many ratings except for a few blockbuster movies.

As shown in the cell, 0.12 of all movies had 0.48 of all ratings. If not handled properly, a model would be more likely to recommend a movie with many ratings rather than movies with fewer ratings. As a result, the recommendation might be dominated by popular movies and the less popular ones might be ignored.

### Plot of the Distribution of Rating Frequency of Each Movie

In [None]:
# Plot of the Distribution of Rating Frequency of Each Movie

movie_count_list = gen_count_distribution_graph(
    rating_col_dict['movie_id_list'],
    title='Distribution of Number of Ratings of Each Movie',
    x_label='Number of Ratings',
    y_label='Number of Movies',
)

# Share of "heavy" movies (more than 150 ratings) and of the ratings they got.
heavy_movie_count_list = [count for count in movie_count_list if count > 150]
total_sum = sum(movie_count_list)
heavy_movie_count_sum = sum(heavy_movie_count_list)
heavy_movie_share = float(len(heavy_movie_count_list)) / len(movie_count_list)
heavy_movie_rating_share = float(heavy_movie_count_sum) / total_sum
print('%.2f of all movies had over 150 ratings' % heavy_movie_share)
print('These movies had %.2f of the total ratings' % heavy_movie_rating_share)

# Statistical plots of movies data:

## Convert `movie_dict` to Dictionary of Columns

In [None]:
# Convert `movie_dict` to Dictionary of Columns
# Movies without a release date are dropped from the 'release_date_list'
# column only, so that column can be shorter than 'genres_list'.
movie_col_dict = dict_to_list(
    movie_dict,
    keys=['genres', 'release_date'],
    none_check=['release_date'],
)

## Distribution of Movie Genre

> Below is a plot of the genre distribution of all movies. As shown, drama and comedy genres contain the most movies. The number of movies each of these two genres contain is over twice the number of movies any other genre contains. When using the genre information, a model is likely to give higher score to movies of more popular genres.

### Plot of the Genre Distribution

In [None]:
# Plot of the Genre Distribution
# Flatten the per-movie genre lists so that every (movie, genre) pair counts once.
all_genre_list = [
    genre
    for genre_list in movie_col_dict['genres_list']
    for genre in genre_list
]

gen_bar_graph(
    all_genre_list,
    figsize=(22, 6),
    title='Genre Distribution',
    x_label='Genres',
    y_label='Count',
)

## Number of Movies by Release Date

Below is a plot of the distribution of movie release date. As shown in the plot, most movies in the dataset are released near the time this dataset was generated. This might be due to the fact that people are more likely to watch and rate up-to-date movies than old movies. New movies likely have better quality because of advancement in film making techniques and are more accessible than old movies.

A model using the release date information is likely to recommend mostly new movies. However, if a user shows special interest in older movies, it might be good for the model to recommend old movies over new movies.

### Plot of the Release Date Distribution

In [None]:
# Plot of the Release Date Distribution

plt.figure(dpi=120)
plt.hist(
    [dates.date2num(v) for v in movie_col_dict['release_date_list']],
    bins=20,
)
# Format the ticks of the histogram's own axes. plt.gca() returns the current
# Axes, whereas plt.axes() would add a new, empty Axes on top of the plot.
plt.gca().xaxis.set_major_formatter(dates.DateFormatter('%y.%m.%d'))
plt.title('Number of Movies by Release Date')
plt.xlabel('Release Date')
plt.ylabel('Number of Movies')

# Statistical plots of users data:

## Convert `user_dict` to Dictionary of Columns

In [None]:
# Convert `user_dict` to Dictionary of Columns
# No None filtering here, so the four columns stay row-aligned.
user_col_dict = dict_to_list(
    user_dict,
    keys=['gender', 'age', 'occupation', 'zip_code'],
)

### Plot of Gender Distribution

> Below is a plot of the user gender distribution. The dataset contains many more male users than female users. As a result, the model might perform worse on the female population than on the male population. Such imbalance can cause performance and gender bias issues.

In [None]:
# Number of users per gender value.
gen_bar_graph(
    item_list=user_col_dict['gender_list'],
    title='Gender Distribution',
    x_label='Gender',
    y_label='Count',
)

### Plot of Age Distribution

Below is a plot of user age distribution. Most users are in the 20-30 age range. Age ranges of below 15 and above 52 have the least number of users. People who are too young are unlikely to have access to technology, and people who are old might have difficulty using technology.

Due to such imbalance, a model is likely to have varying performances on different age groups. It is likely to perform better on users of age between 20-30 and worse on users of age below 15 or above 52. Furthermore, the movie taste of different age groups can be different. It might be bad for a model to recommend a movie targeting users between 20 and 30 to a user who is over 60. More importantly, a model should not recommend movies for adults with explicit content to younger users.

In [None]:
# Histogram of user ages with mean / standard-deviation guide lines.
gen_histogram(
    value_list=user_col_dict['age_list'],
    title='Age Distribution',
    x_label='Age',
    y_label='Number of Users',
)

### Plot of Occupation Distribution

Below is a plot of user occupation distribution. The occupation with the most users is student, which has almost twice the number of users of any other occupation. Such a distribution also explains the large number of users between 20 and 30.

When using occupation data, imbalance again poses a threat to the model. The model is likely to perform better on users of frequently occurring occupation.

In [None]:
# Number of users per occupation; a wide figure and narrower bars keep the
# occupation labels apart.
gen_bar_graph(
    item_list=user_col_dict['occupation_list'],
    figsize=(26, 6),
    width=0.6,
    title='User Occupation Distribution',
    x_label='Occupation',
    y_label='Number of Users',
)

### Plot of Zip Code Frequency Distribution

> Below is a plot of user zip code distribution. It resembles the other frequency distribution plots but has a lighter tail. Most zip codes appear only once in the dataset, but there are also zip codes appearing more frequently (over 4 times). However, these zip codes are rare and they do not appear significantly more frequently in the user data. Hence, the outlier zip codes would not have as much influence as the outlier users or movies in previous plots.

In [None]:
# Distribution of how many users share each zip code.
count_list = gen_count_distribution_graph(
    item_list=user_col_dict['zip_code_list'],
    title='Zip Code Count Distribution',
    x_label='Zip Code Count',
    y_label='Frequency',
)

# Statistical plots of joined data

### Denormalize Data

In [None]:
# One row per rating, enriched with the rated movie's and the rating user's attributes.
joined_data = get_denormalized_col_dict(
    rating_col_dict=rating_col_dict,
    movie_dict=movie_dict,
    user_dict=user_dict,
)

## Number of ratings for each genre

Below is a plot of the number of ratings for each genre. The distribution of ratings over genres is a bit more even than the distribution of movies over genres. The rankings of the genres are similar, but there are still differences. For instance, there are more movies under the children genre than under the Sci-Fi genre, but the movies under the Sci-Fi genre have more ratings than those under the children genre. The reason might be that the audience of children's movies are less likely to make a rating than the audience of Sci-Fi movies.

In [None]:
# Flatten the genre lists of all rating rows: a rating of a movie with
# several genres counts once for each of them.
all_genre_list = [
    genre
    for genre_list in joined_data['movie_genres_list']
    for genre in genre_list
]

gen_bar_graph(
    all_genre_list,
    figsize=(22, 6),
    title='Number of Ratings of Each Movie Genre',
    x_label='Genres',
    y_label='Number of Ratings',
)

## Rating Score Distribution of Different Genres

Below is a plot of the rating score distribution of each genre. As shown in the plot, the rating score distributions are very similar across genres. All mean rating scores (marked by green triangles) are between 3 and 4. However, film-noir, drama, war, and documentary are four genres with higher rating scores. Hence it is possible that a ranking model might give movies of these genres high scores even when a user might not actually like movies of these genres.

In [None]:
# Expand every rating into one (genre, rating) pair per genre of the rated movie.
expanded_genre_list = []
expanded_rating_list = []
for genre_seq, rating in zip(joined_data['movie_genres_list'], joined_data['rating_list']):
    expanded_genre_list.extend(genre_seq)
    expanded_rating_list.extend([rating] * len(genre_seq))

categorical_boxplot(
    num_col=expanded_rating_list,
    cat_col=expanded_genre_list,
    num_name='rating',
    cat_name='genre',
    title='Distribution of Rating Score over Different Genres',
    figsize=(24, 5),
)

## Rating Score over Hour of a Day

Below is a plot of the rating score distribution over different hours of a day. 0 represents the first hour of the day whereas 23 represents the last hour of the day. The score distributions are mostly the same, except for between 9 am and 4 pm. The score distribution over the hours in this time interval fluctuates. Below the plot of distributions of rating scores, we also plotted a distribution of the number of ratings. It is clear from the plot that the fluctuations are likely to be caused by the small number of ratings during those hours rather than a real pattern in user behavior.

In [None]:
# Hour of day (0-23) of every rating: seconds since the epoch modulo one day
# (86400 s), divided into hours (3600 s).
# NOTE(review): the epoch value is taken modulo a day, so the hour is
# presumably in UTC rather than local time -- confirm this is intended.
hour_in_day_list = [
    (int(timestamp.timestamp()) % 86400) // 3600
    for timestamp in joined_data['timestamp_list']
]

# Bucket the rating scores by hour: hour_in_day_ratings[h] holds all scores
# given during hour h.
hour_in_day_ratings = [[] for _ in range(24)]

for hour_in_day, rating in zip(hour_in_day_list, joined_data['rating_list']):
     hour_in_day_ratings[hour_in_day].append(rating)

# First figure: one box per hour showing the score distribution.
plt.figure(figsize=(20, 5))
sns.boxplot(data=hour_in_day_ratings, showmeans=True)
plt.title('Distribution of Rating Scores over Hours in a Day')
plt.xlabel('Hour in a Day')
plt.ylabel('Rating Score')

# Second figure: number of ratings per hour. Each count is wrapped in a
# one-element list so that seaborn receives one series per hour.
hour_in_day_rating_count = [[len(hour_ratings)] for hour_ratings in hour_in_day_ratings]

plt.figure(figsize=(20, 5))
sns.barplot(data=hour_in_day_rating_count)
plt.title('Number of Ratings over Hours in a Day')
plt.xlabel('Hour in a Day')
plt.ylabel('Number of Ratings')

## Occupation and Timestamp Interaction

We explore the interaction between occupation and timestamp features below. Two features interact when the effect of one feature on the user's choices is dependent on another feature. In the case of occupation and timestamp features, we investigate whether the preferences of users of a specific occupation change over time.

To investigate such effect, we first group the ratings by user occupations. Then, for a specified occupation, we plot the user preferences over months with multiple plots. Since there are too many movies and the ratings for each movie might fluctuate a lot, we will show user preferences by their preferences of movie genres. For each month, we plot the rating score distribution over different genres.

Since most users are students, we first plot the preferences of students over time.

In [None]:
specified_occupation = 'student'

# Calendar window covered by the per-month plots: 8 consecutive months
# starting in September 1997 (month_index 0).
first_year, first_month, num_months = 1997, 9, 8

# Collect rating, timestamp and movie genres of every rating made by a user
# with the specified occupation.
specified_occupation_examples = {
    'rating_list': [],
    'timestamp_list': [],
    'genre_list': [],
}
for i, occupation in enumerate(joined_data['user_occupation_list']):
    if occupation == specified_occupation:
        specified_occupation_examples['rating_list'].append(joined_data['rating_list'][i])
        specified_occupation_examples['timestamp_list'].append(joined_data['timestamp_list'][i])
        specified_occupation_examples['genre_list'].append(joined_data['movie_genres_list'][i])

# Group the (genre, rating) pairs by calendar month within the window.
rating_by_month = [{'rating_list': [], 'genre_list': []} for _ in range(num_months)]
for i, timestamp in enumerate(specified_occupation_examples['timestamp_list']):
    month_index = (timestamp.year - first_year) * 12 + (timestamp.month - first_month)
    # A negative index would silently wrap around to the last months and
    # corrupt the plots, so reject anything outside the window explicitly.
    if not 0 <= month_index < num_months:
        raise ValueError(
            f'Rating timestamp {timestamp} falls outside the {num_months}-month '
            f'window starting {first_year}-{first_month:02d}'
        )
    for genre in specified_occupation_examples['genre_list'][i]:
        rating_by_month[month_index]['rating_list'].append(specified_occupation_examples['rating_list'][i])
        rating_by_month[month_index]['genre_list'].append(genre)

# One boxplot per month, with a fixed genre order so the plots are comparable.
for month_index in range(num_months):
    categorical_boxplot(
        rating_by_month[month_index]['rating_list'],
        rating_by_month[month_index]['genre_list'],
        'rating',
        'genre',
        title='Rating Score Distribution over Genres during Month {:d} for Occupation {:s}'.format(month_index + 1, specified_occupation),
        figsize=(20, 3),
        order=[
            'Unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
            'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
        ],
    )

There are some fluctuations in the rating score distribution. One of the genres with large fluctuations is documentary. In the first two months it has an average rating of around 4, and in the 7th month the average rating dropped to almost 3. However, documentary is also one of the least watched genres. Its fluctuations might simply be caused by a small sample size. On the other hand, genres such as drama and comedy have relatively stable average ratings.

Since the action of rating a movie shows preference as well, we can also use the number of ratings over genres as a sign of movie preferences. We still investigate the student users. Below are the plots.

In [None]:
# One bar chart per month: how many ratings each genre received from users
# with the specified occupation.
for month_index in range(8):
    month_title = 'Number of Rated Movies over Genres during Month {:d} for Occupation {:s}'.format(
        month_index + 1, specified_occupation
    )
    gen_bar_graph(
        rating_by_month[month_index]['genre_list'],
        title=month_title,
        x_label='Genres',
        y_label='Count',
        figsize=(18, 2),
    )

# inspect TF-Records

In [None]:
import os
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import collections
import functools
import io
import os
import requests

In [None]:
import sys
sys.path.append("..")

# this repo
from src.data import data_utils, data_config

In [None]:
# Full GCS location of the generated examples: bucket URI + the path
# configured in data_config.
EXAMPLE_GEN_GCS_PATH = data_config.EXAMPLE_GEN_GCS_PATH
GCS_DATA_PATH = f"{BUCKET_URI}/{EXAMPLE_GEN_GCS_PATH}"

print(f"GCS_DATA_PATH: {GCS_DATA_PATH}")

# List what is stored at that location.
!gsutil ls $GCS_DATA_PATH

## get decoded features

In [None]:
# Parsing spec for the serialized examples: one fixed-length feature per field.
# NOTE(review): `tf` is not imported in any visible cell of this notebook;
# add `import tensorflow as tf` before this cell or it fails on a fresh kernel.
# NOTE(review): the visible parsing code below maps
# `data_utils._parse_function` over the dataset and never reads
# `feature_description` -- confirm whether this dict is still needed.
feature_description = {
    # target/label item features
    'target_movie_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'target_movie_rating': tf.io.FixedLenFeature(shape=(), dtype=tf.float32),
    'target_rating_timestamp': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    # NOTE(review): `(data_config.MAX_GENRE_LENGTH)` is a parenthesized value,
    # not a one-element tuple -- presumably a 1-D shape is intended; confirm.
    'target_movie_genres': tf.io.FixedLenFeature(shape=(data_config.MAX_GENRE_LENGTH), dtype=tf.string),
    'target_movie_year': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    'target_movie_title': tf.io.FixedLenFeature(shape=(), dtype=tf.string),

    # user - global context features
    'user_id': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_gender': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_age': tf.io.FixedLenFeature(shape=(), dtype=tf.int64),
    'user_occupation_text': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
    'user_zip_code': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
}

# Dataset options with the auto-shard policy set to AUTO.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.AUTO

In [None]:
SPLIT = "train" # "train" | "val"

# Collect the gs:// URIs of the TFRecord files of the chosen split. The URI is
# built from the bucket and object names directly: `blob.public_url` is an
# HTTP URL with a percent-encoded object name, so rewriting its prefix yields
# a wrong URI for any object name containing characters that get encoded.
train_files = []
for blob in storage_client.list_blobs(BUCKET_NAME, prefix=f'{EXAMPLE_GEN_GCS_PATH}/{SPLIT}'):
    if '.tfrecord' in blob.name:
        train_files.append(f"gs://{BUCKET_NAME}/{blob.name}")

# Only the first file is inspected, to keep this exploration fast.
train_files = train_files[:1]
# train_files

train_dataset = tf.data.TFRecordDataset(train_files)
train_dataset = train_dataset.map(data_utils._parse_function)

# Show one parsed example, batched with batch size 1.
for x in train_dataset.batch(1).take(1):
    pprint(x)

In [None]:
# Show the first parsed example converted to NumPy values.
numpy_examples = train_dataset.as_numpy_iterator()
for element in numpy_examples:
    pprint(element)
    break

In [None]:
# Feature names selected as DataFrame columns (alphabetical order).
feature_columns = [
    'target_movie_genres',
    'target_movie_id',
    'target_movie_rating',
    'target_movie_title',
    'target_movie_year',
    'target_rating_timestamp',
    'user_age',
    'user_gender',
    'user_id',
    'user_occupation_text',
    'user_zip_code',
]

# Materialize every parsed example of the dataset into one DataFrame row.
df = pd.DataFrame(train_dataset.as_numpy_iterator(), columns=feature_columns)
print('Shape', df.shape)
df.head()

In [None]:
# First genre of the first example, decoded from bytes to str.
first_example_genres = df.loc[0, 'target_movie_genres'].tolist()
first_example_genres[0].decode()

In [None]:
# Number of distinct users and distinct rated movies in the loaded file.
print('Num users:', len(set(df['user_id'])))
print('Num movies:', len(set(df['target_movie_id'])))

### movie ratings

In [None]:
ratings = df.target_movie_rating.tolist()

# One unit-wide bin per score with edges at 0.5, 1.5, ..., 5.5, so every bar is
# centered on its integer tick. With `bins=5` the range 1..5 is cut into
# 0.8-wide bars that do not line up with the ticks.
rating_bin_edges = [score - 0.5 for score in range(1, 7)]
plt.hist(ratings, bins=rating_bin_edges)
plt.xticks([1, 2, 3, 4, 5])
plt.ylabel('Count')
plt.xlabel('Rating')
plt.show()

print('Average rating:', np.mean(ratings))
print('Median rating:', np.median(ratings))

### movie genres

In [None]:
# Per-example genre arrays; show the first one as a sample.
movie_genres_list = df['target_movie_genres'].tolist()
movie_genres_list[0]

In [None]:
# Count the number of times each genre describes a movie.
# "UNK" entries are skipped and not counted as a genre.
genre_count = collections.defaultdict(int)
for genres in movie_genres_list:
    for genre in genres.tolist():
        genre = genre.decode()
        if genre != "UNK":
            genre_count[genre] += 1

# Parallel tuples of genre names and their counts, used by the pie chart below.
genre_name_list, genre_count_list = zip(*genre_count.items())

genre_count

In [None]:
# Pie chart of the genre counts, each slice labeled with its percentage.
plt.figure(figsize=(11, 11))
plt.pie(
    genre_count_list,
    labels=genre_name_list,
    autopct='%1.1f%%',
)
plt.title('MovieLens Movie Genres')
plt.show()

## User genre ratings

In [None]:
def print_top_genres_for_user(df, user_id):
    """
    Prints top movie genres for user with ID user_id.

    Args:
      df: DataFrame with columns `user_id` (bytes) and `target_movie_genres`
        (array of bytes genre names per row).
      user_id: bytes user ID to report on.

    Prints the number of records of the user, then up to five most frequent
    genres with their count and the percentage of the user's records that
    carry that genre. "UNK" entries are ignored.
    """
    user_ratings_df = df[df.user_id == user_id]
    num_user_records = user_ratings_df.shape[0]
    print(f"\nUser: {user_id.decode()} has {num_user_records} records")

    # Count genres straight from the user's own rows instead of scanning the
    # full frame once per movie; this assumes a movie carries the same genres
    # on every row it appears in.
    genre_count = collections.Counter()
    for genres in user_ratings_df.target_movie_genres:
        for genre in genres.tolist():
            genre = genre.decode()
            if genre != "UNK":
                genre_count[genre] += 1

    # print(f'Genre: freq - % total')
    for (genre, freq) in genre_count.most_common(5):
        # Scale the fraction to a real percentage to match the '%' label.
        share = 100 * freq / num_user_records
        print(f'{genre}: {freq} - {share:.1f} %')

In [None]:
# Sample of user IDs (bytes, as stored in `df.user_id`) to report on.
user_list = [
    b'2173', b'5155', b'4464',
    b'1180', b'258', b'5637',
    b'3100', b'1201', b'1976',
]

for user in user_list:
    print_top_genres_for_user(df, user_id=user)

In [None]:
# NOTE(review): `test_user_ratings_df` is not defined in any visible cell, so
# this raises NameError on a fresh kernel -- presumably a leftover from an
# earlier interactive session; define it first or remove this cell.
test_user_ratings_df.shape[0]