In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the raw .dat files read below (movies.dat, tags.dat,
# ratings.dat). The path is relative, so it is resolved against the current
# working directory. Presumably the MovieLens 10M100K dump -- TODO confirm.
dir_data = '../data/ml-10M100K'

In [None]:
# movies: one row per line of movies.dat
m_cols = ['movie_id', 'title', 'genres']
path_movies = os.path.join(dir_data, 'movies.dat')
df_movies = (
    pd.read_csv(
        path_movies,
        names=m_cols,        # the file has no header row
        sep='::',            # multi-character separator -> needs the python engine
        engine='python',
        encoding='latin-1',
    )
    # turn the pipe-delimited genre string into a list of genre names
    .assign(genres=lambda d: d['genres'].str.split('|'))
)
df_movies.head()

In [None]:
# user-defined tags: one row per line of tags.dat
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
path_tags = os.path.join(dir_data, 'tags.dat')
df_tags = (
    pd.read_csv(
        path_tags,
        names=t_cols,        # the file has no header row
        sep='::',
        engine='python',
    )
    .assign(
        # lower-case the tag text so differently-cased spellings collapse
        tag=lambda d: d['tag'].str.lower(),
        # raw values are interpreted as seconds and converted to datetimes
        timestamp=lambda d: pd.to_datetime(d['timestamp'], unit='s'),
    )
)
df_tags.head()

In [None]:
# merge movies and tags
# Collect every tag record of a movie into a single list per movie_id.
df_movie_tags = df_tags.groupby('movie_id')['tag'].apply(list).reset_index()
# Drop a 'tag' column left over from an earlier run of this cell first, so that
# re-running the cell does not yield 'tag_x' / 'tag_y' columns. The left join
# keeps movies that have no tags; their 'tag' value is NaN.
df_movies = (
    df_movies
    .drop(columns='tag', errors='ignore')
    .merge(df_movie_tags, on='movie_id', how='left')
)
df_movies.head()

In [None]:
# ratings: one row per line of ratings.dat
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
path_ratings = os.path.join(dir_data, 'ratings.dat')
df_ratings = (
    pd.read_csv(
        path_ratings,
        names=r_cols,        # the file has no header row
        sep='::',
        engine='python',
    )
    # raw values are interpreted as seconds and converted to datetimes
    .assign(timestamp=lambda d: pd.to_datetime(d['timestamp'], unit='s'))
)
df_ratings.head()

## 色々な情報一覧

In [None]:
print('映画数:', df_movies.shape[0])
print('uniqueなtitleの映画数:', df_movies['title'].nunique())
# Author's note: one title appears more than once in the data.
# Count the rows per title and keep the titles seen on more than one row.
# Filtering the Series directly avoids depending on the column name that
# reset_index() produces (it differs between pandas versions) and on the
# label-based lookup [0].
title_counts = df_movies['title'].value_counts()
titles_multi = title_counts[title_counts > 1].index.tolist()
# value_counts() sorts by frequency, so the first entry is the most repeated
# title; fall back to None when no title is duplicated instead of raising.
title_multi = titles_multi[0] if titles_multi else None
print('複数含まれている映画:', title_multi)

In [None]:
# genres
# Explode only the 'genres' column, once, so that each element is a single
# genre of a single movie; every statistic below reuses this Series instead of
# re-exploding the whole frame.
s_genres = df_movies['genres'].explode()
genre_counts = s_genres.value_counts()

print('ジャンル数:', s_genres.nunique())
print('ジャンル一覧:', s_genres.unique())

# Bar chart of the number of movies per genre, most frequent genre first.
plt.bar(genre_counts.index, genre_counts.values)
plt.xticks(rotation=90)
plt.xlabel('genre')
plt.ylabel('number of movies')
plt.show()

In [None]:
# tags
tag_times = df_tags['timestamp']
# number of movies whose collected tag list is non-empty
n_tagged_movies = df_movie_tags['tag'].map(len).gt(0).sum()
# number of non-null tag records per calendar quarter
tags_per_quarter = df_tags.groupby(tag_times.dt.to_period('Q'))['tag'].count()

print('タグ数:', df_tags['tag'].nunique())
print('タグレコード数:', df_tags.shape[0])
print('タグが付いている映画数:', n_tagged_movies)
print('タグ付与最初日:', tag_times.min())
print('タグ付与最終日:', tag_times.max())
tags_per_quarter.plot.bar()
plt.show()

In [None]:
# ratings
rating_times = df_ratings['timestamp']
# number of non-null ratings per calendar quarter
ratings_per_quarter = (
    df_ratings.groupby(rating_times.dt.to_period('Q'))['rating'].count()
)

print('ユーザ数:', df_ratings['user_id'].nunique())
print('レーティング付与映画数:', df_ratings['movie_id'].nunique())
print('レーティング付与最初日:', rating_times.min())
print('レーティング付与最終日:', rating_times.max())
ratings_per_quarter.plot.bar()
plt.show()

In [None]:
# Movies with the largest number of ratings (top 20).
(
    df_ratings['movie_id']
    .value_counts()
    # Name the id and count columns explicitly. The implicit names produced by
    # value_counts().reset_index() differ between pandas versions, and with the
    # older naming the merge below would join on the counts instead of the ids.
    .rename_axis('movie_id')
    .reset_index(name='count')
    # the default inner join keeps the left (count-descending) row order
    .merge(df_movies[['movie_id', 'title']], on='movie_id')
    [['title', 'count']]
    .head(20)
)

In [None]:
# Distribution of rating values: number of ratings per value, ordered by value.
rating_counts = df_ratings['rating'].value_counts().sort_index()
rating_counts.plot.bar()
plt.show()

In [None]:
# The data set is large, so restrict the analysis to 1000 users: take the first
# 1000 distinct user ids in order of appearance and keep them sorted.
first_user_ids = df_ratings['user_id'].unique()[:1000]
valid_user_ids = sorted(first_user_ids)

# Number of ratings per selected user, then summary statistics over users.
is_valid_user = df_ratings['user_id'].isin(valid_user_ids)
ratings_per_user = df_ratings[is_valid_user].groupby('user_id').agg({'movie_id': len})
ratings_per_user.agg(['min', 'max', 'mean', 'median', 'std', 'count'])

In [None]:
# Number of ratings each movie received from the users in valid_user_ids,
# then summary statistics over movies.
ratings_of_valid_users = df_ratings.loc[df_ratings['user_id'].isin(valid_user_ids)]
ratings_per_movie = ratings_of_valid_users.groupby('movie_id').agg({'user_id': len})
ratings_per_movie.agg(['min', 'max', 'mean', 'median', 'std', 'count'])