In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams['font.family'] = "Hiragino Sans"
# plt.rcParams['font.weight'] = ""

In [None]:
# Load the movie table; later cells read its `genre` and `year` columns.
movies_path = "../data/ml-1m/movies.csv"
movies = pd.read_csv(movies_path)
movies

In [None]:
# Shared 16x8 figure; each later cell attaches one subplot of a 2x3 grid to it.
fig = plt.figure(figsize=(16, 8))

In [None]:
# Tally how often each genre tag occurs; a movie's `genre` field holds one or
# more tags joined by "|".
genre_count = {}
for tags in movies.genre:
    for genre in tags.split("|"):
        genre_count[genre] = genre_count.get(genre, 0) + 1

# Order genres from most to least frequent (the sort is stable, so ties keep
# their first-seen order), then split into parallel label/height lists.
genre_count = sorted(genre_count.items(), key=lambda pair: pair[1], reverse=True)
keys = [name for name, _ in genre_count]
values = [total for _, total in genre_count]

# Panel 1 of the 2x3 grid: genre distribution.
ax = fig.add_subplot(2, 3, 1)
ax.set_title("映画のジャンルの分布")
ax.bar(keys, values, color="#274A78")
ax.tick_params(axis="x", labelrotation=90)  # vertical labels for the genre names
fig

In [None]:
# Number of movies per distinct value of the `year` column.
year_count = movies["year"].value_counts()

# Panel 2 of the 2x3 grid: release-year distribution.
# NOTE(review): width=6.0 is in x-axis data units -- presumably `year` is
# already binned coarsely enough that bars do not overlap; confirm.
ax = fig.add_subplot(2, 3, 2)
ax.set_title("映画の公開年の分布")
ax.bar(year_count.index, year_count, width=6.0, color="#274A78")
fig

In [None]:
ratings_path = "../data/ml-1m/ratings.dat"
rating_columns = ["user_id", "movie_id", "rate", "timestamp"]

# The file is read without a header row and with the multi-character "::"
# separator, which requires pandas' Python parsing engine.
ratings = pd.read_csv(ratings_path, sep="::", engine="python", header=None)
ratings.columns = rating_columns
ratings

In [None]:
# Number of ratings per distinct `rate` value.
rate_count = ratings["rate"].value_counts()

# Panel 3 of the 2x3 grid: rating distribution.
ax = fig.add_subplot(2, 3, 3)
ax.set_title("映画の評価の分布")
ax.bar(rate_count.index, rate_count, color="#274A78")
fig

In [None]:
# Load the user table; later cells read its `age`, `gender` and `occupation`
# columns.
users_path = "../data/ml-1m/users.csv"
users = pd.read_csv(users_path)
users

In [None]:
# Number of users per age-bracket label.
age_count = users.age.value_counts()

# Fixed display order for the brackets (value_counts() orders by frequency).
# Every label must be present in `age_count`; a missing one raises KeyError.
keys = ["Under 18", "18-24", "25-34", "35-44", "45-49", "50-55", "56+"]
heights = [age_count[label] for label in keys]

# Panel 4 of the 2x3 grid: age-bracket distribution.
ax = fig.add_subplot(2, 3, 4)
ax.set_title("ユーザの年齢層の分布")
ax.bar(keys, heights, color="#274A78")
fig.tight_layout()

In [None]:
# Number of users per gender value. value_counts() orders rows by frequency,
# so the bar labels are derived from the Series index instead of being assumed
# positionally (hard-coding ["Male", "Female"] mislabels the bars whenever
# females outnumber males, and fails unless there are exactly two values).
gender_count = users.gender.value_counts()

# NOTE(review): assumes the column uses the "M"/"F" codes -- confirm. Any other
# value (including ones already spelled out) is shown verbatim.
gender_labels = {"M": "Male", "F": "Female"}
labels = [gender_labels.get(code, str(code)) for code in gender_count.index]

# Panel 5 of the 2x3 grid: gender distribution.
ax = fig.add_subplot(2, 3, 5)
ax.set_title("ユーザの性別の分布")
ax.bar(labels, gender_count, color="#274A78")
fig

In [None]:
# Number of users per occupation value.
occupation_count = users["occupation"].value_counts()

# Panel 6 of the 2x3 grid: occupation distribution, with vertical x labels.
ax = fig.add_subplot(2, 3, 6)
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("ユーザの職種の分布")
ax.bar(occupation_count.index, occupation_count, color="#274A78")

In [None]:
# Re-flow the six panels so titles and rotated tick labels do not collide.
fig.tight_layout()

# savefig() does not create missing directories, so make sure the target
# directory exists before writing the PDF.
# NOTE(review): inputs are read from ../data/ but this writes under ./data/
# (relative to the working directory) -- confirm the location is intended.
output_path = Path("data/ml-statistics.pdf")
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, format="pdf")

In [None]:
# Display the finished six-panel figure as the cell output.
fig