In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt

In [2]:
animes = pd.read_csv("anime.csv")
tags = []
for genres in animes["genre"].fillna(""):
    for genre in str.split(genres, ", "):
        tags.append(genre)
tags = sorted(list(set(tags))[1:]) # Remove ''
v = DictVectorizer(sparse="False")
genre = v.fit_transform(animes["genre"].fillna(", ".join(tags)).apply(lambda x: {i: 1 for i in str.split(x, ", ")}))
genre = np.array(genre.todense())
animes_rating = animes["rating"].as_matrix()
animes_id = animes["anime_id"].as_matrix()
inverse_anime_id = {}
for index, anime in np.ndenumerate(animes_id):
    inverse_anime_id[anime] = index[0]

In [73]:
rating = pd.read_csv("rating.csv", dtype=np.int)
user_num = 73516
anime_num = 34519
avg_rating = np.zeros(user_num+1)
watch_times = np.zeros(user_num+1)
rating_times = np.zeros(user_num+1)
user_rating = {}
for i in range(0, user_num + 1):
    user_rating[i] = []
overall_rating = 0
total_rating_times = 0
for user_id, r in zip(rating["user_id"], rating["rating"]):
    watch_times[user_id] += 1
    if r != -1:
        avg_rating[user_id] += r
        rating_times[user_id] += 1
        overall_rating += r
        total_rating_times += 1
        user_rating[user_id].append(r)

np.seterr(divide='ignore', invalid='ignore')
avg_rating = np.nan_to_num(avg_rating / rating_times)
avg_overall_rating = overall_rating / total_rating_times

std_rating = np.zeros(user_num + 1)
for i in range(0, user_num + 1):
    if not user_rating[i]:
        continue
    std_rating[i] = np.std(user_rating[i])

In [4]:
ratings = np.zeros((user_num+1, 43))
user_tags = np.zeros((user_num+1, 43))
for user_id, anime_id, r in zip(rating["user_id"], rating["anime_id"], rating["rating"]):
    if anime_id not in inverse_anime_id.keys():
    # if anime_id not in animes_id:
        continue
    user_tags[user_id] += genre[inverse_anime_id[anime_id]]
    if rating_times[user_id] == 0:
        ratings[user_id] += genre[inverse_anime_id[anime_id]] * 5
    
    elif r == -1:
        ratings[user_id] += genre[inverse_anime_id[anime_id]] * (10 - avg_rating[user_id])
    else:
        ratings[user_id] += genre[inverse_anime_id[anime_id]] * r


In [5]:
user_instances = np.nan_to_num(ratings / user_tags)

In [6]:
with open("anime_tag_vector.csv", "w") as file:
    file.write("anime_id" + "," + ",".join(tags) + ",rating\n")
    for a, g, r in zip(animes_id, genre, animes_rating):
        file.write("{},{},{}\n".format(a, ",".join([str(i) for i in g]), r))

In [7]:
with open("user_instances.csv", "w") as file:
    file.write("user_id" + "," + ",".join(tags) + "\n")
    for index, ui in enumerate(user_instances):
        file.write("{},{}\n".format(index, ",".join([str(i) for i in ui])))

# Evaluation Input

In [8]:
with open("evaluation_input.csv", "w") as file:
    file.write("user_id, anime_id, rating_id\n")
    for user_id, anime_id, r in zip(rating["user_id"], rating["anime_id"], rating["rating"]):
        file.write("{},{},".format(user_id, anime_id))
        if r == -1:
            file.write("{}\n".format(10 - avg_rating[user_id]))
        else:
            file.write("{}\n".format(r))

# Data Mining

In [9]:
print(sum(watch_times) / user_num)
print(sum(sorted(watch_times)[:len(watch_times)//2]) / (user_num / 2))
np.count_nonzero(rating_times)

106.286209805
21.5307144023


69600

watch_times[user_id] 每個 user 看了幾部動畫

rating_times[user_id] 每個 user 評了幾部動畫

avg_rating[user_id] 每個 user 平均評了幾分

user_num user 總數

rating  user raw_data

animes 動畫 raw_data

In [84]:
import plotly.plotly as py
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

In [11]:
print(user_num)

rated_percentage = {}
for i in range (0, 101):
    rated_percentage[i] = [i, 0]

for i in range (1, user_num + 1):
    if watch_times[i] == 0:
        continue
    r = int(100 * rating_times[i] / watch_times[i])
    rated_percentage[r][1] += 1
    
df = pd.DataFrame(rated_percentage)
print(df)
data = [Bar(x=df.values[0], y=df.values[1])]
iplot(data, filename='jupyter/basic_bar')

73516
    0    1    2    3    4    5    6    7    8    9    ...    91   92   93   \
0     0    1    2    3    4    5    6    7    8    9  ...     91   92   93   
1  4273  405  292  280  238  236  179  181  160  141  ...    642  918  921   

    94    95    96    97    98    99     100  
0    94    95    96    97    98    99    100  
1  1060  1423  1793  2309  3403  3751  36387  

[2 rows x 101 columns]


In [12]:
print(animes_rating)

rating_dist = {}
for i in range (0, 101):
    rating_dist[i] = [i / 10, 0]

for r in animes_rating:
    if np.isnan(r):
        continue
    rating_dist[int(10 * r)][1] += 1

df = pd.DataFrame(rating_dist)

print(df)
data = [Bar(x=df.values[0], y=df.values[1])]
iplot(data, filename='jupyter/basic_bar')

[ 9.37  9.26  9.25 ...,  4.88  4.98  5.46]
   0    1    2    3    4    5    6    7    8    9    ...   91   92   93   94   \
0  0.0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  ...   9.1  9.2  9.3  9.4   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   7.0  3.0  2.0  0.0   

   95   96   97   98   99    100  
0  9.5  9.6  9.7  9.8  9.9  10.0  
1  1.0  1.0  0.0  0.0  0.0   1.0  

[2 rows x 101 columns]


In [113]:
rating_dist = {}
for i in range (-1, 11):
    rating_dist[i] = [i, 0]

print(len(rating["rating"]))
for r in rating["rating"]:
    if np.isnan(r):
        continue
    rating_dist[r][1] += 1

df = pd.DataFrame(rating_dist)

data = [Bar(x=df.values[0], y=df.values[1])]
iplot(data, filename='jupyter/basic_bar')

7813737


In [111]:
watch_times_dist = np.zeros(anime_num + 1)
for t in watch_times:
    watch_times_dist[int(t)] += 1

watch_times_acc = np.cumsum(watch_times_dist)
print(watch_times_acc)

data_acc = Scatter(x = [i for i in range(0, 1200)], y = 100 * watch_times_acc / user_num, fill = "tozeroy")
iplot([data_acc], filename='basic-line')

[  2.00000000e+00   2.39200000e+03   3.99600000e+03 ...,   7.35170000e+04
   7.35170000e+04   7.35170000e+04]
