<a href="https://colab.research.google.com/github/tak34/atmacup-15/blob/main/FE3_user_attribution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 概要

train と test を結合し、各ユーザーがレーティングしている作品を「ユーザー × 作品」の表(該当作品なら1、それ以外は0)に集約する。その表に TruncatedSVD を適用し、ユーザーの属性を示す特徴量(10次元)を抽出する。

In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
from pathlib import Path
import warnings
import os
import random
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
from collections import deque
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer

warnings.simplefilter('ignore')

In [None]:
# Configuration
# SAVE_FILE_NAME: base name (without extension) of the saved feature file.
SAVE_FILE_NAME: str = "FE3_user_attribution_20230717"
# SAVE_DIR: directory the feature file is written into.
SAVE_DIR: Path = Path("/content/drive/MyDrive/Kaggle/atmacup#15/proc/fe")
# SAVE: when True, the final feature table is written to disk.
SAVE: bool = True

# データ読み込み

In [None]:
# Read the raw train / test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Kaggle/atmacup#15/raw/train.csv",
        "/content/drive/MyDrive/Kaggle/atmacup#15/raw/test.csv",
    )
)

# 特徴抽出

In [None]:
# Stack the (user_id, anime_id) pairs of train and test into one frame so the
# user features are built from every pair present in either split.
# Only the two key columns are taken from each frame: any additional column
# coming from test would otherwise be carried along into the per-user
# aggregation that feeds the SVD.
key_cols = ["user_id", "anime_id"]
train_test = pd.concat(
    [train[key_cols], test[key_cols]],
    ignore_index=True,  # fresh 0..n-1 index instead of repeated per-frame indices
)
train_test

Unnamed: 0,user_id,anime_id
0,0008e10fb39e55447333,0669cc0219d468761195
1,0008e10fb39e55447333,111adb8835b8a1a2cf54
2,0008e10fb39e55447333,1fc8683c393432a2f9c7
3,0008e10fb39e55447333,2290175205d55e81b197
4,0008e10fb39e55447333,28f173b60331d5cabb0d
...,...,...
254072,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0
254073,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79
254074,ffe85a36cd20500faa58,f6c208226b6b69948053
254075,ffe85a36cd20500faa58,fe67592c312fc1e17745


In [None]:
# Label-encode anime_id: each distinct id is mapped to an integer in order of
# first appearance. A dict comprehension keeps the loop variables local, so no
# notebook-level name such as `id` is bound (which would shadow the builtin
# `id()` for the rest of the kernel session).
anime_id_dict = {
    anime_id: label
    for label, anime_id in enumerate(train_test["anime_id"].unique())
}
train_test["label_anime_id"] = train_test["anime_id"].map(anime_id_dict)
train_test.head()

Unnamed: 0,user_id,anime_id,label_anime_id
0,0008e10fb39e55447333,0669cc0219d468761195,0
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,1
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,2
3,0008e10fb39e55447333,2290175205d55e81b197,3
4,0008e10fb39e55447333,28f173b60331d5cabb0d,4


In [None]:
# One-hot encode the anime labels: one indicator column per distinct label.
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of
# the installed pandas version (pandas 2.0 changed the default to bool).
train_test = pd.get_dummies(
    train_test,
    columns=["label_anime_id"],
    dtype="uint8",
)
print(train_test.shape)


(254077, 1958)


In [None]:
# Preview the first five rows of the one-hot encoded frame.
train_test.head(n=5)

Unnamed: 0,user_id,anime_id,label_anime_id_0,label_anime_id_1,label_anime_id_2,label_anime_id_3,label_anime_id_4,label_anime_id_5,label_anime_id_6,label_anime_id_7,...,label_anime_id_1946,label_anime_id_1947,label_anime_id_1948,label_anime_id_1949,label_anime_id_1950,label_anime_id_1951,label_anime_id_1952,label_anime_id_1953,label_anime_id_1954,label_anime_id_1955
0,0008e10fb39e55447333,0669cc0219d468761195,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0008e10fb39e55447333,2290175205d55e81b197,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0008e10fb39e55447333,28f173b60331d5cabb0d,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Drop the raw anime_id column so that only user_id and the one-hot indicator
# columns remain. Re-assigning (instead of inplace=True) combined with
# errors="ignore" lets this cell be re-executed without raising a KeyError
# once the column is already gone.
train_test = train_test.drop(columns="anime_id", errors="ignore")
train_test.head(2)

Unnamed: 0,user_id,label_anime_id_0,label_anime_id_1,label_anime_id_2,label_anime_id_3,label_anime_id_4,label_anime_id_5,label_anime_id_6,label_anime_id_7,label_anime_id_8,...,label_anime_id_1946,label_anime_id_1947,label_anime_id_1948,label_anime_id_1949,label_anime_id_1950,label_anime_id_1951,label_anime_id_1952,label_anime_id_1953,label_anime_id_1954,label_anime_id_1955
0,0008e10fb39e55447333,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0008e10fb39e55447333,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Collapse the per-pair rows into one row per user by summing every
# remaining column within each user_id group.
per_user = train_test.groupby("user_id")
train_test_grouped = per_user.sum()
train_test_grouped

Unnamed: 0_level_0,label_anime_id_0,label_anime_id_1,label_anime_id_2,label_anime_id_3,label_anime_id_4,label_anime_id_5,label_anime_id_6,label_anime_id_7,label_anime_id_8,label_anime_id_9,...,label_anime_id_1946,label_anime_id_1947,label_anime_id_1948,label_anime_id_1949,label_anime_id_1950,label_anime_id_1951,label_anime_id_1952,label_anime_id_1953,label_anime_id_1954,label_anime_id_1955
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0008e10fb39e55447333,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
001a7aed2546342e2602,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
003d4b0257cc7849ffe1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
0054e700b5be6e074fb7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0059344eed7e8ca0b6c5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
feef23df0d53eec7d697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ff441af085c3522f62ba,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
ff5e8e9e3553b90f222a,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
ffa6ff8006f8630f3d11,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# SVD使う
n_components = 10
svd = TruncatedSVD(n_components=n_components, random_state=0)
svd_arr = svd.fit_transform(train_test_grouped.values)
col_df = pd.DataFrame(
    svd_arr,
    columns=[f"svd_userAttribution_{ix}" for ix in range(n_components)]
)
col_df.index = train_test_grouped.index
col_df.reset_index(inplace=True)
col_df

Unnamed: 0,user_id,svd_userAttribution_0,svd_userAttribution_1,svd_userAttribution_2,svd_userAttribution_3,svd_userAttribution_4,svd_userAttribution_5,svd_userAttribution_6,svd_userAttribution_7,svd_userAttribution_8,svd_userAttribution_9
0,0008e10fb39e55447333,2.320282,0.215112,-2.137685,-0.504110,-1.509799,-0.832248,-0.025398,0.707080,-0.225723,0.032335
1,001a7aed2546342e2602,7.526772,1.210042,1.322600,0.617609,5.943375,-0.345952,-0.460419,2.315124,2.271990,0.105711
2,003d4b0257cc7849ffe1,1.475604,1.798848,-0.770570,-0.969779,0.915891,1.746492,-0.031043,0.312903,0.977097,0.192619
3,0054e700b5be6e074fb7,0.587430,0.515717,-0.026395,0.288477,-0.007523,0.077417,-0.408697,-0.187947,-0.325125,0.030727
4,0059344eed7e8ca0b6c5,0.474244,0.440618,0.465711,0.345016,-0.124904,0.097517,-0.120966,-0.088613,0.095710,0.044116
...,...,...,...,...,...,...,...,...,...,...,...
1993,feef23df0d53eec7d697,0.207593,-0.033800,0.068533,0.128863,0.021088,-0.035622,-0.135301,-0.015812,0.244022,0.164896
1994,ff441af085c3522f62ba,4.715184,4.069717,-0.592703,-0.357364,1.365570,1.326627,-0.901827,0.555564,0.885439,1.401490
1995,ff5e8e9e3553b90f222a,1.805885,0.964629,-0.908834,-0.451100,0.597410,-0.283947,-0.587750,-0.436198,1.379999,0.705076
1996,ffa6ff8006f8630f3d11,2.680082,0.508787,-1.017213,-0.513164,2.263566,1.659873,-0.390212,-0.254580,1.115810,-0.464804


# 保存

In [None]:
if SAVE:
    # Make sure the output directory exists before writing: to_pickle does not
    # create missing parent directories and would fail only at this last step.
    SAVE_DIR.mkdir(parents=True, exist_ok=True)
    # Persist the per-user SVD feature table as a pickle file.
    col_df.to_pickle(SAVE_DIR / f"{SAVE_FILE_NAME}.pkl")

In [None]:
# Show the column names of the saved feature table as a plain list.
list(col_df.columns)

['user_id',
 'svd_userAttribution_0',
 'svd_userAttribution_1',
 'svd_userAttribution_2',
 'svd_userAttribution_3',
 'svd_userAttribution_4',
 'svd_userAttribution_5',
 'svd_userAttribution_6',
 'svd_userAttribution_7',
 'svd_userAttribution_8',
 'svd_userAttribution_9']