# Using Content mean

flow
1. 콘텐츠 별 평점 평균 구하기
2. 레코드별로, 각 콘텐츠의 평균을 구해서 넣기 없으면 global 평균을 활용

In [1]:
import os
from datetime import datetime as dt


import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error

## load data

In [2]:
train_df = pd.read_csv("open/train.csv", index_col='ID')
test_df = pd.read_csv("open/test.csv", index_col='ID')
sample_submission_df = pd.read_csv("open/sample_submission.csv")

train_df.shape, test_df.shape, sample_submission_df.shape

((871393, 9), (159621, 8), (159621, 2))

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 871393 entries, TRAIN_000000 to TRAIN_871392
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   User-ID              871393 non-null  object 
 1   Book-ID              871393 non-null  object 
 2   Book-Rating          871393 non-null  int64  
 3   Age                  871393 non-null  float64
 4   Location             871393 non-null  object 
 5   Book-Title           871393 non-null  object 
 6   Book-Author          871393 non-null  object 
 7   Year-Of-Publication  871393 non-null  float64
 8   Publisher            871393 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 66.5+ MB


In [4]:
train_df['Book-Rating'] = train_df['Book-Rating'].astype('float32')

# Get global mean

In [6]:
global_mean = train_df[train_df['Book-Rating']!=0]['Book-Rating'].mean()
global_mean

7.619085

In [9]:
train_pred = np.full((train_df.shape[0],), global_mean)
rmse = mean_squared_error(
    train_df['Book-Rating'].replace(0, global_mean).values, 
    train_pred, squared=False)
rmse

1.1243223

# Get Content mean

In [None]:
# book_mean = train_df[['Book-ID', 'Book-Rating']].groupby(by='Book-ID').mean()
# book_mean.tail()

In [None]:
# book_median = train_df[['Book-ID', 'Book-Rating']].groupby(by='Book-ID').median()
# book_median.tail()

In [10]:
book_mean_nonzero = train_df[['Book-ID', 'Book-Rating']].replace(
    0, np.nan).groupby(by='Book-ID').mean()
book_mean_nonzero.tail()

Unnamed: 0_level_0,Book-Rating
Book-ID,Unnamed: 1_level_1
BOOK_270051,
BOOK_270052,8.25
BOOK_270053,6.333333
BOOK_270054,
BOOK_270055,7.666667


In [11]:
# book_median_nonzero = train_df[['Book-ID', 'Book-Rating']].replace(
#     0, np.nan).groupby(by='Book-ID').median()
# book_median_nonzero.tail()

# Evaluate

In [None]:
# train_pred = np.full((train_df.shape[0],), global_mean)

# for i, row in tqdm(enumerate(train_pred), total=train_pred.shape):
#     train_pred[i] = book_mean_nonzero.loc[train_df.iloc[i]['Book-ID']].values[0]

# rmse = mean_squared_error(
#     train_df['Book-Rating'].values, train_pred, squared=False)
# rmse

In [None]:
# train_pred = np.full((train_df.shape[0],), global_mean)

# for i, row in tqdm(enumerate(train_pred), total=train_pred.shape):
#     train_pred[i] = book_median.loc[train_df.iloc[i]['Book-ID']].values[0]

# rmse = mean_squared_error(
#     train_df['Book-Rating'].values, train_pred, squared=False)
# rmse

In [13]:
np.isnan(np.nan)

True

In [15]:
train_pred = np.full((train_df.shape[0],), 0)

for i, row in tqdm(enumerate(train_pred), total=train_pred.shape[0]):
    value = book_mean_nonzero.loc[train_df.iloc[i]['Book-ID']].values[0]
    if not np.isnan(value):
        train_pred[i] = value

rmse = mean_squared_error(
    train_df['Book-Rating'].values,
    train_pred, squared=False)
rmse

100%|██████████| 871393/871393 [01:49<00:00, 7987.00it/s]


4.923649874683864

In [None]:
# train_pred = np.full((train_df.shape[0],), global_mean)

# for i, row in tqdm(enumerate(train_pred), total=train_pred.shape[0]):
#     train_pred[i] = book_median_nonzero.loc[train_df.iloc[i]['Book-ID']].values[0]

# train_pred = np.where(np.isnan(train_pred), global_mean, train_pred)
# rmse = mean_squared_error(
#     train_df['Book-Rating'].values, train_pred, squared=False)
# rmse

# 가장 낮은 mse의 데이터 저장

In [19]:
test_pred = np.full((test_df.shape[0],), global_mean)

for i, row in tqdm(enumerate(test_pred), total=test_pred.shape[0]):
    book_name = test_df.iloc[i]['Book-ID']
    if book_name in book_mean_nonzero.index:
        value = book_mean_nonzero.loc[book_name].values[0]
        if not np.isnan(value):
            test_pred[i] = value

100%|██████████| 159621/159621 [00:18<00:00, 8585.17it/s]


In [20]:
test_pred

array([7.3846154, 7.6363635, 7.619085 , ..., 7.619085 , 9.       ,
       7.619085 ], dtype=float32)

In [None]:
sample_submission_df = pd.read_csv("open/sample_submission.csv")
sample_submission_df

In [21]:
mname = 'using-content-mean'

In [22]:
def make_report(template, test_pred, mname):
    template['Book-Rating'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    template.to_csv(f'results/{mname}-{now}.csv', index=False)
    
make_report(sample_submission_df, test_pred, mname)

# 제출 결과: test_rmse
- 4.30 (include zeros)
- 6.12 (ignore zeros)

.... 폭망