# MLP Modeling
새로운 유저, 새로운 콘텐츠에 대해서만 예측해야 하기 때문에 classic CF는 못쓴다. 특징 추출하고 mlp 예측 가자

시나리오
1. feature engineering & selection
2. mlp 학습
3. hyperparameter tuning
4. mlp 예측
5. 결과 제출

In [39]:
import os
from datetime import datetime as dt

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import optuna

## load data

In [4]:
# Train/test tables are indexed by their 'ID' column; the submission template
# keeps its default integer index.
id_indexed = dict(index_col='ID')
train_df = pd.read_csv("open/train.csv", **id_indexed)
test_df = pd.read_csv("open/test.csv", **id_indexed)
sample_submission_df = pd.read_csv("open/sample_submission.csv")

# Show (rows, columns) of each frame as a quick sanity check.
tuple(frame.shape for frame in (train_df, test_df, sample_submission_df))

((871393, 9), (159621, 8), (159621, 2))

In [5]:
# Column dtypes and non-null counts of the training table.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 871393 entries, TRAIN_000000 to TRAIN_871392
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   User-ID              871393 non-null  object 
 1   Book-ID              871393 non-null  object 
 2   Book-Rating          871393 non-null  int64  
 3   Age                  871393 non-null  float64
 4   Location             871393 non-null  object 
 5   Book-Title           871393 non-null  object 
 6   Book-Author          871393 non-null  object 
 7   Year-Of-Publication  871393 non-null  float64
 8   Publisher            871393 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 66.5+ MB


## preprocess data

In [6]:
# Distinct user and book identifiers that appear in the training table.
train_users, train_books = (
    train_df[id_col].unique() for id_col in ('User-ID', 'Book-ID')
)

train_users.size, train_books.size

(83256, 243441)

In [7]:
# Distinct user and book identifiers that appear in the test table.
test_users, test_books = (
    test_df[id_col].unique() for id_col in ('User-ID', 'Book-ID')
)

test_users.size, test_books.size

(21909, 62333)

In [8]:
# Size of the combined (train ∪ test) user and book id sets; a union larger than
# the train-only count means the test split contains unseen ids.
all_users = set(train_users).union(test_users)
all_books = set(train_books).union(test_books)
len(all_users), len(all_books)

(92101, 270056)

새로운 유저, 새로운 책이 분명 있음

In [9]:
# Preview the first rows of the raw training table.
train_df.head()

Unnamed: 0_level_0,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book


User feature
 * Age: int
 * Location: string

Book feature
 * Book-Title: string
 * Book-Author: string
 * Year-Of-Publication: int
 * Publisher: string

특징의 분해 이전에 label 형태로도 해볼 수 있음

# Feature Selection

In [10]:
# Every column of the test table, captured before any feature engineering below.
features = test_df.columns
features

Index(['User-ID', 'Book-ID', 'Age', 'Location', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher'],
      dtype='object')

In [11]:
# Numeric columns; these are the columns passed to the StandardScaler below.
continuous_features = ['Age', 'Year-Of-Publication']
# String-valued columns. NOTE(review): this list is not referenced again in the
# visible cells, and 'Location' is split into other columns and dropped below.
categorical_features = ['Location', 'Book-Title', 'Book-Author', 'Publisher']

In [12]:
# Split "city, state, country" style Location strings into separate columns.
# The guard makes the cell safe to re-run: once 'Location' has been consumed,
# a second execution is a no-op instead of a KeyError.
if 'Location' in train_df.columns:
    # Strip spaces, then split on commas; the vectorized .str accessor yields NaN
    # (rather than raising IndexError) when a value has fewer than two parts.
    train_location_parts = train_df['Location'].str.replace(' ', '').str.split(',')
    train_df = train_df.assign(
        Location_country=train_location_parts.str[-1],
        # Second-to-last component; in the previewed rows this is the
        # state/province (e.g. "newbrunswick"), despite the column name.
        Location_city=train_location_parts.str[-2],
    ).drop(columns='Location')
train_df.head()

Unnamed: 0_level_0,User-ID,Book-ID,Book-Rating,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location_country,Location_city
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TRAIN_000000,USER_00000,BOOK_044368,8,23.0,Road Taken,Rona Jaffe,2001.0,Mira,canada,newbrunswick
TRAIN_000001,USER_00000,BOOK_081205,8,23.0,Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books,canada,newbrunswick
TRAIN_000002,USER_00000,BOOK_086781,0,23.0,Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books,canada,newbrunswick
TRAIN_000003,USER_00000,BOOK_098622,0,23.0,Mother Earth Father Sky,Sue Harrison,1991.0,Avon,canada,newbrunswick
TRAIN_000004,USER_00000,BOOK_180810,8,23.0,She Who Remembers,Linda Lay Shuler,1989.0,Signet Book,canada,newbrunswick


In [13]:
# Apply the same Location split to the test table as was applied to train.
# The guard makes the cell safe to re-run: once 'Location' has been consumed,
# a second execution is a no-op instead of a KeyError.
if 'Location' in test_df.columns:
    # Strip spaces, then split on commas; the vectorized .str accessor yields NaN
    # (rather than raising IndexError) when a value has fewer than two parts.
    test_location_parts = test_df['Location'].str.replace(' ', '').str.split(',')
    test_df = test_df.assign(
        Location_country=test_location_parts.str[-1],
        # Second-to-last component; in the previewed rows this is the
        # state/province (e.g. "ohio"), despite the column name.
        Location_city=test_location_parts.str[-2],
    ).drop(columns='Location')
test_df.head()

Unnamed: 0_level_0,User-ID,Book-ID,Age,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location_country,Location_city
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TEST_000000,USER_00008,BOOK_047966,37.0,Birds of Prey: A Novel of Suspense,J.A. Jance,2002.0,Avon,usa,ohio
TEST_000001,USER_00008,BOOK_119494,37.0,Midnight Voices,JOHN SAUL,2003.0,Ballantine Books,usa,ohio
TEST_000002,USER_00008,BOOK_151775,37.0,Breaking Free : A Prescription for Personal an...,David M. Noer,1996.0,Jossey-Bass,usa,ohio
TEST_000003,USER_00008,BOOK_176255,37.0,Bitter Harvest,Ann Rule,1999.0,Pocket,usa,ohio
TEST_000004,USER_00008,BOOK_187307,37.0,Embraced by the Light,Betty J. Eadie,1994.0,Bantam Books,usa,ohio


# encoding

for location_country, publisher

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# One-hot encode the country column. The encoder is fitted on train only;
# handle_unknown='ignore' lets transform accept categories not seen in train.
cont_encoder = OneHotEncoder(handle_unknown='ignore')
cont_encoder.fit(train_df[['Location_country']])

# Densify the sparse encoder output so it can be concatenated with the scaled
# numeric columns later.
train_country_sparse = cont_encoder.transform(train_df[['Location_country']])
test_country_sparse = cont_encoder.transform(test_df[['Location_country']])
train_country_array = train_country_sparse.toarray()
test_country_array = test_country_sparse.toarray()

In [16]:
# pub_encoder = OneHotEncoder(handle_unknown='ignore')
# train_publisher_array = pub_encoder.fit_transform(train_df[['Publisher']]).toarray()
# test_publisher_array = pub_encoder.transform(test_df[['Publisher']]).toarray()

# scaling

In [17]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Standardize the numeric columns; statistics come from the training table only
# and are reused unchanged for the test table.
scaler = StandardScaler().fit(train_df[continuous_features])
train_cont_array, test_cont_array = (
    scaler.transform(frame[continuous_features]) for frame in (train_df, test_df)
)

# merge processed data

In [19]:
# Shapes of the two feature blocks that are concatenated in the next cell.
train_cont_array.shape, train_country_array.shape#, train_publisher_array.shape

((871393, 2), (871393, 243))

In [20]:
# Column-wise stack: scaled numeric features first, then the one-hot country
# block. (The publisher one-hot block is currently left out.)
train_blocks = (train_cont_array, train_country_array)  # train_publisher_array
test_blocks = (test_cont_array, test_country_array)  # test_publisher_array
train_x = np.hstack(train_blocks)
test_x = np.hstack(test_blocks)

train_x.shape, test_x.shape

((871393, 245), (159621, 245))

In [21]:
# Regression target: the rating column as a plain NumPy array.
train_y = train_df['Book-Rating'].to_numpy()
train_y.shape

(871393,)

In [22]:
# train_array = np.concatenate([train_cont_array, train_country_array, train_publisher_array], axis=1)
# test_array = np.concatenate([test_cont_array, test_country_array, test_publisher_array], axis=1)

## first Trial

In [23]:
# Model tag; passed to make_report below, where it becomes part of the CSV name.
mname = 'mlpregressor'

In [24]:
from sklearn.neural_network import MLPRegressor

In [25]:
# Baseline MLP with scikit-learn's default architecture and optimizer settings.
# random_state is pinned so weight initialization and batch shuffling are
# repeatable across kernel restarts; without it every run gives a different model.
mlp = MLPRegressor(random_state=42)
mlp.fit(train_x, train_y)



In [34]:
# Predictions on the same rows the model was fitted on (in-sample).
train_pred = mlp.predict(train_x)
train_pred

array([3.24132395, 2.98105205, 2.98105205, ..., 3.4007923 , 2.23867367,
       2.96194159])

In [35]:
# One prediction per training row.
train_pred.shape

(871393,)

In [38]:
# Predictions for the test feature matrix; these go into the submission file.
test_pred = mlp.predict(test_x)
test_pred

array([2.53896857, 2.56751312, 2.36770129, ..., 3.19584419, 3.31259461,
       3.40015742])

In [40]:
# Re-read the submission template from disk so it starts from the file's
# original 'Book-Rating' values before predictions are written into it.
sample_submission_df = pd.read_csv("open/sample_submission.csv")
sample_submission_df

Unnamed: 0,ID,Book-Rating
0,TEST_000000,0
1,TEST_000001,0
2,TEST_000002,0
3,TEST_000003,0
4,TEST_000004,0
...,...,...
159616,TEST_159616,0
159617,TEST_159617,0
159618,TEST_159618,0
159619,TEST_159619,0


In [41]:
def make_report(template, test_pred, mname):
    """Write a submission CSV containing ``test_pred`` and return its path.

    The file is written to ``results/{mname}-{yy-mm-dd}.csv`` (today's date),
    without the DataFrame index.

    Parameters
    ----------
    template : pandas.DataFrame
        Submission template. It is copied, so the caller's frame is left
        untouched.
    test_pred : array-like
        Values assigned to the ``'Book-Rating'`` column of the copy.
    mname : str
        Model tag used as the file-name prefix.

    Returns
    -------
    str
        Relative path of the CSV that was written.
    """
    # Work on a copy so repeated calls never see a previously filled template.
    report = template.copy()
    report['Book-Rating'] = test_pred
    now = dt.strftime(dt.now(), '%y-%m-%d')
    # to_csv does not create missing parent directories.
    os.makedirs('results', exist_ok=True)
    report_path = f'results/{mname}-{now}.csv'
    report.to_csv(report_path, index=False)
    return report_path
    
# Write today's submission file for the current model's test predictions.
make_report(template=sample_submission_df, test_pred=test_pred, mname=mname)

In [36]:
from sklearn.metrics import mean_squared_error

In [37]:
# In-sample RMSE (computed on the rows the model was fitted on, so it is not a
# validation estimate). The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated and is removed in
# newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(train_y, train_pred))
rmse

3.7931417675775236

test_rmse = 3.8528787988