# Data Preprocessing

In [None]:
from os import path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Switch on seaborn's default plot theme for the figures drawn below
sns.set()

## Load Data Files

In [None]:
# Directory for outputs (not referenced by the cells visible in this notebook section)
output_path = "../predictions/"
# Directory the input CSV files are read from
data_path = "../wechat_algo_data1/"

In [None]:
# Training input data: read the two training tables from data_path
feed_info_file = path.join(data_path, 'feed_info.csv')
user_action_file = path.join(data_path, 'user_action.csv')
feed_info = pd.read_csv(feed_info_file)
user_action = pd.read_csv(user_action_file)

In [None]:
# Test data: 'test_b.csv' is the actual dataset; use 'test_a.csv' instead
# to run against the practice dataset.
testing_datafile = 'test_b.csv'
test_data_file = path.join(data_path, testing_datafile)
test_data = pd.read_csv(test_data_file)

In [None]:
# Report how many rows each loaded table contains
row_counts = (len(feed_info), len(user_action), len(test_data))
print("len(feed_info) = %d\nlen(user_action) = %d\nlen(test_data) = %d" % row_counts)

## Data Cleaning

### Impute Missing Values for 'bgm_song_id' and 'bgm_singer_id'

In [None]:
# The strategy here is to replace all the NaNs by random samples from
# the set of non-NaN 'bgm_song_id' and 'bgm_singer_id'.
# The two ids are always copied together as a pair taken from one complete
# row, so a row missing only one of them has both values overwritten.
bgm_columns = ['bgm_song_id', 'bgm_singer_id']
NaN_bgm_idx = feed_info[feed_info[bgm_columns].isna().any(axis=1)].index
notNaN_bgm = feed_info.loc[feed_info[bgm_columns].notna().all(axis=1), bgm_columns]
# Sample without replacement whenever there are enough complete rows (the
# original behaviour). DataFrame.sample raises ValueError when asked for more
# rows than exist with replace=False, so fall back to sampling with
# replacement when the incomplete rows outnumber the complete ones.
imputed_bgm = notNaN_bgm.sample(
    len(NaN_bgm_idx),
    replace=len(NaN_bgm_idx) > len(notNaN_bgm),
)
# Assign the raw values rather than the sampled frame so that pandas does not
# try to align on the sampled rows' index labels.
feed_info.loc[NaN_bgm_idx, bgm_columns] = imputed_bgm.to_numpy()

In [None]:
# Histogram of 'bgm_song_id' after imputation
plt.hist(feed_info['bgm_song_id']);

In [None]:
# Histogram of 'bgm_singer_id' after imputation
plt.hist(feed_info['bgm_singer_id']);

### Merging Datasets

In [None]:
# Feature selection: restrict both tables to the columns used from here on
feed_columns = ['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id']
# 'comment', 'follow' and 'favorite' are currently not selected
action_columns = [
    'userid', 'feedid', 'date_', 'device', 'play', 'stay',
    'read_comment', 'like', 'click_avatar', 'forward',
]
feed_info = feed_info[feed_columns]
user_action = user_action[action_columns]

In [None]:
# Training data: join the feed attributes onto every user action via
# 'feedid', order the records by 'date_' and renumber the index from 0
merged_data = (
    user_action
    .merge(feed_info, on=['feedid'])
    .sort_values('date_')
    .reset_index(drop=True)
)
merged_data.head()

In [None]:
# Testing data
# Use a left join so that every row of test_data is kept, in its original
# order, even when its 'feedid' has no entry in feed_info. The default inner
# join would silently drop such rows and leave the testing set short; with a
# left join their feed attributes are NaN instead.
X_test = test_data.merge(feed_info, on='feedid', how='left')
X_test.head()

### Predict Values of 'play' and 'stay' for Testing Dataset

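To make use of the 'play' and 'stay' attributes from the training dataset, we need values for these two features in the testing dataset as well. The strategy here is to treat the imputation as a regression problem: a model is fitted on the training data to predict 'play' and 'stay' from the features that the two datasets share, and its predictions are used as the values for the testing dataset.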

In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [None]:
# Regression targets: the two columns to be predicted for the testing dataset
y_play_stay = merged_data[['play', 'stay']]
# Regression inputs: exactly the columns present in X_test, in the same order
X_play_stay = merged_data[list(X_test.columns)]

In [None]:
# Disabled alternative to the XGBoost model defined below: scikit-learn's
# GradientBoostingRegressor wrapped in MultiOutputRegressor.
# model = MultiOutputRegressor(GradientBoostingRegressor(
#     n_estimators = 600, 
#     subsample = 0.95, 
#     max_depth = 11, 
#     tol = 1e-5, 
#     warm_start = True
# ), n_jobs = -1)

In [None]:
# Settings for the XGBoost regressor that is fitted for each target column
xgb_params = dict(
    n_estimators=1500,
    max_depth=12,
    learning_rate=0.1,
    subsample=0.95,
    colsample_bytree=0.95,
    tree_method='gpu_hist',
    n_jobs=-1,
)
# MultiOutputRegressor fits one copy of the wrapped regressor per target
model = MultiOutputRegressor(xgb.XGBRegressor(**xgb_params), n_jobs=-1)

In [None]:
# Fit the 'play'/'stay' regressor; the IPython %time magic reports how long it takes
%time model.fit(X_play_stay, y_play_stay)

In [None]:
# Score on the same data the model was fitted on, so this measures
# training fit only, not how well the model generalizes
model.score(X_play_stay, y_play_stay)

In [None]:
# Predict 'play' and 'stay' for the testing dataset and cast the predictions
# to integers (astype(int) truncates toward zero)
play_stay_predicted = pd.DataFrame(model.predict(X_test), columns=['play', 'stay']).astype(int)
play_stay_predicted

In [None]:
# Append the predicted 'play' and 'stay' columns to the testing features
X_test = pd.concat([X_test, play_stay_predicted], axis='columns')

## Generate Training and Testing Datasets

In [None]:
# The four label columns that make up y_train
target_columns = ['read_comment', 'like', 'click_avatar', 'forward']
y_train = merged_data[target_columns]
# Features: every remaining column except 'date_'
X_train = merged_data.drop(columns=['date_', *target_columns])
X_train

In [None]:
# Important!!
# The testing features must have the same columns, in the same order, as the
# training features, so reselect them following X_train's column order
X_test = X_test[list(X_train.columns)]
X_test

In [None]:
# Save datasets as CSV files (without the index column) under data_path
for frame, filename in (
    (X_train, '1.1_X_train.csv'),
    (y_train, '1.1_y_train.csv'),
    (X_test, '1.1_X_test.csv'),
):
    frame.to_csv(path.join(data_path, filename), index=False)