In [101]:
import numpy as np
import pandas as pd
from collections import OrderedDict, namedtuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from utils import SparseFeat, DenseFeat#, build_input_features

In [42]:
class VarLenSparseFeat(
    namedtuple(
        'VarLenFeat',
        ['name', 'dimension', 'maxlen', 'combiner', 'use_hash', 'dtype',
         'embedding_name', 'embedding'],
    )
):
    """Immutable description of a variable-length (sequence) sparse feature.

    Fields: ``name``, ``dimension``, ``maxlen``, ``combiner``, ``use_hash``,
    ``dtype``, ``embedding_name`` and ``embedding``. When ``embedding_name``
    is not supplied it defaults to ``name``.
    """

    __slots__ = ()

    def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype="float32",
                embedding_name=None, embedding=True):
        # A feature without an explicit embedding name reuses its own name.
        resolved_embedding_name = name if embedding_name is None else embedding_name
        return super().__new__(
            cls, name, dimension, maxlen, combiner, use_hash, dtype,
            resolved_embedding_name, embedding,
        )
    
def split(x, vocab=None):
    """Encode a ``'|'``-separated string as a list of integer ids.

    Every token that has not been seen before is added to the mapping with
    the next free id. Ids start at 1 because the value 0 is reserved as the
    padding value for sequence inputs.

    Args:
        x: String of tokens separated by ``'|'``.
        vocab: Optional dict mapping token -> id, updated in place. When
            omitted, the notebook-level ``key2index`` dict is used, which is
            the behaviour existing one-argument callers rely on.

    Returns:
        List of integer ids, one per token, in the order the tokens appear.
    """
    # Resolve the global lazily so the function is usable (and testable)
    # without ``key2index`` whenever an explicit mapping is passed.
    mapping = key2index if vocab is None else vocab
    ids = []
    for key in x.split('|'):
        if key not in mapping:
            # Notice: 0 is the padding value, so valid tokens are numbered from 1
            mapping[key] = len(mapping) + 1
        ids.append(mapping[key])
    return ids


def build_input_features(feature_columns):
    """Assign each distinct feature name a ``(start, end)`` index pair.

    Features are visited in order and laid out back to back: a ``SparseFeat``
    takes one position, a ``DenseFeat`` takes ``feat.dimension`` positions and
    a ``VarLenSparseFeat`` takes ``feat.maxlen`` positions. A feature whose
    name was already seen is skipped, so passing the same column twice does
    not widen the layout.

    Args:
        feature_columns: Iterable of feature column objects exposing ``name``.

    Returns:
        OrderedDict mapping feature name -> ``(start, end)`` with
        ``end == start + width``, in first-seen order.

    Raises:
        TypeError: If a column is not one of the three supported types.
    """
    features = OrderedDict()

    start = 0
    for feat in feature_columns:
        feat_name = feat.name
        if feat_name in features:
            continue
        if isinstance(feat, SparseFeat):
            width = 1
        elif isinstance(feat, DenseFeat):
            width = feat.dimension
        elif isinstance(feat, VarLenSparseFeat):
            width = feat.maxlen
        else:
            # Format the type into the message; passing it as a second
            # argument made the error render as a tuple.
            raise TypeError("Invalid feature column type, got {}".format(type(feat)))
        features[feat_name] = (start, start + width)
        start += width
    return features


def get_feature_names(feature_columns):
    """Return the distinct feature names of ``feature_columns`` as a list.

    The names come back in the order ``build_input_features`` registers them.
    """
    # Iterating the mapping yields its keys, i.e. the feature names.
    return list(build_input_features(feature_columns))

# ML-Sample

In [43]:
# Directory and file name of the MovieLens sample.
root = '/data/private/Ad/ml-20m/'
file_name = 'movielens_sample.txt'

data = pd.read_csv(root + file_name)

# Categorical columns to label-encode, and the label column.
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

In [44]:
# 1. Label-encode each sparse column in place, then turn the genre strings
#    into padded id sequences.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Start from an empty vocabulary; split() fills key2index as it meets new genres.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
genres_length = np.array([len(ids) for ids in genres_list])
max_len = genres_length.max()
# Notice: padding='post' appends the padding after the real ids.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [45]:
# 2. Describe every input field: one SparseFeat per categorical column (sized
#    by its number of distinct values) plus one sequence feature for genres.
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
# Notice: id 0 is the padding value for the sequence input, hence the "+ 1".
varlen_feature_columns = [
    VarLenSparseFeat('genres', len(key2index) + 1, max_len, combiner='mean'),
]

# The linear and DNN parts use the same columns.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


In [122]:
# 3. Assemble the model input: the label column first, then the fixed-length
#    feature columns, then the padded genre ids appended along axis 1.
model_input = {
    name: data[name]
    for name in target + feature_names
    if name != 'genres'
}
model_input = np.concatenate(
    (np.stack(list(model_input.values()), axis=-1), genres_list),
    axis=1,
)

In [123]:
# Hold out 20% of the rows as the test split. The fixed random_state makes the
# split (and therefore the pickled dataset) reproducible across kernel restarts.
train, test = train_test_split(model_input, test_size=0.2, random_state=42)

In [131]:
import pickle

# Four objects are written back to back into one file; a reader has to call
# pickle.load the same number of times, in the same order, to get them back.
with open(root + 'np_prepro/dataset_fm.pkl', 'wb') as f:
    pickle.dump(train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(test, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(feature_names, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(
        (linear_feature_columns, dnn_feature_columns),
        f,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [129]:
# Sanity check: shape of the training split, shape of the source frame,
# the padded genre sequence length, and the model's input feature names.
train.shape, data.shape, max_len, feature_names

((160, 12),
 (200, 10),
 5,
 ['movie_id', 'user_id', 'gender', 'age', 'occupation', 'zip', 'genres'])

# ML-20M

In [137]:
root = '/data/private/Ad/ml-20m/'

ratings = pd.read_csv(root + 'ratings.csv')
movies = pd.read_csv(root + 'movies.csv')

# `genres` is not listed here: it is encoded separately as a variable-length
# sequence feature.
sparse_features = ["userId", "movieId"]
target = ['rating']

# No `on=` is given, so pandas joins on the columns the two frames share
# (presumably just `movieId` -- confirm against the CSV headers).
data = pd.merge(ratings, movies)
# Binarise the label: ratings above 3 become 1, everything else 0. The
# comparison is vectorised instead of calling a Python lambda once per row.
data.loc[:, 'rating'] = (data['rating'] > 3).astype('int64')

In [150]:
# 1. Label-encode each sparse column in place, then turn the genre strings
#    into padded id sequences.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Start from an empty vocabulary; split() fills key2index as it meets new genres.
key2index = {}
genres_list = [split(genres) for genres in data['genres'].values]
genres_length = np.array([len(ids) for ids in genres_list])
max_len = genres_length.max()
# Notice: padding='post' appends the padding after the real ids.
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post')

In [154]:
# 2. Describe every input field: one SparseFeat per categorical column (sized
#    by its number of distinct values) plus one sequence feature for genres.
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
# Notice: id 0 is the padding value for the sequence input, hence the "+ 1".
varlen_feature_columns = [
    VarLenSparseFeat('genres', len(key2index) + 1, max_len, combiner='mean'),
]

# The linear and DNN parts use the same columns.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


In [155]:
# 3. Assemble the model input: the label column first, then the fixed-length
#    feature columns, then the padded genre ids appended along axis 1.
model_input = {
    name: data[name]
    for name in target + feature_names
    if name != 'genres'
}
model_input = np.concatenate(
    (np.stack(list(model_input.values()), axis=-1), genres_list),
    axis=1,
)

In [156]:
# Hold out 20% of the rows as the test split. The fixed random_state makes the
# split (and therefore the pickled dataset) reproducible across kernel restarts.
train, test = train_test_split(model_input, test_size=0.2, random_state=42)

In [158]:
import pickle

# NOTE(review): this is the same path the ML-Sample section writes to, so
# running this cell replaces the sample dataset on disk -- confirm that is intended.
# Four objects are written back to back into one file; a reader has to call
# pickle.load the same number of times, in the same order, to get them back.
with open(root + 'np_prepro/dataset_fm.pkl', 'wb') as f:
    pickle.dump(train, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(test, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(feature_names, f, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(
        (linear_feature_columns, dnn_feature_columns),
        f,
        protocol=pickle.HIGHEST_PROTOCOL,
    )