In [1]:
import os

import numpy as np
import pandas as pd
import sklearn.preprocessing

# Preparation

In [99]:
# Root folders: the untouched libFM inputs and the converted outputs.
ORIGINAL_DIR = "../original_data"
OUT_DIR = "../data"

# One sub-folder per dataset below each root.
ORIGINAL_FRAPPE_DIR, ORIGINAL_MLTAG_DIR, ORIGINAL_MEETUP_DIR = (
    os.path.join(ORIGINAL_DIR, dataset) for dataset in ("frappe", "ml-tag", "meetup")
)
OUT_FRAPPE_DIR, OUT_MLTAG_DIR, OUT_MEETUP_DIR = (
    os.path.join(OUT_DIR, dataset) for dataset in ("frappe", "ml-tag", "meetup")
)

In [3]:
def cumulative_begin(labelencoders, *labels):
    """Return the total number of classes of the encoders selected by ``labels``.

    Args:
        labelencoders: Mapping from a label to an object exposing
            ``classes_.shape[0]`` (e.g. the fitted encoders built in this notebook).
        *labels: Keys of ``labelencoders`` whose class counts are added up;
            a key given twice is counted twice.

    Returns:
        The sum of ``classes_.shape[0]`` over ``labels`` (0 when no label is given).
        Presumably used as the first free id after those features -- TODO confirm
        against the callers.
    """
    return sum(labelencoders[key].classes_.shape[0] for key in labels)


In [4]:
def cumulative_count(counts, *labels):
    """Return the sum of the entries of ``counts`` selected by ``labels``.

    Args:
        counts: Subscriptable container (e.g. a dict) mapping a label to a number.
        *labels: Keys of ``counts`` to add up; a key given twice is counted twice.

    Returns:
        ``counts[l1] + counts[l2] + ...`` in the order given, or 0 when no label
        is passed.
    """
    return sum(counts[key] for key in labels)


In [6]:
def libfm_names(names):
    """Build the flat column list used to parse a libFM-style line.

    The list starts with ``"value"`` and then holds, for every entry of
    ``names`` in order, the entry itself followed by ``"<entry>_value"``.

    Args:
        names: Iterable of feature names (strings).

    Returns:
        A new list of ``1 + 2 * len(names)`` column names.
    """
    return ["value"] + [
        column
        for feature in names
        for column in (feature, feature + "_value")
    ]


# Frappe dataset

## Load original dataset and detect column names

In [35]:
# Columns treated as the user side and as the item side of a record.
frappe_user_feature_columns = [
    "user", "daytime", "weekday", "isweekend",
    "homework", "weather", "country", "city",
]
frappe_item_feature_columns = ["item", "cost"]

# All feature fields, in the order used to name the parsed libFM columns.
frappe_names = [
    "user", "item", "daytime", "weekday", "isweekend",
    "homework", "cost", "weather", "country", "city",
]

for frappe_column_group in (frappe_user_feature_columns, frappe_item_feature_columns, frappe_names):
    print(frappe_column_group)

['user', 'daytime', 'weekday', 'isweekend', 'homework', 'weather', 'country', 'city']
['item', 'cost']
['user', 'item', 'daytime', 'weekday', 'isweekend', 'homework', 'cost', 'weather', 'country', 'city']


In [36]:
# Expand the field list into "value", then "<field>", "<field>_value" for every field.
frappe_libfm_names = libfm_names(names=frappe_names)
frappe_libfm_names

['value',
 'user',
 'user_value',
 'item',
 'item_value',
 'daytime',
 'daytime_value',
 'weekday',
 'weekday_value',
 'isweekend',
 'isweekend_value',
 'homework',
 'homework_value',
 'cost',
 'cost_value',
 'weather',
 'weather_value',
 'country',
 'country_value',
 'city',
 'city_value']

In [37]:
# Read the training split. Splitting on either a space or a colon spreads each
# "id:value" token over two columns; a regex separator needs the python engine.
frappe_train_path = os.path.join(ORIGINAL_FRAPPE_DIR, "frappe.train.libfm")
df_frappe_train = pd.read_csv(
    frappe_train_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=frappe_libfm_names,
)
print(df_frappe_train.shape)
df_frappe_train.head()

(202027, 21)


Unnamed: 0,value,user,user_value,item,item_value,daytime,daytime_value,weekday,weekday_value,isweekend,...,homework,homework_value,cost,cost_value,weather,weather_value,country,country_value,city,city_value
0,-1,451,1,4149,1,5041,1,5046,1,5053,...,5055,1,5058,1,5060,1,5069,1,5149,1
1,-1,91,1,3503,1,5041,1,5047,1,5053,...,5056,1,5058,1,5065,1,5095,1,5149,1
2,1,168,1,983,1,5040,1,5050,1,5054,...,5055,1,5058,1,5060,1,5069,1,5207,1
3,-1,620,1,1743,1,5045,1,5051,1,5054,...,5055,1,5058,1,5061,1,5073,1,5149,1
4,-1,46,1,2692,1,5040,1,5049,1,5054,...,5055,1,5058,1,5060,1,5086,1,5211,1


In [38]:
# Read the validation split with the same space/colon tokenisation as the other splits.
frappe_validation_path = os.path.join(ORIGINAL_FRAPPE_DIR, "frappe.validation.libfm")
df_frappe_validation = pd.read_csv(
    frappe_validation_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=frappe_libfm_names,
)
print(df_frappe_validation.shape)
df_frappe_validation.head()

(57722, 21)


Unnamed: 0,value,user,user_value,item,item_value,daytime,daytime_value,weekday,weekday_value,isweekend,...,homework,homework_value,cost,cost_value,weather,weather_value,country,country_value,city,city_value
0,1,266,1,1244,1,5042,1,5049,1,5054,...,5055,1,5058,1,5062,1,5074,1,5149,1
1,-1,138,1,1296,1,5041,1,5052,1,5053,...,5055,1,5058,1,5061,1,5085,1,5201,1
2,-1,317,1,1694,1,5041,1,5047,1,5053,...,5055,1,5058,1,5061,1,5070,1,5150,1
3,-1,14,1,2659,1,5043,1,5052,1,5054,...,5055,1,5058,1,5064,1,5076,1,5205,1
4,1,123,1,4068,1,5043,1,5050,1,5054,...,5056,1,5058,1,5060,1,5069,1,5149,1


In [39]:
# Read the test split with the same space/colon tokenisation as the other splits.
frappe_test_path = os.path.join(ORIGINAL_FRAPPE_DIR, "frappe.test.libfm")
df_frappe_test = pd.read_csv(
    frappe_test_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=frappe_libfm_names,
)
print(df_frappe_test.shape)
df_frappe_test.head()

(28860, 21)


Unnamed: 0,value,user,user_value,item,item_value,daytime,daytime_value,weekday,weekday_value,isweekend,...,homework,homework_value,cost,cost_value,weather,weather_value,country,country_value,city,city_value
0,-1,204,1,4798,1,5041,1,5046,1,5053,...,5055,1,5058,1,5060,1,5073,1,5183,1
1,1,42,1,1572,1,5042,1,5047,1,5053,...,5055,1,5058,1,5060,1,5070,1,5150,1
2,1,282,1,2552,1,5044,1,5052,1,5054,...,5055,1,5058,1,5060,1,5072,1,5244,1
3,-1,215,1,1402,1,5039,1,5051,1,5054,...,5055,1,5058,1,5063,1,5069,1,5149,1
4,-1,346,1,2423,1,5043,1,5051,1,5054,...,5055,1,5058,1,5063,1,5088,1,5149,1


### Concatenate all subsets

In [40]:
# Stack train, validation and test (in that order). Each split keeps its own row
# labels, so index values repeat in the combined frame.
frappe_splits = [df_frappe_train, df_frappe_validation, df_frappe_test]
df_frappe = pd.concat(frappe_splits)
print(df_frappe.shape)
df_frappe.head()

(288609, 21)


Unnamed: 0,value,user,user_value,item,item_value,daytime,daytime_value,weekday,weekday_value,isweekend,...,homework,homework_value,cost,cost_value,weather,weather_value,country,country_value,city,city_value
0,-1,451,1,4149,1,5041,1,5046,1,5053,...,5055,1,5058,1,5060,1,5069,1,5149,1
1,-1,91,1,3503,1,5041,1,5047,1,5053,...,5056,1,5058,1,5065,1,5095,1,5149,1
2,1,168,1,983,1,5040,1,5050,1,5054,...,5055,1,5058,1,5060,1,5069,1,5207,1
3,-1,620,1,1743,1,5045,1,5051,1,5054,...,5055,1,5058,1,5061,1,5073,1,5149,1
4,-1,46,1,2692,1,5040,1,5049,1,5054,...,5055,1,5058,1,5060,1,5086,1,5211,1


### Check column name detection

In [41]:
# For every field: number of distinct ids and the smallest / largest id seen.
for column in frappe_names:
    frappe_ids = df_frappe[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        frappe_ids.drop_duplicates().count(),
        frappe_ids.min(),
        frappe_ids.max(),
    ))

user: 957 items, numbered from 0 to 956
item: 4082 items, numbered from 957 to 5038
daytime: 7 items, numbered from 5039 to 5045
weekday: 7 items, numbered from 5046 to 5052
isweekend: 2 items, numbered from 5053 to 5054
homework: 3 items, numbered from 5055 to 5057
cost: 2 items, numbered from 5058 to 5059
weather: 9 items, numbered from 5060 to 5068
country: 80 items, numbered from 5069 to 5148
city: 233 items, numbered from 5149 to 5381


### Check column value

In [48]:
# Frequency table of the target column and of every "<field>_value" column.
frappe_value_columns = ["value"]
frappe_value_columns.extend("{0}_value".format(n) for n in frappe_names)
for column in frappe_value_columns:
    print(df_frappe[column].value_counts())

-1    192406
 1     96203
Name: value, dtype: int64
1    288609
Name: user_value, dtype: int64
1    288609
Name: item_value, dtype: int64
1    288609
Name: daytime_value, dtype: int64
1    288609
Name: weekday_value, dtype: int64
1    288609
Name: isweekend_value, dtype: int64
1    288609
Name: homework_value, dtype: int64
1    288609
Name: cost_value, dtype: int64
1    288609
Name: weather_value, dtype: int64
1    288609
Name: country_value, dtype: int64
1    288609
Name: city_value, dtype: int64


### Check user/item feature combination

In [42]:
# Number of distinct user ids over all splits.
frappe_distinct_users = df_frappe["user"].drop_duplicates()
frappe_distinct_users.count()

957

In [43]:
# Distinct combinations of the user-side columns; count() then reports the
# number of non-null entries per column of that de-duplicated frame.
frappe_distinct_user_rows = df_frappe[frappe_user_feature_columns].drop_duplicates()
frappe_distinct_user_rows.count()

user         29770
daytime      29770
weekday      29770
isweekend    29770
homework     29770
weather      29770
country      29770
city         29770
dtype: int64

In [44]:
# Number of distinct item ids over all splits.
frappe_distinct_items = df_frappe["item"].drop_duplicates()
frappe_distinct_items.count()

4082

In [45]:
# Distinct combinations of the item-side columns; count() then reports the
# number of non-null entries per column of that de-duplicated frame.
frappe_distinct_item_rows = df_frappe[frappe_item_feature_columns].drop_duplicates()
frappe_distinct_item_rows.count()

item    7836
cost    7836
dtype: int64

### Check negative sample distribution

In [104]:
# Label frequencies per split, printed in the order train, validation, test.
for frappe_split in (df_frappe_train, df_frappe_validation, df_frappe_test):
    print(frappe_split["value"].value_counts())

-1    134423
 1     67604
Name: value, dtype: int64
-1    38659
 1    19063
Name: value, dtype: int64
-1    19324
 1     9536
Name: value, dtype: int64


## Re-label features

In [46]:
# Fit one label encoder per field on the concatenated data, so that ids from
# every split are known to the encoder.
frappe_labelencoders = {}
for column in frappe_names:
    encoder = sklearn.preprocessing.LabelEncoder()
    encoder.fit(df_frappe[column])
    frappe_labelencoders[column] = encoder
    print("{0}: {1} items".format(column, encoder.classes_.shape))


user: (957,) items
item: (4082,) items
daytime: (7,) items
weekday: (7,) items
isweekend: (2,) items
homework: (3,) items
cost: (2,) items
weather: (9,) items
country: (80,) items
city: (233,) items


In [50]:
# Replace the raw ids of the training split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_frappe_train_columns = ["value"] + frappe_names
relabeled_frappe_train_data = {"value": df_frappe_train["value"]}
for column in frappe_names:
    encoder = frappe_labelencoders[column]
    relabeled_frappe_train_data[column] = encoder.transform(df_frappe_train[column])

relabeled_frappe_train = pd.DataFrame(data=relabeled_frappe_train_data,
                                      columns=relabeled_frappe_train_columns)
print(relabeled_frappe_train.shape)
relabeled_frappe_train.head()

(202027, 11)


Unnamed: 0,value,user,item,daytime,weekday,isweekend,homework,cost,weather,country,city
0,-1,451,3192,2,0,0,0,0,0,0,0
1,-1,91,2546,2,1,0,1,0,5,26,0
2,1,168,26,1,4,1,0,0,0,0,58
3,-1,620,786,6,5,1,0,0,1,4,0
4,-1,46,1735,1,3,1,0,0,0,17,62


In [51]:
# Replace the raw ids of the validation split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_frappe_validation_columns = ["value"] + frappe_names
relabeled_frappe_validation_data = {"value": df_frappe_validation["value"]}
for column in frappe_names:
    encoder = frappe_labelencoders[column]
    relabeled_frappe_validation_data[column] = encoder.transform(df_frappe_validation[column])

relabeled_frappe_validation = pd.DataFrame(data=relabeled_frappe_validation_data,
                                           columns=relabeled_frappe_validation_columns)
print(relabeled_frappe_validation.shape)
relabeled_frappe_validation.head()

(57722, 11)


Unnamed: 0,value,user,item,daytime,weekday,isweekend,homework,cost,weather,country,city
0,1,266,287,3,3,1,0,0,2,5,0
1,-1,138,339,2,6,0,0,0,1,16,52
2,-1,317,737,2,1,0,0,0,1,1,1
3,-1,14,1702,4,6,1,0,0,4,7,56
4,1,123,3111,4,4,1,1,0,0,0,0


In [52]:
# Replace the raw ids of the test split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_frappe_test_columns = ["value"] + frappe_names
relabeled_frappe_test_data = {"value": df_frappe_test["value"]}
for column in frappe_names:
    encoder = frappe_labelencoders[column]
    relabeled_frappe_test_data[column] = encoder.transform(df_frappe_test[column])

relabeled_frappe_test = pd.DataFrame(data=relabeled_frappe_test_data,
                                     columns=relabeled_frappe_test_columns)
print(relabeled_frappe_test.shape)
relabeled_frappe_test.head()

(28860, 11)


Unnamed: 0,value,user,item,daytime,weekday,isweekend,homework,cost,weather,country,city
0,-1,204,3841,2,0,0,0,0,0,4,34
1,1,42,615,3,1,0,0,0,0,1,1
2,1,282,1595,5,6,1,0,0,0,3,95
3,-1,215,445,0,5,1,0,0,3,0,0
4,-1,346,1466,4,5,1,0,0,3,19,0


## Save as DeepCTR-compatible format

In [54]:
# Make sure the output folder exists: to_csv does not create missing directories,
# so on a fresh checkout the write would fail.
os.makedirs(OUT_FRAPPE_DIR, exist_ok=True)
# Write the training split as tab-separated text, leaving out the row index.
relabeled_frappe_train.to_csv(os.path.join(OUT_FRAPPE_DIR, "deepctr2.train.tsv"), sep='\t',
                              index=False)

In [55]:
# Write the validation split as tab-separated text, leaving out the row index.
frappe_validation_tsv = os.path.join(OUT_FRAPPE_DIR, "deepctr2.validation.tsv")
relabeled_frappe_validation.to_csv(frappe_validation_tsv, sep="\t", index=False)

In [56]:
# Write the test split as tab-separated text, leaving out the row index.
frappe_test_tsv = os.path.join(OUT_FRAPPE_DIR, "deepctr2.test.tsv")
relabeled_frappe_test.to_csv(frappe_test_tsv, sep="\t", index=False)

# ml-tag dataset

## Load original dataset and detect column names

In [57]:
# Feature fields of one ml-tag record, in the order used to name the parsed columns.
mltag_names = [
    "user",
    "movie",
    "tag",
]

print(mltag_names)

['user', 'movie', 'tag']


In [58]:
# Expand the field list into "value", then "<field>", "<field>_value" for every field.
mltag_libfm_names = libfm_names(names=mltag_names)
mltag_libfm_names

['value', 'user', 'user_value', 'movie', 'movie_value', 'tag', 'tag_value']

In [60]:
# Read the training split. Splitting on either a space or a colon spreads each
# "id:value" token over two columns; a regex separator needs the python engine.
mltag_train_path = os.path.join(ORIGINAL_MLTAG_DIR, "ml-tag.train.libfm")
df_mltag_train = pd.read_csv(
    mltag_train_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=mltag_libfm_names,
)
print(df_mltag_train.shape)
df_mltag_train.head()

(1404801, 7)


Unnamed: 0,value,user,user_value,movie,movie_value,tag,tag_value
0,-1.0,84982,1,58,1,39525,1
1,-1.0,87756,1,8757,1,22786,1
2,-1.0,80311,1,14912,1,45676,1
3,1.0,82036,1,84509,1,33556,1
4,-1.0,66867,1,57349,1,15092,1


In [61]:
# Read the validation split with the same space/colon tokenisation as the other splits.
mltag_validation_path = os.path.join(ORIGINAL_MLTAG_DIR, "ml-tag.validation.libfm")
df_mltag_validation = pd.read_csv(
    mltag_validation_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=mltag_libfm_names,
)
print(df_mltag_validation.shape)
df_mltag_validation.head()

(401372, 7)


Unnamed: 0,value,user,user_value,movie,movie_value,tag,tag_value
0,-1.0,86810,1,41468,1,26304,1
1,-1.0,59268,1,13793,1,34649,1
2,-1.0,89633,1,72957,1,22253,1
3,-1.0,74868,1,68875,1,18130,1
4,-1.0,8460,1,7726,1,8777,1


In [65]:
# Read the test split with the same space/colon tokenisation as the other splits.
mltag_test_path = os.path.join(ORIGINAL_MLTAG_DIR, "ml-tag.test.libfm")
df_mltag_test = pd.read_csv(
    mltag_test_path,
    sep="[ :]",
    engine="python",
    header=None,
    names=mltag_libfm_names,
)
print(df_mltag_test.shape)
df_mltag_test.head()

(200686, 7)


Unnamed: 0,value,user,user_value,movie,movie_value,tag,tag_value
0,-1.0,51798,1,2473,1,37583,1
1,-1.0,66335,1,61344,1,29842,1
2,-1.0,89085,1,60033,1,47050,1
3,1.0,61293,1,8073,1,3903,1
4,-1.0,81335,1,56575,1,50067,1


### Concatenate all subsets

In [63]:
# Stack train, validation and test (in that order). Each split keeps its own row
# labels, so index values repeat in the combined frame.
mltag_splits = [df_mltag_train, df_mltag_validation, df_mltag_test]
df_mltag = pd.concat(mltag_splits)
print(df_mltag.shape)
df_mltag.head()

(2006859, 7)


Unnamed: 0,value,user,user_value,movie,movie_value,tag,tag_value
0,-1.0,84982,1,58,1,39525,1
1,-1.0,87756,1,8757,1,22786,1
2,-1.0,80311,1,14912,1,45676,1
3,1.0,82036,1,84509,1,33556,1
4,-1.0,66867,1,57349,1,15092,1


### Check column name detection

In [64]:
# For every field: number of distinct ids and the smallest / largest id seen.
for column in mltag_names:
    mltag_ids = df_mltag[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        mltag_ids.drop_duplicates().count(),
        mltag_ids.min(),
        mltag_ids.max(),
    ))

user: 17045 items, numbered from 0 to 90444
movie: 23743 items, numbered from 1 to 90438
tag: 49657 items, numbered from 2 to 69148


### Check column value

In [85]:
# Frequency table of the target column and of every "<field>_value" column.
mltag_value_columns = ["value"]
mltag_value_columns.extend("{0}_value".format(n) for n in mltag_names)
for column in mltag_value_columns:
    print(df_mltag[column].value_counts())

-1.0    1337906
 1.0     668953
Name: value, dtype: int64
1    2006859
Name: user_value, dtype: int64
1    2006859
Name: movie_value, dtype: int64
1    2006859
Name: tag_value, dtype: int64


### Check intersection of column value range

In [89]:
# Keep only the user ids; reset_index() moves the existing row labels into an
# explicit "index" column and renumbers the rows.
mltag_user_part = df_mltag["user"].to_frame().reset_index()

print(mltag_user_part.shape)
mltag_user_part.tail()

(2006859, 2)


Unnamed: 0,index,user
2006854,200681,66066
2006855,200682,82036
2006856,200683,56253
2006857,200684,68854
2006858,200685,88730


In [90]:
# Keep only the movie ids; reset_index() moves the existing row labels into an
# explicit "index" column and renumbers the rows.
mltag_movie_part = df_mltag["movie"].to_frame().reset_index()

print(mltag_movie_part.shape)
mltag_movie_part.tail()

(2006859, 2)


Unnamed: 0,index,movie
2006854,200681,10968
2006855,200682,14359
2006856,200683,56270
2006857,200684,69011
2006858,200685,6975


In [91]:
# Keep only the tag ids; reset_index() moves the existing row labels into an
# explicit "index" column and renumbers the rows.
mltag_tag_part = df_mltag["tag"].to_frame().reset_index()

print(mltag_tag_part.shape)
mltag_tag_part.tail()

(2006859, 2)


Unnamed: 0,index,tag
2006854,200681,6931
2006855,200682,13497
2006856,200683,10653
2006857,200684,10417
2006858,200685,14816


In [92]:
# Check whether any id is used both as a user and as a movie. Both key columns
# contain heavy repetition, so each side is reduced to one row per id first:
# an inner merge on repeated keys yields one row per matching pair and could
# grow multiplicatively if the id ranges did overlap. An empty result still
# means "no shared id".
mltag_intersection_user_movie = pd.merge(mltag_user_part.drop_duplicates(subset="user"),
                                         mltag_movie_part.drop_duplicates(subset="movie"),
                                         how="inner", left_on="user", right_on="movie")

print(mltag_intersection_user_movie.shape)
mltag_intersection_user_movie

(0, 4)


Unnamed: 0,index_x,user,index_y,movie


In [93]:
# Check whether any id is used both as a user and as a tag. Both key columns
# contain heavy repetition, so each side is reduced to one row per id first:
# an inner merge on repeated keys yields one row per matching pair and could
# grow multiplicatively if the id ranges did overlap. An empty result still
# means "no shared id".
mltag_intersection_user_tag = pd.merge(mltag_user_part.drop_duplicates(subset="user"),
                                       mltag_tag_part.drop_duplicates(subset="tag"),
                                       how="inner", left_on="user", right_on="tag")

print(mltag_intersection_user_tag.shape)
mltag_intersection_user_tag

(0, 4)


Unnamed: 0,index_x,user,index_y,tag


In [94]:
# Check whether any id is used both as a movie and as a tag. Both key columns
# contain heavy repetition, so each side is reduced to one row per id first:
# an inner merge on repeated keys yields one row per matching pair and could
# grow multiplicatively if the id ranges did overlap. An empty result still
# means "no shared id".
mltag_intersection_movie_tag = pd.merge(mltag_movie_part.drop_duplicates(subset="movie"),
                                        mltag_tag_part.drop_duplicates(subset="tag"),
                                        how="inner", left_on="movie", right_on="tag")

print(mltag_intersection_movie_tag.shape)
mltag_intersection_movie_tag

(0, 4)


Unnamed: 0,index_x,movie,index_y,tag


### Check negative sample distribution

In [105]:
# Label frequencies per split, printed in the order train, validation, test.
for mltag_split in (df_mltag_train, df_mltag_validation, df_mltag_test):
    print(mltag_split["value"].value_counts())

-1.0    936923
 1.0    467878
Name: value, dtype: int64
-1.0    267147
 1.0    134225
Name: value, dtype: int64
-1.0    133836
 1.0     66850
Name: value, dtype: int64


## Re-label features

In [95]:
# Fit one label encoder per field on the concatenated data, so that ids from
# every split are known to the encoder.
mltag_labelencoders = {}
for column in mltag_names:
    encoder = sklearn.preprocessing.LabelEncoder()
    encoder.fit(df_mltag[column])
    mltag_labelencoders[column] = encoder
    print("{0}: {1} items".format(column, encoder.classes_.shape))


user: (17045,) items
movie: (23743,) items
tag: (49657,) items


In [96]:
# Replace the raw ids of the training split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_mltag_train_columns = ["value"] + mltag_names
relabeled_mltag_train_data = {"value": df_mltag_train["value"]}
for column in mltag_names:
    encoder = mltag_labelencoders[column]
    relabeled_mltag_train_data[column] = encoder.transform(df_mltag_train[column])

relabeled_mltag_train = pd.DataFrame(data=relabeled_mltag_train_data,
                                     columns=relabeled_mltag_train_columns)
print(relabeled_mltag_train.shape)
relabeled_mltag_train.head()

(1404801, 4)


Unnamed: 0,value,user,movie,tag
0,-1.0,12787,9,34560
1,-1.0,14883,1000,19989
2,-1.0,11053,1790,39394
3,1.0,12471,22380,29445
4,-1.0,5152,7284,13109


In [97]:
# Replace the raw ids of the validation split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_mltag_validation_columns = ["value"] + mltag_names
relabeled_mltag_validation_data = {"value": df_mltag_validation["value"]}
for column in mltag_names:
    encoder = mltag_labelencoders[column]
    relabeled_mltag_validation_data[column] = encoder.transform(df_mltag_validation[column])

relabeled_mltag_validation = pd.DataFrame(data=relabeled_mltag_validation_data,
                                          columns=relabeled_mltag_validation_columns)
print(relabeled_mltag_validation.shape)
relabeled_mltag_validation.head()

(401372, 4)


Unnamed: 0,value,user,movie,tag
0,-1.0,14183,4546,22935
1,-1.0,2554,1627,30288
2,-1.0,16451,16492,19519
3,-1.0,7144,12993,15770
4,-1.0,36,883,7738


In [98]:
# Replace the raw ids of the test split by their per-field encoder labels;
# the target column "value" is carried over unchanged.
relabeled_mltag_test_columns = ["value"] + mltag_names
relabeled_mltag_test_data = {"value": df_mltag_test["value"]}
for column in mltag_names:
    encoder = mltag_labelencoders[column]
    relabeled_mltag_test_data[column] = encoder.transform(df_mltag_test[column])

relabeled_mltag_test = pd.DataFrame(data=relabeled_mltag_test_data,
                                    columns=relabeled_mltag_test_columns)
print(relabeled_mltag_test.shape)
relabeled_mltag_test.head()

(200686, 4)


Unnamed: 0,value,user,movie,tag
0,-1.0,1617,448,32856
1,-1.0,4917,8927,26109
2,-1.0,16003,8345,40544
3,1.0,3120,931,3340
4,-1.0,11880,7075,42837


## Save as DeepCTR-compatible format

In [101]:
# Make sure the output folder exists: to_csv does not create missing directories,
# so on a fresh checkout the write would fail.
os.makedirs(OUT_MLTAG_DIR, exist_ok=True)
# Write the training split as tab-separated text, leaving out the row index.
relabeled_mltag_train.to_csv(os.path.join(OUT_MLTAG_DIR, "deepctr2.train.tsv"), sep='\t',
                             index=False)

In [102]:
# Write the validation split as tab-separated text, leaving out the row index.
mltag_validation_tsv = os.path.join(OUT_MLTAG_DIR, "deepctr2.validation.tsv")
relabeled_mltag_validation.to_csv(mltag_validation_tsv, sep="\t", index=False)

In [103]:
# Write the test split as tab-separated text, leaving out the row index.
mltag_test_tsv = os.path.join(OUT_MLTAG_DIR, "deepctr2.test.tsv")
relabeled_mltag_test.to_csv(mltag_test_tsv, sep="\t", index=False)