In [36]:
import os

import numpy as np
import pandas as pd
import sklearn.preprocessing

## Preparation

Constant definitions

In [34]:
# Root folders: the raw datasets are read from ORIGINAL_DIR, converted files are written under OUT_DIR
ORIGINAL_DIR = "../original_data"
OUT_DIR = "../data"

# One sub-folder per dataset below each root
ORIGINAL_FRAPPE_DIR, ORIGINAL_LASTFM_DIR, ORIGINAL_MEETUP_DIR = (
    os.path.join(ORIGINAL_DIR, dataset_name) for dataset_name in ("frappe", "lastfm", "meetup")
)
OUT_FRAPPE_DIR, OUT_LASTFM_DIR, OUT_MEETUP_DIR = (
    os.path.join(OUT_DIR, dataset_name) for dataset_name in ("frappe", "lastfm", "meetup")
)

In [41]:
def cumulative_begin(labelencoders, *labels):
    """Return the summed class counts of the encoders named in ``labels``.

    ``labelencoders`` maps a field name to a fitted encoder exposing a
    ``classes_`` array; the result is the total number of classes across the
    given fields (0 when no field is given). Callers add it to a field's
    codes as that field's starting offset.
    """
    return sum(labelencoders[field].classes_.shape[0] for field in labels)


In [114]:
def cumulative_count(counts, *labels):
    """Return the sum of ``counts[label]`` over the given ``labels``.

    ``counts`` maps a field name to its number of features; with no labels
    the result is 0.
    """
    return sum(counts[field] for field in labels)


## Frappe dataset

### Load original dataset and detect column names

In [125]:
# User-side (user + context) fields, in file order
frappe_user_feature_columns = [
    "user", "daytime", "weekday", "isweekend",
    "homework", "weather", "country", "city",
]
# Item-side fields
frappe_item_feature_columns = ["item", "cost"]

# Full column list: user-side fields first, then item-side fields
frappe_names = []
frappe_names.extend(frappe_user_feature_columns)
frappe_names.extend(frappe_item_feature_columns)

for frappe_column_group in (frappe_user_feature_columns, frappe_item_feature_columns, frappe_names):
    print(frappe_column_group)

['user', 'daytime', 'weekday', 'isweekend', 'homework', 'weather', 'country', 'city']
['item', 'cost']
['user', 'daytime', 'weekday', 'isweekend', 'homework', 'weather', 'country', 'city', 'item', 'cost']


In [19]:
# The regex separator splits fields on either "," or "-" (regex separators need the python engine)
frappe_train_path = os.path.join(ORIGINAL_FRAPPE_DIR, "train.csv")
df_frappe_train = pd.read_csv(frappe_train_path, names=frappe_names, header=None,
                              engine="python", sep="[,-]")
print(df_frappe_train.shape)
df_frappe_train.head()

(92808, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,187,958,967,972,973,977,993,1071,72,4083
1,195,957,970,972,973,980,1023,1129,49,4082
2,223,959,966,972,973,977,1001,1146,1285,4082
3,468,962,966,972,973,981,1033,1204,16,4082
4,12,961,967,972,973,978,990,1065,24,4082


In [20]:
# Same parsing as the training split: fields are split on either "," or "-"
frappe_test_path = os.path.join(ORIGINAL_FRAPPE_DIR, "test.csv")
df_frappe_test = pd.read_csv(frappe_test_path, names=frappe_names, header=None,
                             engine="python", sep="[,-]")
print(df_frappe_test.shape)
df_frappe_test.head()

(3387, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,43,958,966,972,973,976,985,1168,25,4082
1,213,959,965,971,973,977,1003,1065,1196,4082
2,169,961,964,971,973,977,985,1122,136,4082
3,242,957,967,972,973,977,1003,1153,0,4082
4,366,957,966,972,973,977,1036,1196,3759,4082


In [21]:
# Stack train rows on top of test rows; each split keeps its own row labels
df_frappe = pd.concat((df_frappe_train, df_frappe_test), axis=0)
print(df_frappe.shape)
df_frappe.head()

(96195, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,187,958,967,972,973,977,993,1071,72,4083
1,195,957,970,972,973,980,1023,1129,49,4082
2,223,959,966,972,973,977,1001,1146,1285,4082
3,468,962,966,972,973,981,1033,1204,16,4082
4,12,961,967,972,973,978,990,1065,24,4082


#### Check column name detection

In [23]:
# Sanity check per Frappe field: number of distinct ids and the id range they span.
# Iterates over `frappe_names` (the column list defined above); the previous name
# `detecting_names` is not defined anywhere in this notebook.
for column in frappe_names:
    print("{0}: {1} items, numbered from {2} to {3}".format(column,
                                                            df_frappe[column].nunique(),
                                                            df_frappe[column].min(),
                                                            df_frappe[column].max()))

user: 957 items, numbered from 0 to 956
daytime: 7 items, numbered from 957 to 963
weekday: 7 items, numbered from 964 to 970
isweekend: 2 items, numbered from 971 to 972
homework: 3 items, numbered from 973 to 975
weather: 9 items, numbered from 976 to 984
country: 80 items, numbered from 985 to 1064
city: 233 items, numbered from 1065 to 1297
item: 4082 items, numbered from 0 to 4081
cost: 2 items, numbered from 4082 to 4083


#### Check user/item feature combination

In [126]:
# Number of distinct user-side feature combinations in the whole dataset
frappe_user_contexts = df_frappe[frappe_user_feature_columns].drop_duplicates()
frappe_user_contexts.count()

user         29770
daytime      29770
weekday      29770
isweekend    29770
homework     29770
weather      29770
country      29770
city         29770
dtype: int64

In [127]:
# Number of distinct item-side feature combinations in the whole dataset
frappe_item_contexts = df_frappe[frappe_item_feature_columns].drop_duplicates()
frappe_item_contexts.count()

item    4082
cost    4082
dtype: int64

### Re-label features

In [129]:
# Fit one LabelEncoder per field on train+test together, so both splits share the same codes
frappe_labelencoders = {
    column: sklearn.preprocessing.LabelEncoder().fit(df_frappe[column])
    for column in frappe_names
}
for column in frappe_names:
    fitted_encoder = frappe_labelencoders[column]
    print("{0}: {1} items".format(column, fitted_encoder.classes_.shape))


user: (957,) items
daytime: (7,) items
weekday: (7,) items
isweekend: (2,) items
homework: (3,) items
weather: (9,) items
country: (80,) items
city: (233,) items
item: (4082,) items
cost: (2,) items


In [130]:
# Encode every field of the training split with the encoder fitted for that field
frappe_train_codes = {}
for column in frappe_names:
    frappe_train_codes[column] = frappe_labelencoders[column].transform(df_frappe_train[column])
relabeled_frappe_train = pd.DataFrame(frappe_train_codes, columns=frappe_names)
print(relabeled_frappe_train.shape)
relabeled_frappe_train.head()

(92808, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,187,1,3,1,0,1,8,6,72,1
1,195,0,6,1,0,4,38,64,49,0
2,223,2,2,1,0,1,16,81,1285,0
3,468,5,2,1,0,5,48,139,16,0
4,12,4,3,1,0,2,5,0,24,0


In [131]:
# Encode every field of the test split with the encoder fitted for that field
frappe_test_codes = {}
for column in frappe_names:
    frappe_test_codes[column] = frappe_labelencoders[column].transform(df_frappe_test[column])
relabeled_frappe_test = pd.DataFrame(frappe_test_codes, columns=frappe_names)
print(relabeled_frappe_test.shape)
relabeled_frappe_test.head()

(3387, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,43,1,2,1,0,0,0,103,25,0
1,213,2,1,0,0,1,18,0,1196,0
2,169,4,0,0,0,1,0,57,136,0
3,242,0,3,1,0,1,18,88,0,0
4,366,0,2,1,0,1,51,131,3759,0


### Save as DeepCTR-compatible format

In [59]:
# Write the re-labelled training split as a tab-separated file without the row index.
# The frame is `relabeled_frappe_train` (built in the re-label section); the previous
# spelling `relabled_frappe_train` is not defined in this notebook.
relabeled_frappe_train.to_csv(os.path.join(OUT_FRAPPE_DIR, "deepctr.train.tsv"), sep='\t',
                              index=False)

In [60]:
# Write the re-labelled test split as a tab-separated file without the row index.
# The frame is `relabeled_frappe_test` (built in the re-label section); the previous
# spelling `relabled_frappe_test` is not defined in this notebook.
relabeled_frappe_test.to_csv(os.path.join(OUT_FRAPPE_DIR, "deepctr.test.tsv"), sep='\t',
                             index=False)

### Save as IFM format

### Save as CFM format

In [44]:
# Build the CFM training table from `relabeled_frappe_train` and `frappe_labelencoders`
# (the previously used names `relabled_frappe_train` / `labelencoders` are not defined
# in this notebook).
#
# Each field's codes are shifted by the total number of classes of the fields that
# precede it inside its own group, so ids do not collide within the user-side fields
# nor within the item-side fields. Numbering restarts at 0 for the item-side group
# ("item" gets offset 0, "cost" starts after the last item id).
cfm_frappe_train = pd.DataFrame({
    column: relabeled_frappe_train[column] + cumulative_begin(frappe_labelencoders, *group[:position])
    for group in (frappe_user_feature_columns, frappe_item_feature_columns)
    for position, column in enumerate(group)
}, columns=frappe_names)
print(cfm_frappe_train.shape)
cfm_frappe_train.head()

(92808, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,187,958,967,972,973,977,993,1071,72,4083
1,195,957,970,972,973,980,1023,1129,49,4082
2,223,959,966,972,973,977,1001,1146,1285,4082
3,468,962,966,972,973,981,1033,1204,16,4082
4,12,961,967,972,973,978,990,1065,24,4082


In [47]:
# Tab-separated dump of the CFM training table, row index omitted
frappe_cfm_train_path = os.path.join(OUT_FRAPPE_DIR, "cfm.train.tsv")
cfm_frappe_train.to_csv(frappe_cfm_train_path, index=False, sep='\t')

In [45]:
# Build the CFM test table from `relabeled_frappe_test` and `frappe_labelencoders`
# (the previously used names `relabled_frappe_test` / `labelencoders` are not defined
# in this notebook).
#
# Each field's codes are shifted by the total number of classes of the fields that
# precede it inside its own group, so ids do not collide within the user-side fields
# nor within the item-side fields. Numbering restarts at 0 for the item-side group
# ("item" gets offset 0, "cost" starts after the last item id).
cfm_frappe_test = pd.DataFrame({
    column: relabeled_frappe_test[column] + cumulative_begin(frappe_labelencoders, *group[:position])
    for group in (frappe_user_feature_columns, frappe_item_feature_columns)
    for position, column in enumerate(group)
}, columns=frappe_names)
print(cfm_frappe_test.shape)
cfm_frappe_test.head()

(3387, 10)


Unnamed: 0,user,daytime,weekday,isweekend,homework,weather,country,city,item,cost
0,43,958,966,972,973,976,985,1168,25,4082
1,213,959,965,971,973,977,1003,1065,1196,4082
2,169,961,964,971,973,977,985,1122,136,4082
3,242,957,967,972,973,977,1003,1153,0,4082
4,366,957,966,972,973,977,1036,1196,3759,4082


In [48]:
# Tab-separated dump of the CFM test table, row index omitted
frappe_cfm_test_path = os.path.join(OUT_FRAPPE_DIR, "cfm.test.tsv")
cfm_frappe_test.to_csv(frappe_cfm_test_path, index=False, sep='\t')

### _Test MultiIndex for negative sampling_

In [141]:
# Move the user-side fields into a MultiIndex, then sort by that index
frappe_train_by_user_features = relabeled_frappe_train.set_index(frappe_user_feature_columns)
multiindexed_frappe_train = frappe_train_by_user_features.sort_index()
print(multiindexed_frappe_train.shape)
multiindexed_frappe_train.head()

(92808, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,item,cost
user,daytime,weekday,isweekend,homework,weather,country,city,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,0,0,0,26,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,21,0
0,0,2,1,0,0,0,0,1107,0
0,1,5,1,0,1,0,0,25,0


In [142]:
# All (item, cost) rows recorded for one specific combination of user-side features
frappe_positive_user_key = (187, 1, 3, 1, 0, 1, 8, 6)
frappe_positive_sample_query = multiindexed_frappe_train.loc[frappe_positive_user_key, :]
frappe_positive_sample_query

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,item,cost
user,daytime,weekday,isweekend,homework,weather,country,city,Unnamed: 8_level_1,Unnamed: 9_level_1
187,1,3,1,0,1,8,6,72,1
187,1,3,1,0,1,8,6,572,0
187,1,3,1,0,1,8,6,1604,0
187,1,3,1,0,1,8,6,3,0
187,1,3,1,0,1,8,6,31,0
187,1,3,1,0,1,8,...,...,...
187,1,3,1,0,1,8,6,323,0
187,1,3,1,0,1,8,6,1259,0
187,1,3,1,0,1,8,6,16,0
187,1,3,1,0,1,8,6,1030,0


In [143]:
# A single candidate (item, cost) pair, indexed by the same user-side feature combination
frappe_negative_user_index = pd.MultiIndex.from_tuples(
    [(187, 1, 3, 1, 0, 1, 8, 6)],
    names=frappe_user_feature_columns,
)
frappe_negative_sample_query = pd.DataFrame(
    data=[(32, 0)],
    index=frappe_negative_user_index,
    columns=frappe_item_feature_columns,
)
frappe_negative_sample_query

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,item,cost
user,daytime,weekday,isweekend,homework,weather,country,city,Unnamed: 8_level_1,Unnamed: 9_level_1
187,1,3,1,0,1,8,6,32,0


In [148]:
# True when the candidate (item, cost) pair also appears among the positive rows
frappe_candidate_overlap = frappe_negative_sample_query.merge(
    frappe_positive_sample_query, how="inner", on=frappe_item_feature_columns)
len(frappe_candidate_overlap) > 0

False

## Last.fm dataset

### Load original dataset and detect column names

In [122]:
# User-side and item-side fields of the Last.fm files
lastfm_user_feature_columns = ["user", "last_listened"]
lastfm_item_feature_columns = ["music", "artist"]

# Full column list: feature fields followed by the two trailing columns
lastfm_names = []
lastfm_names.extend(lastfm_user_feature_columns)
lastfm_names.extend(lastfm_item_feature_columns)
lastfm_names.extend(["response", "timestamp"])

for lastfm_column_group in (lastfm_user_feature_columns, lastfm_item_feature_columns, lastfm_names):
    print(lastfm_column_group)

['user', 'last_listened']
['music', 'artist']
['user', 'last_listened', 'music', 'artist', 'response', 'timestamp']


In [27]:
# The regex separator splits fields on either "-" or "," (regex separators need the python engine)
lastfm_train_path = os.path.join(ORIGINAL_LASTFM_DIR, "train.csv")
df_lastfm_train = pd.read_csv(lastfm_train_path, names=lastfm_names, header=None,
                              engine="python", sep="[-,]")
print(df_lastfm_train.shape)
df_lastfm_train.head()

(202309, 6)


Unnamed: 0,user,last_listened,music,artist,response,timestamp
0,1,1000,2,16057,1,1241361476000
1,1,1000,3,16058,1,1241360093000
2,1,1000,4,16059,1,1241359818000
3,1,1000,5,16060,1,1241359471000
4,1,1000,6,16061,1,1241358985000


In [28]:
# Same parsing as the training split: fields are split on either "-" or ","
lastfm_test_path = os.path.join(ORIGINAL_LASTFM_DIR, "test.csv")
df_lastfm_test = pd.read_csv(lastfm_test_path, names=lastfm_names, header=None,
                             engine="python", sep="[-,]")
print(df_lastfm_test.shape)
df_lastfm_test.head()

(12265, 6)


Unnamed: 0,user,last_listened,music,artist,response,timestamp
0,1,1000,1,16057,1,1241362105000
1,2,1009,10,16063,1,1240940495000
2,2,1011,10,16063,1,1240940495000
3,9,1018,19,16066,1,1241225019000
4,9,1020,19,16066,1,1241225019000


In [32]:
# Stack train and test rows, keeping every column except the last one ("timestamp")
df_lastfm = pd.concat([split[lastfm_names[:-1]] for split in (df_lastfm_train, df_lastfm_test)])
print(df_lastfm.shape)
df_lastfm.head()

(214574, 5)


Unnamed: 0,user,last_listened,music,artist,response
0,1,1000,2,16057,1
1,1,1000,3,16058,1
2,1,1000,4,16059,1
3,1,1000,5,16060,1
4,1,1000,6,16061,1


#### Check column name detection

In [33]:
# Per field: number of distinct ids and the id range they span
for column in lastfm_names[:-1]:
    lastfm_field_values = df_lastfm[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        lastfm_field_values.drop_duplicates().count(),
        lastfm_field_values.min(),
        lastfm_field_values.max(),
    ))

user: 414 items, numbered from 1 to 999
last_listened: 11533 items, numbered from 1000 to 17056
music: 14387 items, numbered from 1 to 16055
artist: 4244 items, numbered from 16057 to 20300
response: 1 items, numbered from 1 to 1


#### Check user/item feature combinations

In [123]:
# Number of distinct user-side feature combinations in the whole dataset
lastfm_user_contexts = df_lastfm[lastfm_user_feature_columns].drop_duplicates()
lastfm_user_contexts.count()

user             12265
last_listened    12265
dtype: int64

In [124]:
# Number of distinct item-side feature combinations in the whole dataset
lastfm_item_contexts = df_lastfm[lastfm_item_feature_columns].drop_duplicates()
lastfm_item_contexts.count()

music     14387
artist    14387
dtype: int64

### Re-label features

In [52]:
# Only the feature fields are encoded: the last two columns are dropped
lastfm_valid_columns = lastfm_names[:-2]

# Fit one LabelEncoder per field on train+test together, so both splits share the same codes
lastfm_labelencoders = {
    column: sklearn.preprocessing.LabelEncoder().fit(df_lastfm[column])
    for column in lastfm_valid_columns
}
for column in lastfm_valid_columns:
    fitted_encoder = lastfm_labelencoders[column]
    print("{0}: {1} items".format(column, fitted_encoder.classes_.shape))


user: (414,) items
last_listened: (11533,) items
music: (14387,) items
artist: (4244,) items


In [53]:
# Encode every feature field of the training split with the encoder fitted for that field
lastfm_train_codes = {}
for column in lastfm_valid_columns:
    lastfm_train_codes[column] = lastfm_labelencoders[column].transform(df_lastfm_train[column])
relabled_lastfm_train = pd.DataFrame(lastfm_train_codes, columns=lastfm_valid_columns)
print(relabled_lastfm_train.shape)
relabled_lastfm_train.head()

(202309, 4)


Unnamed: 0,user,last_listened,music,artist
0,0,0,1,0
1,0,0,2,1
2,0,0,3,2
3,0,0,4,3
4,0,0,5,4


In [54]:
# Encode every feature field of the test split with the encoder fitted for that field
lastfm_test_codes = {}
for column in lastfm_valid_columns:
    lastfm_test_codes[column] = lastfm_labelencoders[column].transform(df_lastfm_test[column])
relabled_lastfm_test = pd.DataFrame(lastfm_test_codes, columns=lastfm_valid_columns)
print(relabled_lastfm_test.shape)
relabled_lastfm_test.head()

(12265, 4)


Unnamed: 0,user,last_listened,music,artist
0,0,0,0,0
1,1,1,8,6
2,1,3,8,6
3,2,5,16,9
4,2,6,16,9


### Save as DeepCTR-compatible format

In [61]:
# Tab-separated dump of the re-labelled training split, row index omitted
lastfm_deepctr_train_path = os.path.join(OUT_LASTFM_DIR, "deepctr.train.tsv")
relabled_lastfm_train.to_csv(lastfm_deepctr_train_path, index=False, sep='\t')

In [62]:
# Tab-separated dump of the re-labelled test split, row index omitted
lastfm_deepctr_test_path = os.path.join(OUT_LASTFM_DIR, "deepctr.test.tsv")
relabled_lastfm_test.to_csv(lastfm_deepctr_test_path, index=False, sep='\t')

### Save as IFM format

### Save as CFM format

In [55]:
# Starting offsets: "last_listened" ids follow the user ids, "artist" ids follow the music ids
lastfm_last_listened_begin = cumulative_begin(lastfm_labelencoders, "user")
lastfm_artist_begin = cumulative_begin(lastfm_labelencoders, "music")

cfm_lastfm_train = pd.DataFrame({
    # user-side fields
    "user": relabled_lastfm_train["user"],
    "last_listened": relabled_lastfm_train["last_listened"] + lastfm_last_listened_begin,
    # item-side fields (numbering restarts at 0)
    "music": relabled_lastfm_train["music"],
    "artist": relabled_lastfm_train["artist"] + lastfm_artist_begin,
}, columns=lastfm_valid_columns)
print(cfm_lastfm_train.shape)
cfm_lastfm_train.head()

(202309, 4)


Unnamed: 0,user,last_listened,music,artist
0,0,414,1,14387
1,0,414,2,14388
2,0,414,3,14389
3,0,414,4,14390
4,0,414,5,14391


In [56]:
# Tab-separated dump of the CFM training table, row index omitted
lastfm_cfm_train_path = os.path.join(OUT_LASTFM_DIR, "cfm.train.tsv")
cfm_lastfm_train.to_csv(lastfm_cfm_train_path, index=False, sep='\t')

In [57]:
# Starting offsets: "last_listened" ids follow the user ids, "artist" ids follow the music ids
lastfm_last_listened_begin = cumulative_begin(lastfm_labelencoders, "user")
lastfm_artist_begin = cumulative_begin(lastfm_labelencoders, "music")

cfm_lastfm_test = pd.DataFrame({
    # user-side fields
    "user": relabled_lastfm_test["user"],
    "last_listened": relabled_lastfm_test["last_listened"] + lastfm_last_listened_begin,
    # item-side fields (numbering restarts at 0)
    "music": relabled_lastfm_test["music"],
    "artist": relabled_lastfm_test["artist"] + lastfm_artist_begin,
}, columns=lastfm_valid_columns)
print(cfm_lastfm_test.shape)
cfm_lastfm_test.head()

(12265, 4)


Unnamed: 0,user,last_listened,music,artist
0,0,414,0,14387
1,1,415,8,14393
2,1,417,8,14393
3,2,419,16,14396
4,2,420,16,14396


In [58]:
# Tab-separated dump of the CFM test table, row index omitted
lastfm_cfm_test_path = os.path.join(OUT_LASTFM_DIR, "cfm.test.tsv")
cfm_lastfm_test.to_csv(lastfm_cfm_test_path, index=False, sep='\t')

## Meetup (chicago) dataset

### Load original dataset and detect values

In [86]:
# Headerless TSV with four columns; only "user" and "event" are kept
meetup_rsvp_train_path = os.path.join(ORIGINAL_MEETUP_DIR, "user-event-rsvp_train.tsv")
df_meetup_rsvp_train = pd.read_csv(
    meetup_rsvp_train_path,
    sep='\t',
    header=None,
    names=["_relation_id", "user", "event", "value"],
    usecols=["user", "event"],
)
print(df_meetup_rsvp_train.shape)
df_meetup_rsvp_train.head()

(113921, 2)


Unnamed: 0,user,event
0,1470,14
1,1845,14
2,2371,14
3,2425,14
4,2948,14


In [87]:
# Headerless TSV read with two column names: "user" and "event"
meetup_rsvp_test_path = os.path.join(ORIGINAL_MEETUP_DIR, "user-event-rsvp_test.tsv")
df_meetup_rsvp_test = pd.read_csv(
    meetup_rsvp_test_path,
    sep='\t',
    header=None,
    names=["user", "event"],
)
print(df_meetup_rsvp_test.shape)
df_meetup_rsvp_test.head()

(28438, 2)


Unnamed: 0,user,event
0,693,287
1,10722,78
2,539,166
3,2166,166
4,4031,166


In [78]:
# Headerless TSV with four columns; only "organizer" and "event" are kept
meetup_organizing_path = os.path.join(ORIGINAL_MEETUP_DIR, "group-event-created_train.tsv")
df_meetup_organizing = pd.read_csv(
    meetup_organizing_path,
    sep='\t',
    header=None,
    names=["_relation_id", "organizer", "event", "value"],
    usecols=["organizer", "event"],
)
print(df_meetup_organizing.shape)
df_meetup_organizing.head()

(41445, 2)


Unnamed: 0,organizer,event
0,448,1
1,692,2
2,692,3
3,692,4
4,692,5


In [80]:
# Headerless TSV with four columns; only "event" and "location" are kept
meetup_location_path = os.path.join(ORIGINAL_MEETUP_DIR, "event-location-held_train.tsv")
df_meetup_location = pd.read_csv(
    meetup_location_path,
    sep='\t',
    header=None,
    names=["_relation_id", "event", "location", "value"],
    usecols=["event", "location"],
)
print(df_meetup_location.shape)
df_meetup_location.head()

(41445, 2)


Unnamed: 0,event,location
0,1,1667
1,2,2466
2,3,2466
3,4,2466
4,5,2466


In [97]:
# Headerless TSV with four columns; only "event" and "timeslot" are kept,
# and "timeslot" is cast to 32-bit integers after loading
meetup_timeslot_path = os.path.join(ORIGINAL_MEETUP_DIR, "event-time-started_train.tsv")
df_meetup_timeslot = pd.read_csv(
    meetup_timeslot_path,
    sep='\t',
    header=None,
    names=["_relation_id", "event", "timeslot", "value"],
    usecols=["event", "timeslot"],
).astype({"timeslot": np.int32})
print(df_meetup_timeslot.shape)
df_meetup_timeslot.head()

(41445, 2)


Unnamed: 0,event,timeslot
0,1,9
1,2,129
2,3,81
3,4,129
4,5,81


#### Merge auxiliary information into train/test dataset

In [98]:
# Attach organizer, location and timeslot to every training RSVP via inner joins on "event"
df_meetup_train = (
    df_meetup_rsvp_train
    .merge(df_meetup_organizing, how="inner", on="event")
    .merge(df_meetup_location, how="inner", on="event")
    .merge(df_meetup_timeslot, how="inner", on="event")
)
print(df_meetup_train.shape)
df_meetup_train.head()

(113921, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,1470,14,332,2650,8
1,1845,14,332,2650,8
2,2371,14,332,2650,8
3,2425,14,332,2650,8
4,2948,14,332,2650,8


In [99]:
# Attach organizer, location and timeslot to every test RSVP via inner joins on "event"
df_meetup_test = (
    df_meetup_rsvp_test
    .merge(df_meetup_organizing, how="inner", on="event")
    .merge(df_meetup_location, how="inner", on="event")
    .merge(df_meetup_timeslot, how="inner", on="event")
)
print(df_meetup_test.shape)
df_meetup_test.head()

(28438, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,693,287,270,1411,28
1,1511,287,270,1411,28
2,1640,287,270,1411,28
3,3582,287,270,1411,28
4,10722,78,495,6817,5


#### Merge whole dataset

In [100]:
# Stack train rows on top of test rows; each split keeps its own row labels
df_meetup = pd.concat((df_meetup_train, df_meetup_test), axis=0)
print(df_meetup.shape)
df_meetup.head()

(142359, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,1470,14,332,2650,8
1,1845,14,332,2650,8
2,2371,14,332,2650,8
3,2425,14,332,2650,8
4,2948,14,332,2650,8


#### Check feature count of each field

In [101]:
# Per field: number of distinct ids and the id range they span
for column in df_meetup.columns:
    meetup_field_values = df_meetup[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        meetup_field_values.drop_duplicates().count(),
        meetup_field_values.min(),
        meetup_field_values.max(),
    ))

user: 11233 items, numbered from 1 to 11233
event: 41445 items, numbered from 1 to 41445
organizer: 1683 items, numbered from 1 to 1683
location: 7015 items, numbered from 1 to 7015
timeslot: 147 items, numbered from 1 to 168


### Re-label features

In [110]:
# Use the largest id of each field as that field's feature count
meetup_feature_counts = {}
for column in df_meetup.columns:
    meetup_feature_counts[column] = df_meetup[column].max()
meetup_feature_counts

{'user': 11233,
 'event': 41445,
 'organizer': 1683,
 'location': 7015,
 'timeslot': 168}

In [105]:
# Shift every id down by one so that numbering starts at 0
meetup_train_codes = {}
for column in df_meetup_train.columns:
    meetup_train_codes[column] = df_meetup_train[column] - 1
relabeled_meetup_train = pd.DataFrame(meetup_train_codes, columns=df_meetup_train.columns.tolist())
print(relabeled_meetup_train.shape)
relabeled_meetup_train.head()

(113921, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,1469,13,331,2649,7
1,1844,13,331,2649,7
2,2370,13,331,2649,7
3,2424,13,331,2649,7
4,2947,13,331,2649,7


In [107]:
# Shift every id down by one so that numbering starts at 0
meetup_test_codes = {}
for column in df_meetup_test.columns:
    meetup_test_codes[column] = df_meetup_test[column] - 1
relabeled_meetup_test = pd.DataFrame(meetup_test_codes, columns=df_meetup_test.columns.tolist())
print(relabeled_meetup_test.shape)
relabeled_meetup_test.head()

(28438, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,692,286,269,1410,27
1,1510,286,269,1410,27
2,1639,286,269,1410,27
3,3581,286,269,1410,27
4,10721,77,494,6816,4


### Save as DeepCTR-compatible format

In [106]:
# Tab-separated dump of the re-labelled training split, row index omitted
meetup_deepctr_train_path = os.path.join(OUT_MEETUP_DIR, "deepctr.train.tsv")
relabeled_meetup_train.to_csv(meetup_deepctr_train_path, index=False, sep='\t')

In [108]:
# Tab-separated dump of the re-labelled test split, row index omitted
meetup_deepctr_test_path = os.path.join(OUT_MEETUP_DIR, "deepctr.test.tsv")
relabeled_meetup_test.to_csv(meetup_deepctr_test_path, index=False, sep='\t')

### Save as IFM format

### Save as CFM format

In [116]:
# Starting offsets of the item-side fields: each one begins after the feature counts
# of the item-side fields listed before it ("event" itself starts at 0)
meetup_organizer_begin = cumulative_count(meetup_feature_counts, "event")
meetup_location_begin = cumulative_count(meetup_feature_counts, "event", "organizer")
meetup_timeslot_begin = cumulative_count(meetup_feature_counts, "event", "organizer", "location")

cfm_meetup_train = pd.DataFrame({
    # user-side field
    "user": relabeled_meetup_train["user"],
    # item-side fields
    "event": relabeled_meetup_train["event"],
    "organizer": relabeled_meetup_train["organizer"] + meetup_organizer_begin,
    "location": relabeled_meetup_train["location"] + meetup_location_begin,
    "timeslot": relabeled_meetup_train["timeslot"] + meetup_timeslot_begin,
}, columns=df_meetup.columns)
print(cfm_meetup_train.shape)
cfm_meetup_train.head()

(113921, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,1469,13,41776,45777,50150
1,1844,13,41776,45777,50150
2,2370,13,41776,45777,50150
3,2424,13,41776,45777,50150
4,2947,13,41776,45777,50150


In [117]:
# Per field of the CFM training table: number of distinct ids and the id range they span
for column in cfm_meetup_train.columns:
    cfm_train_field_values = cfm_meetup_train[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        cfm_train_field_values.drop_duplicates().count(),
        cfm_train_field_values.min(),
        cfm_train_field_values.max(),
    ))

user: 11233 items, numbered from 0 to 11232
event: 34907 items, numbered from 0 to 41441
organizer: 1560 items, numbered from 41445 to 43080
location: 6046 items, numbered from 43128 to 49866
timeslot: 146 items, numbered from 50143 to 50310


In [118]:
# Tab-separated dump of the CFM training table, row index omitted
meetup_cfm_train_path = os.path.join(OUT_MEETUP_DIR, "cfm.train.tsv")
cfm_meetup_train.to_csv(meetup_cfm_train_path, index=False, sep='\t')

In [119]:
# Starting offsets of the item-side fields: each one begins after the feature counts
# of the item-side fields listed before it ("event" itself starts at 0)
meetup_organizer_begin = cumulative_count(meetup_feature_counts, "event")
meetup_location_begin = cumulative_count(meetup_feature_counts, "event", "organizer")
meetup_timeslot_begin = cumulative_count(meetup_feature_counts, "event", "organizer", "location")

cfm_meetup_test = pd.DataFrame({
    # user-side field
    "user": relabeled_meetup_test["user"],
    # item-side fields
    "event": relabeled_meetup_test["event"],
    "organizer": relabeled_meetup_test["organizer"] + meetup_organizer_begin,
    "location": relabeled_meetup_test["location"] + meetup_location_begin,
    "timeslot": relabeled_meetup_test["timeslot"] + meetup_timeslot_begin,
}, columns=df_meetup.columns)
print(cfm_meetup_test.shape)
cfm_meetup_test.head()

(28438, 5)


Unnamed: 0,user,event,organizer,location,timeslot
0,692,286,41714,44538,50170
1,1510,286,41714,44538,50170
2,1639,286,41714,44538,50170
3,3581,286,41714,44538,50170
4,10721,77,41939,49944,50147


In [120]:
# Per field of the CFM test table: number of distinct ids and the id range they span
for column in cfm_meetup_test.columns:
    cfm_test_field_values = cfm_meetup_test[column]
    print("{0}: {1} items, numbered from {2} to {3}".format(
        column,
        cfm_test_field_values.drop_duplicates().count(),
        cfm_test_field_values.min(),
        cfm_test_field_values.max(),
    ))

user: 11233 items, numbered from 0 to 11232
event: 6538 items, numbered from 77 to 41444
organizer: 1232 items, numbered from 41445 to 43127
location: 2359 items, numbered from 43129 to 50142
timeslot: 139 items, numbered from 50143 to 50310


In [121]:
# Tab-separated dump of the CFM test table, row index omitted
meetup_cfm_test_path = os.path.join(OUT_MEETUP_DIR, "cfm.test.tsv")
cfm_meetup_test.to_csv(meetup_cfm_test_path, index=False, sep='\t')