In [1]:
from functools import partial
import json
import os

import numpy as np
import pandas as pd

%matplotlib inline

# Data Transformation

Here, we transform the raw data into the observations required by the model. 

## Input Data

We expect 2 data files to be available, `recommendations.csv` and `item_metadata_w_tags.csv`.
Examples of these files can be found in the `data` folder.

* `recommendations.csv`: 
    * timestamp (float) - When the recommendation was made, in seconds since the Unix epoch
    * user (int) - User identifier
    * item (int) - Item identifier

* `item_metadata_w_tags.csv`:
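    * item (int) - Item identifier (join key with `recommendations.csv`)
    * text (str)
    * tag (str/int) - Tag of the item; the number of distinct tags per user and week becomes `variety`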

## Output Data

* `samples_for_model.csv` - Observations 
    * user (int) - User identifier
    * week_index (int) - Zero-based index of the week within the study period (0 is the earliest week in the data)
    * count (int) - Number of distinct items recommended to the user in this week
    * weeks_since_signup (int) - Number of weeks between the week of the user's first recommendation (treated as their signup) and this week
    * variety (int) - Number of distinct tags (e.g. topic or political tags) among the items recommended to the user in this week

In [2]:
# Folder the input CSVs are read from (the cells below also write their CSVs
# there), followed by the name of the result folder.
base_folder, result_folder = "Result", "Result"

In [4]:
# Load the two input tables from base_folder.
recommendations_path = f"{base_folder}/recommendations.csv"
item_metadata_path = f"{base_folder}/item_metadata_w_tags.csv"

recommendations_df = pd.read_csv(recommendations_path)
item_metadata_df = pd.read_csv(item_metadata_path)

In [None]:
# NOTE(review): a bare `print` (no call) only evaluates to the built-in function,
# so this cell shows its repr and no data — looks like a leftover; confirm and remove.
print

In [None]:
# Interpret the raw `timestamp` values as seconds since the epoch and store
# the resulting pandas datetimes in a new `datetime` column.
epoch_seconds = recommendations_df["timestamp"]
recommendations_df["datetime"] = pd.to_datetime(epoch_seconds, unit="s")

In [None]:
# Per-user information: the signup date is the calendar day of the user's
# earliest recommendation, stored as a datetime column next to `user`.
first_seen = recommendations_df.groupby("user")["datetime"].min()
signup_day = first_seen.rename("signup_date").dt.date
user_df = pd.to_datetime(signup_day).reset_index()

In [None]:
# Preview the first rows of the per-user table.
user_df.head(n=5)

In [None]:
# Persist the per-user table next to the input data.
# NOTE(review): index=True also writes the positional row index as an extra
# unnamed column, unlike the other CSV writes in this notebook — confirm intended.
user_info_path = f'{base_folder}/user_information.csv'
user_df.to_csv(user_info_path, header=True, index=True)

# Transform Dataset

In [None]:
# Attach each user's signup information to every recommendation row.
# validate="many_to_one" makes pandas raise if `user` is not unique in user_df.
augmented_reco_df = recommendations_df.merge(
    user_df,
    on="user",
    how="left",
    validate="many_to_one",
)

In [None]:
# Add a `date` column: the recommendation datetime reduced to its calendar day
# and converted back to a datetime so the `.dt` accessor keeps working on it.
calendar_day = augmented_reco_df["datetime"].dt.date
augmented_reco_df["date"] = pd.to_datetime(calendar_day)


In [None]:
# Preview the recommendations with user information and date attached.
augmented_reco_df.head(n=5)

In [None]:
# Move `date` and `signup_date` back to the first day (weekday 0) of their week,
# so that weeks_since_signup can later be computed by plain subtraction.
one_day = np.timedelta64(1, 'D')
for week_column in ("date", "signup_date"):
    days_into_week = augmented_reco_df[week_column].dt.weekday * one_day
    augmented_reco_df[week_column] = augmented_reco_df[week_column] - days_into_week

In [None]:
# Preview the frame after the date columns were moved to the start of their week.
augmented_reco_df.head(n=5)

In [None]:
# Assign index to unique weeks in the dataset

# First and last week present in the data (the `date` column holds week-start days).
min_date = augmented_reco_df["date"].min()
max_date = augmented_reco_df["date"].max()

min_iso = min_date.isocalendar()
max_iso = max_date.isocalendar()
min_year, min_week = min_iso.year, min_iso.week
max_year, max_week = max_iso.year, max_iso.week

# Count the weeks from the actual date span. Multiplying the year difference by
# 52 is wrong whenever an ISO year with 53 weeks lies inside the period.
n_weeks = (max_date - min_date).days // 7 + 1

# Map ISO week number -> zero-based week index by walking the real calendar
# weeks, so that week 53 gets an entry when it occurs in the period.
# ISO week numbers repeat every year: for a period longer than one year a later
# week overwrites an earlier one with the same number (as before).
week_index_map = {}
for week_offset in range(n_weeks):
    week_start = min_date + pd.Timedelta(weeks=week_offset)
    week_index_map[week_start.isocalendar().week] = week_offset

# Show the derived period boundaries as the cell output.
min_year, min_week, max_year, max_week, n_weeks


In [None]:
def assign_week_index(x):
    """Look up ``x`` in ``week_index_map`` and return the mapped week index.

    Best-effort lookup: when ``x`` is not a key of the map, the value is
    printed and ``None`` is returned instead of raising.
    """
    try:
        return week_index_map[x]
    except KeyError:
        # Only a missing key is expected here; any other error now propagates
        # instead of being silently swallowed by a bare `except`.
        print(x)
        return None

# Week index = whole weeks elapsed since the earliest week in the data.
# A lookup keyed by ISO week number leaves week 53 unmapped and gives the same
# index to equal week numbers of different years; date arithmetic avoids both
# and yields the same values wherever the lookup was correct.
first_week_start = augmented_reco_df["date"].min()
augmented_reco_df["week_index"] = (augmented_reco_df["date"] - first_week_start).dt.days // 7

In [None]:
# Weeks elapsed between the user's `signup_date` and the row's `date`,
# obtained from the day difference and truncated to an integer.
elapsed_days = (augmented_reco_df["date"] - augmented_reco_df["signup_date"]).dt.days
augmented_reco_df["weeks_since_signup"] = (elapsed_days / 7).astype(int)

In [None]:
# Histogram of the ISO week numbers of all recommendations (default binning).
iso_week_numbers = augmented_reco_df["date"].dt.isocalendar().week
iso_week_numbers.hist()

In [None]:
# Distribution of recommendations over the number of weeks since signup.
weeks_since_signup = augmented_reco_df["weeks_since_signup"]
weeks_since_signup.plot(kind="hist")

In [None]:
# Distribution of recommendations over the week indices (default binning).
week_indices = augmented_reco_df["week_index"]
week_indices.plot(kind="hist")

In [None]:
# Preview the frame with week_index and weeks_since_signup added.
augmented_reco_df.head(n=5)

In [None]:
# Save the augmented recommendations (without the row index) so later runs
# can reuse this intermediate result.
augmented_reco_path = f"{base_folder}/augmented_reco_df.csv"
augmented_reco_df.to_csv(augmented_reco_path, index=False)

In [None]:
# Attach the item metadata (text, tag) to every recommendation row.
# NOTE(review): this is an inner join, so recommendations whose item has no
# metadata row are dropped here; re-running the cell merges the metadata a
# second time — confirm both are acceptable.
augmented_reco_df = augmented_reco_df.merge(item_metadata_df, on="item", how="inner")

In [None]:
# Display the merged frame (bare last expression, rendered by the notebook).
augmented_reco_df

# Group Into Observations

In [None]:
# One observation per (user, week_index, weeks_since_signup):
#   count   - number of distinct items in the group
#   variety - number of distinct tags in the group
recommendations_grouped_week = (
    augmented_reco_df
    .groupby(["user", "week_index", "weeks_since_signup"])
    .agg(
        count=("item", lambda items: len(set(items))),
        variety=("tag", lambda tags: len(set(tags))),
    )
    .reset_index()
)

In [None]:
# Write the model observations as CSV with a header row and no index column.
# NOTE(review): the file goes to base_folder while result_folder is never used
# in the visible cells — confirm which folder is intended for the final output.
samples_path = f"{base_folder}/samples_for_model.csv"
recommendations_grouped_week.to_csv(samples_path, header=True, index=False)
