In [None]:
import pandas as pd
import numpy as np

# Data Reading

In [None]:
# Load the two raw tables: the user event log and the per-user retention table.
events, retention = (
    pd.read_csv('../data/raw/data_events.csv'),
    pd.read_csv('../data/raw/data_retention.csv'),
)

In [None]:
# Preview the first rows of each table and report its row count.
print(events.head())
print(f'Length of the events dataset: {len(events)}')
print(f"\n{'-' * 100}\n")  # visual divider between the two tables
print(retention.head())
print(f'Length of the retention dataset: {len(retention)}')

# Data types

In [None]:
# Column dtypes of both tables, separated by a 100-dash divider.
# sep='\n' reproduces the line breaks of three consecutive print() calls.
print(events.dtypes, '\n' + '-' * 100 + '\n', retention.dtypes, sep='\n')

# Missingness

In [None]:
# Number of missing values per column, first for events and then for retention.
print(events.isna().sum())
print('\n{}\n'.format('-' * 100))  # visual divider between the two reports
print(retention.isna().sum())

# First look

In [None]:
# Cardinality of the key columns.
# The labels previously misspelled "unique" as "unqiue".
# nunique(dropna=False) counts a missing value as its own category, so the
# numbers are the same as len(Series.unique()).
print(f'# of unique users in the events dataset: {events.user_pseudo_id.nunique(dropna=False)}')
print(f'# of unique users in the retention dataset: {retention.user_pseudo_id.nunique(dropna=False)}')
print(f'# of unique events in the events dataset: {events.event_name.nunique(dropna=False)}')

## Initial insights

* We have an events table that records the users' actions and a retention table that holds the users' retention scores.
* In total we have 52 different types of events, 373296 unique users and 500000 user actions.
* Datasets have no missing values.
* Data types are correct.

Therefore, the dataset will be transformed into training data with users as rows, events as features, user actions (`event_count`) as values, and the retention score as a float target variable.


# Data Wrangling

**Step 1**

The `user_event` dataframe is created from the original `events` dataframe. It has one column for each unique `event_name`, one index entry for each `user_pseudo_id`, and values taken from `event_count`. If an `event_name` never appears in the events dataset for a user, the `event_count` for that user-event pair is set to zero. At the end, we expect the `user_event` table to have 373296 rows and 52 columns.

**Step 2**

The `user_event` dataframe is inner joined with the `retention` dataframe, which adds `retentionScore` to the dataset.

**Step 3**

Cheers! We now have a tidy, clean dataset to start our data analysis.

In [None]:
# Reshape the long event log into a wide user-by-event matrix:
# one row per user_pseudo_id, one column per event_name, cell = event_count.
# User/event pairs absent from `events` come out of the pivot as NaN and are
# replaced with 0.
user_event = (
    events
    .pivot(index='user_pseudo_id', columns='event_name', values='event_count')
    .fillna(0)
)
user_event.shape

In [None]:
# Attach each user's event-count features to their retention record.
# how='inner' keeps only users present in both tables, which is the inner join
# described in Step 2. DataFrame.join defaults to how='left', which would keep
# retention-only users with NaN in every event column.
df = retention.set_index('user_pseudo_id').join(user_event, how='inner')
print(df.shape)
print(df.columns.values)

In [None]:
# Persist the training set; the dataframe index is written as the first CSV column.
df.to_csv(path_or_buf='../data/training_sets/dataset_v001.csv', index=True)