In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the Revolut CSV exports on the laptop setup.
# Change DATA_DIR (and only DATA_DIR) when running the notebook elsewhere.
DATA_DIR = '/home/jovyan/work/data/revolut'

filename1 = f'{DATA_DIR}/rev-devices.csv'
filename2 = f'{DATA_DIR}/rev-notifications.csv'
filename3 = f'{DATA_DIR}/rev-transactions.csv'
filename4 = f'{DATA_DIR}/rev-users.csv'

# Load each export into its own raw frame.
rev_devices = pd.read_csv(filename1)
rev_notifications = pd.read_csv(filename2)
rev_transactions = pd.read_csv(filename3)
rev_users = pd.read_csv(filename4)


In [41]:
# Take independent working copies so the frames loaded from disk stay
# untouched while the cells below clean and transform the data.
df_rev_devices, df_rev_notifications, df_rev_transactions, df_rev_users = (
    raw_frame.copy()
    for raw_frame in (rev_devices, rev_notifications, rev_transactions, rev_users)
)

# DATA PREPROCESSING

In [4]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line overview (number of columns in ``df`` and number of
    columns with at least one missing value) and returns the details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Missing Values'`` (null
        count) and ``'% of Total Values'`` (share of rows that are null,
        rounded to one decimal). Only columns with at least one missing
        value are included, sorted by descending percentage.
    """
    n_rows = len(df)

    # Count nulls once and reuse the result for the percentage.
    mis_val = df.isnull().sum()

    # An empty frame has nothing missing. Guard the division so 0/0 -> NaN
    # cannot make every column look like it contains missing values.
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    mis_val_table = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Filter on the raw count (not the derived percentage), then order the
    # remaining columns from most to least affected.
    mis_val_table = (
        mis_val_table[mis_val_table['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table.shape[0]} columns that have missing values."
    )
    return mis_val_table

In [5]:
def unqiue_values_table(df):
    """Summarise how many distinct values each column of ``df`` holds.

    Prints the shape of ``df`` and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Unique Values'``
        (``nunique`` count, which excludes nulls) and ``'% of Total Values'``
        (count relative to the number of rows, rounded to two decimals).
        Columns without any distinct value are left out; the rest are sorted
        by descending unique count.
    """
    n_rows = len(df)

    # nunique() can be expensive on large frames; compute it only once.
    unq_val = df.nunique()

    # Guard the division so an empty frame does not produce 0/0 -> NaN,
    # which would slip past the "!= 0" filter below.
    if n_rows:
        unq_val_percent = 100 * unq_val / n_rows
    else:
        unq_val_percent = unq_val.astype(float)

    unq_val_table = pd.DataFrame({
        'Unique Values': unq_val,
        '% of Total Values': unq_val_percent,
    })

    # Filter on the raw count (not the derived percentage) and order the
    # columns from highest to lowest cardinality.
    unq_val_table = (
        unq_val_table[unq_val_table['Unique Values'] != 0]
        .sort_values('Unique Values', ascending=False)
        .round(2)
    )

    print(
        f"Your selected dataframe has {df.shape[0]} rows "
        f"and {df.shape[1]} columns."
    )
    return unq_val_table


# Correctly spelled alias; the original (misspelled) name is kept because
# existing cells call it.
unique_values_table = unqiue_values_table

In [6]:
def date_conversion(df):
    """Truncate the ``'created_date'`` column of ``df`` to calendar dates.

    The column is parsed with ``pd.to_datetime`` and replaced, in place, by
    the corresponding ``datetime.date`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'created_date'`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    parsed_timestamps = pd.to_datetime(df['created_date'])
    df['created_date'] = parsed_timestamps.dt.date
    return df

### USERS

In [42]:
# Report which user columns contain missing values, and how many.
missing_values_table(df_rev_users)

Your selected dataframe has 12 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
attributes_notifications_marketing_push,6610,34.0
attributes_notifications_marketing_email,6610,34.0


In [43]:
# Report the number of distinct values per user column (cardinality check).
unqiue_values_table(df_rev_users)

Your selected dataframe has 19430 rows and 12 columns.


Unnamed: 0,Unique Values,% of Total Values
user_id,19430,100.0
created_date,19430,100.0
city,6049,31.13
num_contacts,302,1.55
birth_year,69,0.36
country,41,0.21
plan,6,0.03
user_settings_crypto_unlocked,2,0.01
attributes_notifications_marketing_push,2,0.01
attributes_notifications_marketing_email,2,0.01


In [44]:
# The marketing opt-in flags are missing for some users; keep "unknown" as
# its own category via the "-1" sentinel. The filled column is assigned back
# because `df[col].fillna(..., inplace=True)` is chained assignment and does
# not update the frame under pandas Copy-on-Write.
# NOTE(review): filling these flag columns with the *string* "-1" leaves mixed
# str/float values in them -- consider a numeric sentinel if downstream code allows.
for flag_col in (
    "attributes_notifications_marketing_push",
    "attributes_notifications_marketing_email",
):
    df_rev_users[flag_col] = df_rev_users[flag_col].fillna("-1")

# Keep only the calendar date of the sign-up timestamp.
df_rev_users['created_date'] = pd.to_datetime(df_rev_users['created_date']).dt.date

# Drop the referral counters. errors='ignore' keeps the cell re-runnable once
# the columns are already gone.
df_rev_users = df_rev_users.drop(
    columns=['num_referrals', 'num_successful_referrals'],
    errors='ignore',
)

In [45]:
# Display the cleaned users frame (the notebook shows a truncated view).
df_rev_users

Unnamed: 0,user_id,birth_year,country,city,created_date,user_settings_crypto_unlocked,plan,attributes_notifications_marketing_push,attributes_notifications_marketing_email,num_contacts
0,user_0,1989,PL,Gdansk,2018-01-13,1,STANDARD,1.0,1.0,3
1,user_1,1975,GB,London,2018-01-29,0,STANDARD,-1,-1,21
2,user_2,1987,PL,Poznań,2018-01-18,0,STANDARD,0.0,0.0,21
3,user_3,1994,FR,Paris,2018-01-15,0,STANDARD,1.0,0.0,0
4,user_4,1985,GB,Beckenham,2018-01-11,0,STANDARD,-1,-1,2
...,...,...,...,...,...,...,...,...,...,...
19425,user_19425,1994,IE,Dublin,2018-12-14,0,STANDARD,1.0,1.0,17
19426,user_19426,1977,ES,Madrid,2018-12-28,0,STANDARD,1.0,1.0,0
19427,user_19427,1991,PL,Kraków,2018-12-23,0,STANDARD,1.0,1.0,13
19428,user_19428,1984,GB,London,2018-12-03,0,STANDARD,-1,-1,0


In [46]:
# Read the clock once so both derived features share the same reference date.
# NOTE: the values depend on the day the notebook is run, and this cell is not
# idempotent -- it overwrites 'created_date' and 'birth_year' with derived
# numbers, so rebuild df_rev_users from the raw data before running it again.
reference_date = pd.Timestamp.now().normalize()

# Account tenure in whole days; unparseable dates become NaN (errors='coerce').
signup_dates = pd.to_datetime(df_rev_users['created_date'], errors='coerce')
df_rev_users['created_date'] = (reference_date - signup_dates).dt.days

# Approximate age: current calendar year minus year of birth.
df_rev_users['birth_year'] = reference_date.year - df_rev_users['birth_year']
df_rev_users

Unnamed: 0,user_id,birth_year,country,city,created_date,user_settings_crypto_unlocked,plan,attributes_notifications_marketing_push,attributes_notifications_marketing_email,num_contacts
0,user_0,32,PL,Gdansk,1181,1,STANDARD,1.0,1.0,3
1,user_1,46,GB,London,1165,0,STANDARD,-1,-1,21
2,user_2,34,PL,Poznań,1176,0,STANDARD,0.0,0.0,21
3,user_3,27,FR,Paris,1179,0,STANDARD,1.0,0.0,0
4,user_4,36,GB,Beckenham,1183,0,STANDARD,-1,-1,2
...,...,...,...,...,...,...,...,...,...,...
19425,user_19425,27,IE,Dublin,846,0,STANDARD,1.0,1.0,17
19426,user_19426,44,ES,Madrid,832,0,STANDARD,1.0,1.0,0
19427,user_19427,30,PL,Kraków,837,0,STANDARD,1.0,1.0,13
19428,user_19428,37,GB,London,857,0,STANDARD,-1,-1,0


In [35]:
# NOTE(review): `df` is not defined in any cell shown in this notebook, and
# this cell's execution count (35) is lower than that of the cells above -- it
# looks like leftover scratch work relying on old kernel state. Confirm which
# frame `df` should be or remove the cell; unless `df` is defined elsewhere it
# will raise NameError on a fresh Restart & Run All.
pd.to_datetime(df['created_date']).dt.year - pd.Timestamp.now().year

0       -3
1       -3
2       -3
3       -3
4       -3
        ..
19425   -3
19426   -3
19427   -3
19428   -3
19429   -3
Name: created_date, Length: 19430, dtype: int64

### TRANSACTIONS

In [None]:
# Report which transaction columns contain missing values, and how many.
missing_values_table(df_rev_transactions)

In [None]:
# Missing card-holder-presence values become their own "UNKNOWN" category.
# The filled column is assigned back because `df[col].fillna(..., inplace=True)`
# is chained assignment and does not update the frame under pandas
# Copy-on-Write.
df_rev_transactions['ea_cardholderpresence'] = (
    df_rev_transactions['ea_cardholderpresence'].fillna("UNKNOWN")
)

# Kept as the cell's last expression so the distinct values are displayed
# (a bare expression earlier in the cell is evaluated and then discarded).
df_rev_transactions['ea_cardholderpresence'].unique()

In [None]:
# Parse the merchant category codes as numbers, then mark missing codes with
# the -1111 sentinel. Filling numerically (instead of with the string "-1111"
# that is parsed back afterwards) avoids a mixed-type intermediate column, and
# assigning the result back avoids the chained `fillna(..., inplace=True)`
# that does not update the frame under pandas Copy-on-Write.
# NOTE(review): 'ea_merchant_mcc' is dropped a few cells further down, so this
# step currently has no effect on the final frame -- confirm which is intended.
df_rev_transactions['ea_merchant_mcc'] = pd.to_numeric(
    df_rev_transactions['ea_merchant_mcc']
).fillna(-1111)

In [None]:
# Missing merchant countries become their own "UNKNOWN" category. Assign the
# result back: `df[col].fillna(..., inplace=True)` is chained assignment and
# does not update the frame under pandas Copy-on-Write.
df_rev_transactions['ea_merchant_country'] = (
    df_rev_transactions['ea_merchant_country'].fillna("UNKNOWN")
)

In [None]:
# Drop the merchant city and category-code columns. `columns=` already names
# the axis, and errors='ignore' keeps the cell re-runnable once the columns
# are gone (the in-place version raised KeyError on a second run).
df_rev_transactions = df_rev_transactions.drop(
    columns=['ea_merchant_city', 'ea_merchant_mcc'],
    errors='ignore',
)

In [None]:
# Report the number of distinct values per transaction column.
unqiue_values_table(df_rev_transactions)

In [None]:
# Reuse the shared helper (as the notifications section does) to truncate
# 'created_date' to calendar dates; it updates the frame in place.
date_conversion(df_rev_transactions)
df_rev_transactions.head(5)

In [None]:
# List the distinct values found in the 'transactions_state' column.
df_rev_transactions['transactions_state'].unique()

### DEVICES 

In [None]:
# Report which device columns contain missing values, and how many.
missing_values_table(df_rev_devices)

In [None]:
# Report the number of distinct values per device column.
unqiue_values_table(df_rev_devices)

### NOTIFICATIONS 

In [None]:
# Report which notification columns contain missing values, and how many.
missing_values_table(df_rev_notifications)

In [None]:
# Report the number of distinct values per notification column.
unqiue_values_table(df_rev_notifications)

In [None]:
# Truncate 'created_date' to calendar dates. The helper modifies the frame in
# place and returns it, so as the last expression of the cell the frame is
# also displayed.
date_conversion(df_rev_notifications)