In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
# For Laptop
# Single place to change when the CSV files live somewhere else.
DATA_DIR = '/home/jovyan/work/data/revolut'

# filename1..filename4 are kept as module-level names in case other cells use them.
filename1 = DATA_DIR + '/rev-devices.csv'
filename2 = DATA_DIR + '/rev-notifications.csv'
filename3 = DATA_DIR + '/rev-transactions.csv'
filename4 = DATA_DIR + '/rev-users.csv'

# One frame per CSV file, exactly as read from disk.
rev_devices = pd.read_csv(filename1)
rev_notifications = pd.read_csv(filename2)
rev_transactions = pd.read_csv(filename3)
rev_users = pd.read_csv(filename4)


In [3]:
# Work on copies so the frames read from disk (rev_*) are left as loaded.
df_rev_devices, df_rev_notifications, df_rev_transactions, df_rev_users = (
    raw_frame.copy()
    for raw_frame in (rev_devices, rev_notifications, rev_transactions, rev_users)
)

# DATA PREPROCESSING

In [4]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a short overview (number of columns in ``df`` and how many of them
    have at least one missing value) and returns the per-column detail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name and restricted to columns with at least one
        missing value. Columns are ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order.
    """
    n_rows = len(df)
    # Computed once and reused for both the count and the percentage.
    missing_count = df.isnull().sum()
    if n_rows:
        missing_pct = 100 * missing_count / n_rows
    else:
        # Empty frame: 0 / 0 would yield NaN, and a "!= 0" filter on NaN
        # would wrongly report every column as having missing values.
        missing_pct = missing_count.astype(float)

    summary = pd.concat([missing_count, missing_pct], axis=1)
    summary.columns = ['Missing Values', '% of Total Values']
    # Filter on the count rather than the percentage so NaN can never slip through.
    summary = summary[summary['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    return summary

In [5]:
def unqiue_values_table(df):
    """Summarise how many distinct values each column of ``df`` holds.

    Prints the shape of ``df`` and returns the per-column detail. The
    misspelled function name is kept because existing cells call it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name and restricted to columns with at least one
        distinct (non-null) value. Columns are ``'Unique Values'`` (count) and
        ``'% of Total Values'`` (count relative to the number of rows, rounded
        to two decimals), sorted by count in descending order.
    """
    n_rows = len(df)
    # nunique() scans every column; compute it once and reuse it.
    unique_count = df.nunique()
    if n_rows:
        unique_pct = 100 * unique_count / n_rows
    else:
        # Empty frame: 0 / 0 would yield NaN, and a "!= 0" filter on NaN
        # would keep every column in the table.
        unique_pct = unique_count.astype(float)

    summary = pd.concat([unique_count, unique_pct], axis=1)
    summary.columns = ['Unique Values', '% of Total Values']
    # Filter on the count rather than the percentage so NaN can never slip through.
    summary = summary[summary['Unique Values'] > 0].sort_values(
        'Unique Values', ascending=False).round(2)

    print("Your selected dataframe has " + str(df.shape[0]) + " rows "
          "and " + str(df.shape[1]) + " columns.")
    return summary

In [27]:
def date_conversion(df, column='created_date'):
    """Convert a timestamp column of ``df`` to plain calendar dates, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert. It is modified in place.
    column : str, default 'created_date'
        Name of the column to convert. The default keeps existing
        single-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    # .dt.date drops the time-of-day part, leaving datetime.date objects.
    df[column] = pd.to_datetime(df[column]).dt.date
    return df

### USERS

In [6]:
# Which columns of the users frame contain missing values, and how many?
missing_values_table(df_rev_users)

Your selected dataframe has 12 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
attributes_notifications_marketing_push,6610,34.0
attributes_notifications_marketing_email,6610,34.0


In [7]:
# Number of distinct values per column of the users frame.
unqiue_values_table(df_rev_users)

Your selected dataframe has 19430 rows and 12 columns.


Unnamed: 0,Unique Values,% of Total Values
user_id,19430,100.0
created_date,19430,100.0
city,6049,31.13
num_contacts,302,1.55
birth_year,69,0.36
country,41,0.21
plan,6,0.03
user_settings_crypto_unlocked,2,0.01
attributes_notifications_marketing_push,2,0.01
attributes_notifications_marketing_email,2,0.01


In [8]:
# Fill the missing marketing-notification attributes with the string sentinel "-1".
# The filled column is assigned back because `df[col].fillna(..., inplace=True)`
# works on a temporary Series and stops updating the frame under pandas
# copy-on-write. NOTE(review): the sentinel is a string in an otherwise numeric
# column, so the column ends up with mixed types — confirm that is intended.
for marketing_col in ("attributes_notifications_marketing_push",
                      "attributes_notifications_marketing_email"):
    df_rev_users[marketing_col] = df_rev_users[marketing_col].fillna("-1")

# Keep only the calendar date of created_date (shared helper, mutates in place).
date_conversion(df_rev_users)

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df_rev_users = df_rev_users.drop(
    columns=['num_referrals', 'num_successful_referrals'], errors='ignore')

In [9]:
# Display the users frame after cleaning.
df_rev_users

Unnamed: 0,user_id,birth_year,country,city,created_date,user_settings_crypto_unlocked,plan,attributes_notifications_marketing_push,attributes_notifications_marketing_email,num_contacts
0,user_0,1989,PL,Gdansk,2018-01-13,1,STANDARD,1.0,1.0,3
1,user_1,1975,GB,London,2018-01-29,0,STANDARD,-1,-1,21
2,user_2,1987,PL,Poznań,2018-01-18,0,STANDARD,0.0,0.0,21
3,user_3,1994,FR,Paris,2018-01-15,0,STANDARD,1.0,0.0,0
4,user_4,1985,GB,Beckenham,2018-01-11,0,STANDARD,-1,-1,2
...,...,...,...,...,...,...,...,...,...,...
19425,user_19425,1994,IE,Dublin,2018-12-14,0,STANDARD,1.0,1.0,17
19426,user_19426,1977,ES,Madrid,2018-12-28,0,STANDARD,1.0,1.0,0
19427,user_19427,1991,PL,Kraków,2018-12-23,0,STANDARD,1.0,1.0,13
19428,user_19428,1984,GB,London,2018-12-03,0,STANDARD,-1,-1,0


### TRANSACTIONS

In [10]:
# Which columns of the transactions frame contain missing values, and how many?
missing_values_table(df_rev_transactions)

Your selected dataframe has 12 columns.
There are 4 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
ea_cardholderpresence,1170594,42.7
ea_merchant_country,1158979,42.3
ea_merchant_city,1158794,42.3
ea_merchant_mcc,1158658,42.3


In [11]:
# Replace missing ea_cardholderpresence values with the "UNKNOWN" label.
# Assigned back instead of a chained in-place fillna, which works on a temporary
# Series and stops updating the frame under pandas copy-on-write. The former
# bare `.unique()` call was dropped: its result was neither shown nor stored.
df_rev_transactions['ea_cardholderpresence'] = (
    df_rev_transactions['ea_cardholderpresence'].fillna("UNKNOWN")
)

In [12]:
# Fill missing ea_merchant_mcc values with the sentinel -1111 and store the column
# as numeric. Done in a single assignment rather than a chained in-place fillna,
# which stops updating the frame under pandas copy-on-write.
# NOTE(review): this column is dropped again a few cells further down, so this
# conversion currently has no lasting effect — confirm whether it is still needed.
df_rev_transactions['ea_merchant_mcc'] = pd.to_numeric(
    df_rev_transactions['ea_merchant_mcc'].fillna("-1111")
)

In [13]:
# Replace missing ea_merchant_country values with the "UNKNOWN" label. Assigned
# back instead of a chained in-place fillna, which works on a temporary Series
# and stops updating the frame under pandas copy-on-write.
df_rev_transactions['ea_merchant_country'] = (
    df_rev_transactions['ea_merchant_country'].fillna("UNKNOWN")
)

In [14]:
# Remove the merchant city and MCC columns. `columns=` already selects the axis,
# and errors='ignore' keeps the cell re-runnable once the columns are gone.
df_rev_transactions = df_rev_transactions.drop(
    columns=['ea_merchant_city', 'ea_merchant_mcc'], errors='ignore')

In [15]:
# Number of distinct values per column of the transactions frame.
unqiue_values_table(df_rev_transactions)

Your selected dataframe has 2740075 rows and 10 columns.


Unnamed: 0,Unique Values,% of Total Values
transaction_id,2740075,100.0
created_date,2739658,99.98
amount_usd,48548,1.77
user_id,18766,0.68
ea_merchant_country,199,0.01
transactions_currency,35,0.0
transactions_type,10,0.0
transactions_state,6,0.0
ea_cardholderpresence,3,0.0
direction,2,0.0


In [16]:
# Reduce created_date to its calendar date with the shared helper (it mutates
# the frame in place), then preview the first rows.
date_conversion(df_rev_transactions)
df_rev_transactions.head(5)

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_country,direction,user_id,created_date
0,transaction_0,TRANSFER,AED,4.55,COMPLETED,UNKNOWN,UNKNOWN,OUTBOUND,user_898,2018-04-03
1,transaction_1,CARD_PAYMENT,AED,15.5,COMPLETED,FALSE,ARE,OUTBOUND,user_1652,2019-03-19
2,transaction_2,CARD_PAYMENT,AED,43.4,COMPLETED,FALSE,ARE,OUTBOUND,user_1652,2019-03-18
3,transaction_3,TRANSFER,AED,10043.01,COMPLETED,UNKNOWN,UNKNOWN,OUTBOUND,user_1652,2019-03-22
4,transaction_4,CARD_PAYMENT,AED,43.81,COMPLETED,FALSE,ARE,OUTBOUND,user_5509,2019-03-22


In [17]:
# List the distinct values that occur in transactions_state.
df_rev_transactions['transactions_state'].unique()

array(['COMPLETED', 'REVERTED', 'DECLINED', 'PENDING', 'FAILED',
       'CANCELLED'], dtype=object)

### DEVICES 

In [18]:
# Which columns of the devices frame contain missing values, and how many?
missing_values_table(df_rev_devices)

Your selected dataframe has 2 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [19]:
# Number of distinct values per column of the devices frame.
unqiue_values_table(df_rev_devices)

Your selected dataframe has 19430 rows and 2 columns.


Unnamed: 0,Unique Values,% of Total Values
user_id,19430,100.0
brand,3,0.02


### NOTIFICATIONS 

In [20]:
# Which columns of the notifications frame contain missing values, and how many?
missing_values_table(df_rev_notifications)

Your selected dataframe has 5 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [21]:
# Number of distinct values per column of the notifications frame.
unqiue_values_table(df_rev_notifications)

Your selected dataframe has 121813 rows and 5 columns.


Unnamed: 0,Unique Values,% of Total Values
created_date,121810,100.0
user_id,18953,15.56
reason,17,0.01
channel,3,0.0
status,2,0.0


In [28]:
# Convert created_date of the notifications frame to calendar dates. The helper
# mutates the frame in place, so df_rev_notifications itself is updated even
# though the returned frame is only displayed here, not assigned.
date_conversion(df_rev_notifications)

Unnamed: 0,reason,channel,status,user_id,created_date
0,REENGAGEMENT_ACTIVE_FUNDS,PUSH,SENT,user_7086,2018-12-02
1,REENGAGEMENT_ACTIVE_FUNDS,PUSH,SENT,user_6598,2018-12-01
2,REENGAGEMENT_ACTIVE_FUNDS,PUSH,SENT,user_4151,2018-12-04
3,REENGAGEMENT_ACTIVE_FUNDS,PUSH,SENT,user_1408,2018-12-11
4,REENGAGEMENT_ACTIVE_FUNDS,PUSH,SENT,user_6292,2018-12-14
...,...,...,...,...,...
121808,LOST_CARD_ORDER,EMAIL,SENT,user_12687,2019-05-07
121809,LOST_CARD_ORDER,EMAIL,SENT,user_16875,2019-05-10
121810,LOST_CARD_ORDER,EMAIL,SENT,user_17426,2019-05-09
121811,LOST_CARD_ORDER,EMAIL,SENT,user_15880,2019-05-10


In [None]:
# Reference for one-hot encoding: https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python