In [1]:
# Enable the Intel(R) Extension for Scikit-learn for this kernel session.
# NOTE(review): the sklearnex docs ask for patch_sklearn() to run before the
# scikit-learn imports it should accelerate -- keep this as the first cell.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# basic system packages
import sys
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

### Load Data

In [11]:
# helper functions
def load_transaction():
    """Read the complete training transaction table from disk.

    Returns:
        pandas.DataFrame: the contents of ``data/train_transaction.csv``,
        parsed with the default ``pd.read_csv`` settings. The path is
        relative to the current working directory.
    """
    csv_location = os.path.join("data", "train_transaction.csv")
    return pd.read_csv(csv_location)

def load_transaction_nonv():
    """Load the transaction dataset without the V feature columns.

    A column is treated as a V feature only when its entire name is ``V``
    followed by digits (``V1``, ``V138``, ...). Matching the whole name,
    rather than testing for a "V" substring, keeps any other column that
    merely has a capital "V" somewhere in its name.

    Returns:
        pandas.DataFrame: every column of the frame returned by
        ``load_transaction()`` whose name is not of the form ``V<digits>``.
    """
    transaction = load_transaction()
    # str.match anchors at the start of the name; the trailing "$" anchors the end.
    is_v_column = transaction.columns.str.match(r"V\d+$")
    # No explicit `del` needed: the full frame is released when this
    # function's local scope ends on return.
    return transaction.loc[:, ~is_v_column]

def load_transaction_v():
    """Load only the V feature columns of the transaction dataset.

    A column is treated as a V feature only when its entire name is ``V``
    followed by digits (``V1``, ``V138``, ...). Matching the whole name,
    rather than testing for a "V" substring, keeps unrelated columns that
    merely contain a capital "V" out of the result.

    Returns:
        pandas.DataFrame: the columns of the frame returned by
        ``load_transaction()`` whose name is of the form ``V<digits>``.
    """
    transaction = load_transaction()
    # str.match anchors at the start of the name; the trailing "$" anchors the end.
    is_v_column = transaction.columns.str.match(r"V\d+$")
    # No explicit `del` needed: the full frame is released when this
    # function's local scope ends on return.
    return transaction.loc[:, is_v_column]

# Exploratory Data Analysis

## Missing Value Analysis

In [25]:
def missing_value_summary(df, k=0.5):
    """Print the columns of ``df`` whose share of missing values is at least ``k``.

    Args:
        df: DataFrame to inspect; columns are visited in their existing order.
        k: Threshold on the missing share, expressed as a fraction in [0, 1]
            (the printed "percentage" is this fraction, not a value out of 100).

    Returns:
        None. One line per qualifying column is written to stdout.
    """
    # Lazily pair each column name with the mean of its null mask.
    null_fractions = ((name, np.average(df[name].isnull())) for name in df)
    for name, fraction in null_fractions:
        # Written as "not >=" so a NaN fraction is skipped, never printed.
        if not fraction >= k:
            continue
        print(name, "missing percentage is", fraction)

In [28]:
def missing_value_list(df, k=0.5):
    """Flag each column of ``df`` by whether its missing share reaches ``k``.

    Args:
        df: DataFrame to inspect.
        k: Threshold on the missing share, expressed as a fraction in [0, 1].

    Returns:
        list[bool]: one plain Python bool per column, in column order; True
        where the fraction of null entries is >= ``k``. Usable as a boolean
        column mask, e.g. ``df.loc[:, missing_value_list(df)]``.
    """
    flags = []
    for name in df:
        null_fraction = np.average(df[name].isnull())
        # bool() turns the numpy comparison result into a built-in bool.
        flags.append(bool(null_fraction >= k))
    return flags

In [12]:
# Keep only the V columns; note the helper reads the whole CSV before slicing.
v_raw = load_transaction_v()

In [26]:
# Print every V column whose missing fraction is at least the default k=0.5.
missing_value_summary(v_raw)

V138 missing percentage is 0.8612371727571375
V139 missing percentage is 0.8612371727571375
V140 missing percentage is 0.8612371727571375
V141 missing percentage is 0.8612371727571375
V142 missing percentage is 0.8612371727571375
V143 missing percentage is 0.8612270125647712
V144 missing percentage is 0.8612270125647712
V145 missing percentage is 0.8612270125647712
V146 missing percentage is 0.8612371727571375
V147 missing percentage is 0.8612371727571375
V148 missing percentage is 0.8612371727571375
V149 missing percentage is 0.8612371727571375
V150 missing percentage is 0.8612270125647712
V151 missing percentage is 0.8612270125647712
V152 missing percentage is 0.8612270125647712
V153 missing percentage is 0.8612371727571375
V154 missing percentage is 0.8612371727571375
V155 missing percentage is 0.8612371727571375
V156 missing percentage is 0.8612371727571375
V157 missing percentage is 0.8612371727571375
V158 missing percentage is 0.8612371727571375
V159 missing percentage is 0.86122

In [31]:
# Display only the V columns whose missing fraction is >= 0.5, using the
# boolean list as a column mask.
v_raw.loc[:, missing_value_list(v_raw)]

Unnamed: 0,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,,,,,,,,,,,...,,,,,,,,,,
590536,,,,,,,,,,,...,,,,,,,,,,
590537,,,,,,,,,,,...,,,,,,,,,,
590538,,,,,,,,,,,...,,,,,,,,,,
