In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from utils.common_transformers import NullPct, OutlierHandler, IsNull, DTypeTransformer

from sklearn.feature_selection import VarianceThreshold

import altair as alt
from altair import Chart,X,Y

alt.data_transformers.disable_max_rows()

from sklearn.base import TransformerMixin,BaseEstimator

# Remove Duplicates

In [2]:
df = pd.read_csv("../data/Train.csv")

In [3]:
duplicated = df[df.duplicated()]
df.drop(duplicated.index,inplace=True)

display(duplicated)

Unnamed: 0,session_id,session_number,client_agent,device_details,date,purchased,added_in_cart,checked_out,time_spent


# Outliers

In [4]:
df.head()

Unnamed: 0,session_id,session_number,client_agent,device_details,date,purchased,added_in_cart,checked_out,time_spent
0,57f879e70d3c5fc2a98102d64c9fd84e,715,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,Desktop - Chrome,2020-01-22,1,0,0,236.886
1,a5442b0c7c33d0a811e7661e556b2de8,55,Product/8.0 iPhone/8.1.3,iPhone - iOS,2020-02-27,1,0,0,1277.455
2,305cb1486ed8610c00b37007926cb2c4,11,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like ...,iPhone - MobileWeb,2019-08-01,0,0,0,21.704
3,f2c1ecc9993f0071df91ba178450498c,2794,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Desktop - IE,2019-12-30,0,1,0,455.201
4,e460830ae295e55d2216ebdc761ab9a6,3674,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_1 like ...,iPhone - Web,2019-09-10,0,0,0,1164.877


In [5]:
df.dtypes

session_id         object
session_number      int64
client_agent       object
device_details     object
date               object
purchased           int64
added_in_cart       int64
checked_out         int64
time_spent        float64
dtype: object

In [6]:
df = DTypeTransformer({"purchased":"bool","added_in_cart":"bool","checked_out":"bool","date":"dt_%Y-%m-%d"}).fit_transform(df)

In [7]:
df = OutlierHandler(exclude=["session_number","session_id"]).fit_transform(df)
df.tail()



Unnamed: 0,session_id,session_number,client_agent,device_details,date,purchased,added_in_cart,checked_out,time_spent,time_spent_is_outlier
5424,b6aa30da97fa3c95989c47ccf5ab4e9f,627,Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Ma...,iPhone - Web,2019-11-20,False,False,False,20.601,False
5425,58b62e9fa867d622662b8a4096fdc3f0,6710,Product/4.2.2 iPhone/7.1.1,iPhone - iOS,2019-09-26,False,False,False,1537.666,True
5426,109f7f9b8384e46fc88f580b328298e8,198,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4)...,Desktop - Chrome,2019-08-13,False,False,False,22.711,False
5427,ac6579972e89dfc78d3ee95d15764681,913,,Unknown - MobileWeb,2019-06-18,False,False,False,20.844,False
5428,bd8f8239b827fc29d4e1bec91bd499b7,1254,Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like ...,iPhone - Web,2019-10-21,False,False,False,20.504,False


# Null Analysis

In [8]:
print("Columns")
pd.DataFrame(df.isnull().mean() * 100,columns=["NullPerc"]).query("NullPerc>0")

Columns


Unnamed: 0,NullPerc
client_agent,2.947136


In [9]:
df = NullPct().fit_transform(df)

In [10]:
Chart(df).mark_bar().encode(
    Y("null_pct:N"),
    X("count(null_pct):Q")
)

In [11]:
df = IsNull(exclude=["ID"]).fit_transform(df)

In [12]:
num_data = df.select_dtypes(include=[int,float,bool])
var_thresh = VarianceThreshold()
var_thresh.fit(df.drop(str_cols,axis=1))
cols_removed = [j for i,j in enumerate(list(num_data.columns)) if not var_thresh.get_support()[i]]

print("Cols Removed: ", cols_removed)

df.drop(num_data.columns,axis=1,inplace=True)
df = pd.concat([df,num_data.iloc[:,var_thresh.get_support()]],axis=1)
df.head()

NameError: name 'str_cols' is not defined

In [13]:
null_cols = pd.DataFrame(df.isnull().sum(),columns=["Nulls"]).query("Nulls>0").index.to_list()
print("Rows where col value is null")

for col in null_cols:
    print("Column: " + col)
    display(df[df[col].isnull()])

Rows where col value is null


In [14]:
print("Rows With More than a single null value in a row")
df.loc[df.isnull().sum(axis=1)>=2,:]

Rows With More than a single null value in a row


Unnamed: 0,null_pct,time_spent_is_outlier,session_id_is_null,session_number_is_null,client_agent_is_null,device_details_is_null,date_is_null,purchased_is_null,added_in_cart_is_null,checked_out_is_null,time_spent_is_null


In [15]:
df.columns

Index(['null_pct', 'time_spent_is_outlier', 'session_id_is_null',
       'session_number_is_null', 'client_agent_is_null',
       'device_details_is_null', 'date_is_null', 'purchased_is_null',
       'added_in_cart_is_null', 'checked_out_is_null', 'time_spent_is_null'],
      dtype='object')

In [16]:
df["null_pct"].value_counts()

0.00     5269
11.11     160
Name: null_pct, dtype: int64

In [17]:
df.drop("null_pct",axis=1,inplace=True)

## Target Column Outlier

In [18]:
df["time_spent_is_outlier"].value_counts(normalize=True)*100

False    87.658869
True     12.341131
Name: time_spent_is_outlier, dtype: float64

In [19]:
df.query("time_spent_is_outlier==True").sample(10)

Unnamed: 0,time_spent_is_outlier,session_id_is_null,session_number_is_null,client_agent_is_null,device_details_is_null,date_is_null,purchased_is_null,added_in_cart_is_null,checked_out_is_null,time_spent_is_null
685,True,False,False,False,False,False,False,False,False,False
1774,True,False,False,False,False,False,False,False,False,False
3803,True,False,False,False,False,False,False,False,False,False
5051,True,False,False,False,False,False,False,False,False,False
3265,True,False,False,False,False,False,False,False,False,False
5129,True,False,False,False,False,False,False,False,False,False
3123,True,False,False,False,False,False,False,False,False,False
4962,True,False,False,False,False,False,False,False,False,False
747,True,False,False,False,False,False,False,False,False,False
3212,True,False,False,False,False,False,False,False,False,False


In [20]:
df.dtypes

time_spent_is_outlier     bool
session_id_is_null        bool
session_number_is_null    bool
client_agent_is_null      bool
device_details_is_null    bool
date_is_null              bool
purchased_is_null         bool
added_in_cart_is_null     bool
checked_out_is_null       bool
time_spent_is_null        bool
dtype: object

In [21]:
df.drop(["session_id","session_number"],axis=1,inplace=True)

KeyError: "['session_id' 'session_number'] not found in axis"

In [22]:
mapping = {"purchased":"bool","added_in_cart":"bool","checked_out":"bool","date":"dt_%Y-%m-%d","time_spent_is_outlier":"bool","client_agent_is_null":"bool"}

In [23]:
import json
with open('../intermediate_data/dtypes.json', 'w') as f:
    json.dump(mapping, f)

In [24]:
df.to_json("../intermediate_data/null_outlier_handled.json")