In [None]:
#creating spark instance
from pyspark.sql import SparkSession
#getOrCreate() returns the already-running session when one exists
spark = (
    SparkSession.builder
    .appName('removeduplicatescart')
    .getOrCreate()
)

In [None]:
#import libraries
import pandas as pd
import numpy as np
from pyspark.sql import functions as sf

In [None]:
#Load sample1.csv: its header row supplies the column names and inferSchema
#lets Spark derive the column types from the file contents
headers1 = spark.read.csv(
    "sample1.csv",
    header=True,
    inferSchema=True,
)
#headers1.printSchema()

In [None]:
#Read the tab-separated hit data from the parent directory. The file is read
#without a header row (header=False) and the column types are inferred.
#The path uses a forward slash: "\h" is not a valid string escape (recent
#Python versions warn about it) and a backslash path only resolves on Windows.
rows = spark.read.option("sep", "\t").csv("../hit_data.tsv", inferSchema=True, header=False)
#rows.printSchema()

In [None]:
#Attach the column names to the data: union() pairs columns by position, so the
#result carries the column names of headers1 plus every row of rows
rowsWithHeader1 = headers1.union(rows)
#rowsWithHeader1.printSchema()
#Total number of rows after the union
rowsWithHeader1.count()

In [None]:
#Keep only the rows whose exclude_hit value is 0 or lower, then count them
keepHit = rowsWithHeader1.exclude_hit <= 0
rowsWithHeader1 = rowsWithHeader1.where(keepHit)
rowsCount = rowsWithHeader1.count()
rowsCount

In [None]:
#Drop every row whose hit_source is one of the excluded codes 5, 7, 8 or 9.
#The filters are applied one after another, in that order.
for excludedSource in (5, 7, 8, 9):
    rowsWithHeader1 = rowsWithHeader1.filter(rowsWithHeader1.hit_source != excludedSource)

In [None]:
#Keep only the columns whose name contains 'post_'
postColumnNames = [name for name in rowsWithHeader1.columns if 'post_' in name]
postData = rowsWithHeader1.select(*postColumnNames)
#Number of columns that were kept
len(postData.columns)

In [None]:
#Add a new column that identifies unique visitors by joining post_visid_high
#and post_visid_low.
#The '_' separator keeps the two halves apart. With an empty separator,
#different pairs collapse into the same id (high=1, low=23 and high=12, low=3
#both give '123'), and an all-digit id is re-typed as a number when the data is
#later read back from CSV with inferSchema, so it no longer compares equal to
#the string ids collected from this column.
postDataWithUniqueId = postData.withColumn(
    'joined_column',
    sf.concat(sf.col('post_visid_high'), sf.lit('_'), sf.col('post_visid_low')),
)

In [None]:
#Give the joined visitor-id column its final name, post_uniqueId, so it sits
#alongside the other post_ columns
postDataWithUniqueId = postDataWithUniqueId.withColumnRenamed(
    existing='joined_column',
    new='post_uniqueId',
)

In [None]:
#Project only the post_uniqueId column into its own DataFrame
UniqueId=postDataWithUniqueId.select('post_uniqueId')
#Number of rows in that column (one per hit, not de-duplicated)
UniqueId.count()

In [None]:
#Drop every column whose name contains 'mobile'; the names are looked up in
#postData, the frame the visitor-id column was added to
mobileColumnNames = [name for name in postData.columns if 'mobile' in name]
postDataWithoutMobile = postDataWithUniqueId.drop(*mobileColumnNames)
#Number of columns left
len(postDataWithoutMobile.columns)

In [None]:
#Select the rows whose post_referrer mentions cart, ship/shipping or chkout4_confirm
referrerPattern = "^(.)*(cart|CART|Cart|shipping|Ship|ship|chkout4_confirm)(.)*$"
hasCartShipConfirmReferrer = postDataWithoutMobile.post_referrer.rlike(referrerPattern)
cartShipConfirmRows = postDataWithoutMobile.where(hasCartShipConfirmReferrer)
cartShipConfirmRows.count()

In [None]:
#Collect the distinct visitor ids of the cart/ship/confirm rows into a Python
#list on the driver. The column is referenced by its declared name,
#post_uniqueId, so the lookup does not rely on case-insensitive resolution,
#and each single-column Row is unpacked by position.
uniqueIdsWithCartShipConfirm = [
    row[0] for row in cartShipConfirmRows.select("post_uniqueId").distinct().collect()
]
#Number of distinct visitors
print(len(uniqueIdsWithCartShipConfirm))

In [None]:
#Fetch every row (not only the matching ones) of the visitors that had cart,
#ship or chkout4_confirm in post_referrer
isCartShipConfirmUser = postDataWithoutMobile.post_uniqueId.isin(uniqueIdsWithCartShipConfirm)
allRowsOfCartShipConfirmUsers = postDataWithoutMobile.filter(isCartShipConfirmUser)
#Row count, reused later as the basis for the null thresholds
allRowsOfCartShipConfirmUsersCount = allRowsOfCartShipConfirmUsers.count()
allRowsOfCartShipConfirmUsersCount

In [None]:
#Keep the rows whose post_event_list contains event 1 as a whole
#comma-separated entry: alone, at the beginning, in the middle or at the end.
#The optional prefix/suffix groups also accept a list that is exactly "1",
#which a pattern requiring a comma next to the 1 would miss.
event1Pattern = "^([0-9 ,.=]*,)?1(,[0-9 ,.=]*)?$"
event1CartShipConfirmRows = allRowsOfCartShipConfirmUsers.filter(
    allRowsOfCartShipConfirmUsers.post_event_list.rlike(event1Pattern)
)
#print count
event1CartShipConfirmRows.count()

In [None]:
#Collect the distinct visitor ids of the event-1 rows into a Python list on
#the driver. The column is referenced by its declared name, post_uniqueId, so
#the lookup does not rely on case-insensitive resolution, and each
#single-column Row is unpacked by position.
event1CartShipConfirmUniqueIds = [
    row[0] for row in event1CartShipConfirmRows.select("post_uniqueId").distinct().collect()
]
#Number of distinct visitors with event 1
print(len(event1CartShipConfirmUniqueIds))

In [None]:
#Abandoned visitors = visitors seen on cart/ship/confirm pages minus the
#visitors that have an event-1 row
cartShipConfirmIdSet = set(uniqueIdsWithCartShipConfirm)
event1IdSet = set(event1CartShipConfirmUniqueIds)
abandonedUsersIds = list(cartShipConfirmIdSet - event1IdSet)
#Number of abandoned visitors
print("{}".format(len(abandonedUsersIds)))

In [None]:
#Convert the Spark DataFrame to a pandas DataFrame (toPandas() collects all
#rows to the driver) so that the null-heavy columns can be dropped with pandas
allRowsOfCartShipConfirmUsersPDF = allRowsOfCartShipConfirmUsers.toPandas()
#(rows, columns) of the pandas frame
allRowsOfCartShipConfirmUsersPDF.shape

In [None]:
#Drop only the columns in which every value is null (how='all'); a column with
#at least one non-null value is kept
rowsWithoutNullColumns = allRowsOfCartShipConfirmUsersPDF.dropna(axis=1,how='all')
#(rows, columns) left after the drop
rowsWithoutNullColumns.shape

In [None]:
#Keep only the columns whose number of non-null values reaches half of the row count
minNonNullHalf = allRowsOfCartShipConfirmUsersCount/2
rowsWithoutNullColumnsThresh50 = rowsWithoutNullColumns.dropna(
    axis='columns',
    thresh=minNonNullHalf,
)
rowsWithoutNullColumnsThresh50.shape

In [None]:
#Keep only the columns whose number of non-null values reaches four fifths of the row count
minNonNullFourFifths = allRowsOfCartShipConfirmUsersCount*4/5
rowsWithoutNullColumnsThresh20 = rowsWithoutNullColumns.dropna(
    axis='columns',
    thresh=minNonNullFourFifths,
)
rowsWithoutNullColumnsThresh20.shape

In [None]:
#Display the names of the columns that survived the 50% threshold
rowsWithoutNullColumnsThresh50.columns

In [None]:
#Display only the columns whose name matches the regex post_prop.*
#(DataFrame.filter selects by label, it does not filter on cell values)
rowsWithoutNullColumnsThresh50.filter(regex=("post_prop.*"))

In [None]:
#Display the post_prop14 column of the 50%-threshold frame
rowsWithoutNullColumnsThresh50.post_prop14

In [None]:
#Retrieve the columns that contain no nulls at all: the number of non-null
#values must reach the full row count
minNonNullAll = allRowsOfCartShipConfirmUsersCount
rowsWithoutNullColumns0 = rowsWithoutNullColumns.dropna(
    axis='columns',
    thresh=minNonNullAll,
)
rowsWithoutNullColumns0.shape

In [None]:
#Names of the columns with no nulls. The frame holding them is
#rowsWithoutNullColumns0; the name rowsWithoutNullColumnsThresh0 is never
#assigned and raises NameError on a fresh kernel.
rowsWithoutNullColumns0.columns

In [None]:
#Display the frame restricted to the columns with no nulls. It is stored as
#rowsWithoutNullColumns0; rowsWithoutNullColumnsThresh0 is never assigned.
rowsWithoutNullColumns0

In [None]:
#Write the 50%-threshold frame to reducedusersrows.csv without the pandas
#index column; the file is loaded back into Spark afterwards
rowsWithoutNullColumnsThresh50.to_csv("reducedusersrows.csv", index=False)

In [None]:
#Load the reduced rows from reducedusersrows.csv back into Spark, taking the
#column names from the header row and letting Spark infer the column types
allUsers = spark.read.csv(
    "reducedusersrows.csv",
    header=True,
    inferSchema=True,
)
#allUsers.printSchema()

In [None]:
#Select every row that belongs to one of the abandoned visitors
isAbandonedUser = allUsers.post_uniqueId.isin(abandonedUsersIds)
allRowsOfAbandonedUsers = allUsers.filter(isAbandonedUser)
#Number of rows of abandoned visitors
allRowsOfAbandonedUsers.count()

In [None]:
#Count the rows of each abandoned visitor: one output row per post_uniqueId
#with its number of rows in a column named count
abandonedUsersGroupBy = allRowsOfAbandonedUsers.groupBy("post_uniqueId").count()

In [None]:
#Print the first rows of the per-visitor row counts (show() prints 20 rows by default)
abandonedUsersGroupBy.show()

In [None]:
#Select every row that belongs to a visitor with an event-1 row; these are
#treated as the purchased users
isPurchasedUser = allUsers.post_uniqueId.isin(event1CartShipConfirmUniqueIds)
allRowsOfPurchasedUsers = allUsers.filter(isPurchasedUser)
#Number of rows of purchased users
allRowsOfPurchasedUsers.count()

In [None]:
#Count the rows of each purchased user: one output row per post_uniqueId with
#its number of rows in a column named count
purchasedUsersGroupBy = allRowsOfPurchasedUsers.groupBy("post_uniqueId").count()

In [None]:
#Print the first rows of the per-user row counts (show() prints 20 rows by default)
purchasedUsersGroupBy.show()

In [None]:
#List the column names of the purchased-user rows
allRowsOfPurchasedUsers.columns

In [None]:
#Print post_prop14 next to post_referrer for the purchased users
#(show() prints the first 20 rows by default)
allRowsOfPurchasedUsers.select("post_prop14","post_referrer").show()

In [None]:
#Print post_evar34 next to post_evar35 for the purchased users
#(show() prints the first 20 rows by default)
allRowsOfPurchasedUsers.select("post_evar34","post_evar35").show()

In [None]:
#Display the schema (column names and the types inferred when the CSV was read)
allRowsOfPurchasedUsers.schema