# Introduction

This notebook is used to extract general information regarding the `processed_file.csv` for team discussions.

### Imports and Connect to Google Drive

In [1]:
import pandas as pd
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls -l drive/Shareddrives/

total 4
drwx------ 1 root root 4096 Apr  4 23:15 'SIADS - 694-695 Team Drive'


In [4]:
!head -10 '/content/drive/Shareddrives/SIADS - 694-695 Team Drive/processed_file.csv'

head: cannot open '/content/drive/Shareddrives/SIADS - 694-695 Team Drive/processed_file.csv' for reading: No such file or directory


### Load CSV into DataFrame



In [5]:
# df = pd.read_csv(
#   '/content/drive/Shareddrives/SIADS - 694-695 Team Drive/Sample Datasets/2016 - 2017 Google Analytics Dataset.csv',
#   nrows=1000,
#   # chunksize=10000
# )
df = pd.read_csv('/content/drive/Shareddrives/SIADS - 694-695 Team Drive/processed_file.csv', index_col=0)

FileNotFoundError: ignored

In [None]:
df.columns

Index(['socialEngagementType', 'hits', 'channelGrouping', 'trafficSource',
       'totals', 'date', 'device', 'fullVisitorId', 'geoNetwork',
       'totals.visits', 'totals.hits', 'totals.pageviews', 'totals.timeOnSite',
       'totals.transactions', 'totals.newVisits', 'totals.screenviews',
       'totals.uniqueScreenviews', 'totals.timeOnScreen',
       'totals.totalTransactionRevenue', 'hits.type', 'geoNetwork.country',
       'trafficSource.source', 'trafficSource.medium',
       'trafficSource.isTrueDirect', 'device.browser',
       'device.operatingSystem', 'device.deviceCategory', 'hits.hour',
       'hits.minute', 'hits.transaction', 'hits.item', 'hits.dataSource',
       'hits.appInfo', 'hits.eCommerceAction', 'totals.bounces',
       'hits.appInfo.screenDepth', 'hits.eCommerceAction.action_type',
       'hits.transaction.transactionRevenue', 'hits.item.productName',
       'hits.item.productCategory', 'hits.item.itemRevenue'],
      dtype='object')

### Get Column Overviews

In [None]:
def get_column_overview(
    df: pd.DataFrame,
    column_name: str,
    categorical: bool=False,
    print_unique_values: bool=False,
  ):
  """
  Prints out the column information of a given column name
  """

  sample_df_column = df[column_name]
  column_dtype = sample_df_column.dtype
  unique_count = len(sample_df_column.unique())
  nan_count = sample_df_column.isna().sum()

  min_value = max_value = mean_value = None
  if not categorical and column_dtype.kind in 'biufc': # Check if it's numeric
    min_value = sample_df_column.min()
    max_value = sample_df_column.max()
    mean_value = sample_df_column.mean()

  print(
    column_name,
    column_dtype,
    unique_count, 
    nan_count,
    min_value,
    max_value,
    mean_value
  )

  if print_unique_values:
    print("Unique Values:", sample_df_column.unique())



In [None]:
df.head()

Unnamed: 0,socialEngagementType,hits,channelGrouping,trafficSource,totals,date,device,fullVisitorId,geoNetwork,totals.visits,...,hits.dataSource,hits.appInfo,hits.eCommerceAction,totals.bounces,hits.appInfo.screenDepth,hits.eCommerceAction.action_type,hits.transaction.transactionRevenue,hits.item.productName,hits.item.productCategory,hits.item.itemRevenue
0,Not Socially Engaged,"[{'hitNumber': 1, 'time': 0, 'hour': 6, 'minut...",Social,"{'referralPath': '/', 'campaign': '(not set)',...","{'visits': 1, 'hits': 10, 'pageviews': 9, 'tim...",20160801,"{'browser': 'Chrome', 'browserVersion': 'not a...",5160807529212499245,"{'continent': 'Europe', 'subContinent': 'Easte...",1,...,"[None, None, None, None, None, None, None, Non...","[{'name': None, 'version': None, 'id': None, '...","[{'action_type': '0', 'step': 1, 'option': Non...",,0,0,,,,
1,Not Socially Engaged,"[{'hitNumber': 1, 'time': 0, 'hour': 22, 'minu...",Organic Search,"{'referralPath': None, 'campaign': '(not set)'...","{'visits': 1, 'hits': 2, 'pageviews': 2, 'time...",20160801,"{'browser': 'Chrome', 'browserVersion': 'not a...",558025075151655343,"{'continent': 'Americas', 'subContinent': 'Nor...",1,...,"[None, None]","[{'name': None, 'version': None, 'id': None, '...","[{'action_type': '0', 'step': 1, 'option': Non...",,0,0,,,,
2,Not Socially Engaged,"[{'hitNumber': 1, 'time': 0, 'hour': 13, 'minu...",Direct,"{'referralPath': None, 'campaign': '(not set)'...","{'visits': 1, 'hits': 2, 'pageviews': 2, 'time...",20160801,"{'browser': 'Chrome', 'browserVersion': 'not a...",3751823672052015547,"{'continent': 'Americas', 'subContinent': 'Nor...",1,...,"[None, None]","[{'name': None, 'version': None, 'id': None, '...","[{'action_type': '0', 'step': 1, 'option': Non...",,0,0,,,,
3,Not Socially Engaged,"[{'hitNumber': 1, 'time': 0, 'hour': 10, 'minu...",Direct,"{'referralPath': None, 'campaign': '(not set)'...","{'visits': 1, 'hits': 3, 'pageviews': 3, 'time...",20160801,"{'browser': 'Chrome', 'browserVersion': 'not a...",5231073281895209426,"{'continent': 'Americas', 'subContinent': 'Nor...",1,...,"[None, None, None]","[{'name': None, 'version': None, 'id': None, '...","[{'action_type': '0', 'step': 1, 'option': Non...",,0,0,,,,
4,Not Socially Engaged,"[{'hitNumber': 1, 'time': 0, 'hour': 8, 'minut...",Display,"{'referralPath': None, 'campaign': '(not set)'...","{'visits': 1, 'hits': 3, 'pageviews': 3, 'time...",20160801,"{'browser': 'Chrome', 'browserVersion': 'not a...",7157626079567895669,"{'continent': 'Americas', 'subContinent': 'Nor...",1,...,"[None, None, None]","[{'name': None, 'version': None, 'id': None, '...","[{'action_type': '0', 'step': 1, 'option': Non...",,0,0,,,,


In [None]:
print(
    "column_name"
    "dtype",
    "unique_count", 
    "nan_count",
    "min_value",
    "max_value",
    "mean_value"
)
for col in df.columns:
  get_column_overview(
    df,
    col,
  )

column_namedtype unique_count nan_count min_value max_value mean_value
socialEngagementType object 1 0 None None None
hits object 46082 0 None None None
channelGrouping object 7 0 None None None
trafficSource object 1677 0 None None None
totals object 11845 0 None None None
date int64 184 0 20160801 20170131 20162200.977467768
device object 81 0 None None None
fullVisitorId uint64 47867 0 45417921646651 9999978264901065827 4.531675207759643e+18
geoNetwork object 6954 0 None None None
totals.visits int64 1 0 1 1 1.0
totals.hits int64 149 0 1 483 4.688340717678521
totals.pageviews float64 116 5 1.0 340.0 3.8818786052174605
totals.timeOnSite float64 1965 25145 1.0 9963.0 245.34025079123433
totals.transactions float64 5 49519 1.0 5.0 1.0459965928449744
totals.newVisits float64 2 10351 1.0 1.0 1.0
totals.screenviews float64 1 50106 nan nan nan
totals.uniqueScreenviews float64 1 50106 nan nan nan
totals.timeOnScreen float64 1 50106 nan nan nan
totals.totalTransactionRevenue float64 504 49521

In [None]:
get_column_overview(
  df,
  "hits.eCommerceAction.action_type",
  print_unique_values=True,
)

hits.eCommerceAction.action_type int64 6 0 0 5 0.0060671376681435355
Unique Values: [0 2 3 1 4 5]


In [6]:
%cd "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/python-files"
!ls -l

/content/drive/Shareddrives/SIADS - 694-695 Team Drive/python-files
total 45
-rw------- 1 root root  1804 May 11 20:24 clean_dataset.py
-rw------- 1 root root   721 May 11 21:43 create_sample_dataset.py
-rw------- 1 root root  3327 May 11 03:43 data_extraction.py
-rw------- 1 root root     0 Apr 23 23:30 data_preprocessing.py
-rw------- 1 root root   165 May  3 22:13 ml_algo.py
-rw------- 1 root root 21420 May 11 21:40 orchestrator.ipynb
drwx------ 2 root root  4096 May 11 04:15 __pycache__
-rw------- 1 root root  9557 May 11 21:44 transaction_extraction.py
-rw------- 1 root root  3219 May 11 17:34 utils.py


In [28]:
import os
import random
import pandas as pd
from utils import (append_sample, get_full_sample)

def create_sample_dataset(
    transactions_and_non_transactions_input_directory: str,
  ):
  """
  This function does the following:
  1. Reads all transactions CSV from the input directory and join as one DF.
  2. Reads all non-transactions CSV from the input directory and samples 10%
  3. Join both results from Step 1 and 2 into one DF
  """
  transactions_directory = f"{transactions_and_non_transactions_input_directory}transactions/"
  transactions_monthly_files = os.listdir(transactions_directory)

  non_transactions_directory = f"{transactions_and_non_transactions_input_directory}non_transactions/"
  non_transactions_monthly_files = os.listdir(non_transactions_directory)

  # Transactions
  transactions_df = pd.DataFrame({})
  for monthly_filename in transactions_monthly_files:
    transaction_file_name = f"{transactions_directory}{monthly_filename}"
    df = pd.read_csv(transaction_file_name)
    transactions_df = pd.concat([transactions_df, df], axis=0)

  outdir = f"../datasets/test_folder/transactions"
  if not os.path.exists(outdir):
    os.mkdir(outdir)

  transactions_df.to_csv(f"{outdir}/all_transactions.csv")

  # Non-transactions
  non_transaction_files = [
    f"{non_transactions_directory}{monthly_filename}"
    for monthly_filename in non_transactions_monthly_files
  ]
  print(non_transaction_files)
  non_transactions_df = get_full_sample(non_transaction_files)

  return transactions_df, non_transactions_df


transactions_input_directory = \
  "../datasets/monthly_partitioned_data_transactions/"

transactions_df, non_transactions_df = create_sample_dataset(transactions_input_directory)



['../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_January 2017 Google Analytics Dataset.csv', '../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_August 2016 Google Analytics Dataset.csv', '../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_September 2016 Google Analytics Dataset.csv', '../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_October 2016 Google Analytics Dataset.csv', '../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_November 2016 Google Analytics Dataset.csv', '../datasets/monthly_partitioned_data_transactions/non_transactions/non_transactions_from_December 2016 Google Analytics Dataset.csv']


In [32]:
non_transactions_df

Unnamed: 0.1,Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,...,hits.item,hits.dataSource,hits.transaction.transactionRevenue,hits.item.productName,hits.item.productCategory,hits.item.itemRevenue,hits.appInfo,hits.appInfo.screenDepth,hits.eCommerceAction,hits.eCommerceAction.action_type
0,3,,1,1483304893,1483304893,20170101,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'North...",...,[None],[None],,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': None}]",0
1,7,,2,1483310202,1483310202,20170101,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/google-merchandise-store', ...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...",...,[None],[None],,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': None}]",0
2,38,,1,1483273712,1483273712,20170101,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...",...,[None],[None],,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': None}]",0
3,43,,1,1483297500,1483297500,20170101,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'Weste...",...,[None],[None],,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': None}]",0
4,50,,1,1483284315,1483284315,20170101,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,[None],[None],,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': None}]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7767,79086,,1,1483236881,1483236881,20161231,"{'visits': 1, 'hits': 3, 'pageviews': 3, 'time...","{'referralPath': '/yt/about/', 'campaign': '(n...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[{'transactionId': None, 'productName': None, ...","[None, None, None]",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
7768,79094,,1,1483240140,1483240140,20161231,"{'visits': 1, 'hits': 4, 'pageviews': 4, 'time...","{'referralPath': '/yt/about/', 'campaign': '(n...","{'browser': 'Firefox', 'browserVersion': 'not ...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, {'transactionId': None, 'productName': ...","[None, None, None, None]",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
7769,79103,,1,1483219641,1483219641,20161231,"{'visits': 1, 'hits': 6, 'pageviews': 6, 'time...","{'referralPath': '/yt/about/fr/', 'campaign': ...","{'browser': 'Opera Mini', 'browserVersion': 'n...","{'continent': 'Africa', 'subContinent': 'Weste...",...,"[None, None, None, None, None, None]","[None, None, None, None, None, None]",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
7770,79108,,1,1483180480,1483180480,20161231,"{'visits': 1, 'hits': 7, 'pageviews': 7, 'time...","{'referralPath': '/yt/about/', 'campaign': '(n...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Asia', 'subContinent': 'Souther...",...,"[None, {'transactionId': None, 'productName': ...","[None, None, None, None, None, None, None]",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0


In [31]:
transactions_df

Unnamed: 0.1,Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,...,hits.item,hits.dataSource,hits.transaction.transactionRevenue,hits.item.productName,hits.item.productCategory,hits.item.itemRevenue,hits.appInfo,hits.appInfo.screenDepth,hits.eCommerceAction,hits.eCommerceAction.action_type
0,1024,,1,1483274750,1483274750,20170101,"{'visits': 1, 'hits': 20, 'pageviews': 17, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, {'transactionId': None, 'productName': ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
1,1030,,2,1483325774,1483325774,20170101,"{'visits': 1, 'hits': 21, 'pageviews': 15, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Safari', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[{'transactionId': None, 'productName': None, ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
2,1044,,1,1483325386,1483325386,20170101,"{'visits': 1, 'hits': 25, 'pageviews': 25, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[{'transactionId': None, 'productName': None, ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
3,1047,,1,1483317147,1483317147,20170101,"{'visits': 1, 'hits': 25, 'pageviews': 23, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[{'transactionId': None, 'productName': None, ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
4,1073,,5,1483295231,1483295231,20170101,"{'visits': 1, 'hits': 52, 'pageviews': 46, 'ti...","{'referralPath': '/', 'campaign': '(not set)',...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, None, {'transactionId': None, 'productN...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,78826,,1,1483240868,1483240868,20161231,"{'visits': 1, 'hits': 34, 'pageviews': 27, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, None, {'transactionId': None, 'productN...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
1391,78828,,4,1483198871,1483198871,20161231,"{'visits': 1, 'hits': 36, 'pageviews': 32, 'ti...","{'referralPath': '/', 'campaign': '(not set)',...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, {'transactionId': None, 'productName': ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
1392,78833,,1,1483251091,1483251091,20161231,"{'visits': 1, 'hits': 39, 'pageviews': 36, 'ti...","{'referralPath': '/', 'campaign': '(not set)',...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[None, {'transactionId': None, 'productName': ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0
1393,78834,,1,1483208352,1483208352,20161231,"{'visits': 1, 'hits': 39, 'pageviews': 30, 'ti...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...",...,"[{'transactionId': None, 'productName': None, ...","[None, None, None, None, None, None, None, Non...",,,,,"[{'name': None, 'version': None, 'id': None, '...",0,"[{'action_type': '0', 'step': 1, 'option': Non...",0


In [33]:
combined_df = pd.concat([non_transactions_df, transactions_df])

In [35]:
combined_df.to_csv("../datasets/sample_dataset.csv")

In [8]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_August 2016 Google Analytics Dataset.csv"

1120 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_August 2016 Google Analytics Dataset.csv


In [9]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_December 2016 Google Analytics Dataset.csv"

1396 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_December 2016 Google Analytics Dataset.csv


In [11]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_November 2016 Google Analytics Dataset.csv"

920 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_November 2016 Google Analytics Dataset.csv


In [12]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_October 2016 Google Analytics Dataset.csv"

873 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_October 2016 Google Analytics Dataset.csv


In [13]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_September 2016 Google Analytics Dataset.csv"

860 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_September 2016 Google Analytics Dataset.csv


In [14]:
!wc -l "/content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_January 2017 Google Analytics Dataset.csv"

698 /content/drive/Shareddrives/SIADS - 694-695 Team Drive/datasets/monthly_partitioned_data_transactions/transactions/transactions_from_January 2017 Google Analytics Dataset.csv
