In [12]:
import os

from google.cloud import bigquery
from google.oauth2 import service_account

In [13]:
# Set up credentials
# The key file location is taken from the GOOGLE_APPLICATION_CREDENTIALS
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used as a fallback.
credentials = service_account.Credentials.from_service_account_file(
    os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/Users/ttanaka/Downloads/predictive-behavior-analytics-b509bad93e58.json',
    )
)

# Create a BigQuery client bound to the project named in the key file
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

In [14]:
query = """
SELECT
  *
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
LIMIT 10000
"""

# Submit the SQL through the BigQuery client and load the result into pandas.
df = (
    client
    .query(query)
    .to_dataframe()
)

# Preview the leading rows; as the last expression it becomes the cell output.
df.head()


I0000 00:00:1723939694.695566 1460281 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


Unnamed: 0,visitorId,visitNumber,visitId,visitStartTime,date,totals,trafficSource,device,geoNetwork,customDimensions,hits,fullVisitorId,userId,clientId,channelGrouping,socialEngagementType
0,,1,1501591568,1501591568,20170801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': None, 'campaign': '(not set)'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...",[],"[{'hitNumber': 1, 'time': 0, 'hour': 5, 'minut...",3418334011779872055,,,Organic Search,Not Socially Engaged
1,,2,1501589647,1501589647,20170801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/analytics/web/', 'campaign'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Asia', 'subContinent': 'Souther...","[{'index': 4, 'value': 'APAC'}]","[{'hitNumber': 1, 'time': 0, 'hour': 5, 'minut...",2474397855041322408,,,Referral,Not Socially Engaged
2,,1,1501616621,1501616621,20170801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/analytics/web/', 'campaign'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'North...","[{'index': 4, 'value': 'EMEA'}]","[{'hitNumber': 1, 'time': 0, 'hour': 12, 'minu...",5870462820713110108,,,Referral,Not Socially Engaged
3,,1,1501601200,1501601200,20170801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/analytics/web/', 'campaign'...","{'browser': 'Firefox', 'browserVersion': 'not ...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 8, 'minut...",9397809171349480379,,,Referral,Not Socially Engaged
4,,1,1501615525,1501615525,20170801,"{'visits': 1, 'hits': 1, 'pageviews': 1, 'time...","{'referralPath': '/analytics/web/', 'campaign'...","{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Americas', 'subContinent': 'Nor...","[{'index': 4, 'value': 'North America'}]","[{'hitNumber': 1, 'time': 0, 'hour': 12, 'minu...",6089902943184578335,,,Referral,Not Socially Engaged


In [15]:
# Tally the rows whose visitorId is null.
missing_visitorId_count = df['visitorId'].isna().sum()
print(f"Number of rows with missing visitorId: {missing_visitorId_count}")

Number of rows with missing visitorId: 2556


In [16]:
# One feature per DataFrame column, so count the column labels.
num_features = len(df.columns)

# Report the count.
print(f"The dataset contains {num_features} features.")

The dataset contains 16 features.


In [17]:
# Print every column label as a plain Python list.
print("The features in the dataset are:")
print(list(df.columns))

The features in the dataset are:
['visitorId', 'visitNumber', 'visitId', 'visitStartTime', 'date', 'totals', 'trafficSource', 'device', 'geoNetwork', 'customDimensions', 'hits', 'fullVisitorId', 'userId', 'clientId', 'channelGrouping', 'socialEngagementType']


In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import logging

# Set up logging
# Configure the root logger at INFO level so the logger.info() progress
# messages emitted by the cleaning functions below are shown.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; used by clean_data().
logger = logging.getLogger(__name__)

def flatten_nested_columns(df, nested_columns=None):
    """Expand dict-valued columns into one flat column per nested key.

    Each listed column is normalised with ``pd.json_normalize`` and its
    sub-columns are named ``<column>_<key>`` (deeper levels keep the
    ``.`` separator produced by ``json_normalize``). The nested source
    columns are removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose nested columns hold dicts (or missing values).
    nested_columns : list of str, optional
        Columns to expand. Defaults to
        ``['totals', 'trafficSource', 'device', 'geoNetwork']``.
        Columns that are not present in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    if nested_columns is None:
        nested_columns = ['totals', 'trafficSource', 'device', 'geoNetwork']

    # Only touch columns that actually exist so a partial schema does not raise.
    present_columns = [col for col in nested_columns if col in df.columns]

    flattened_dfs = []
    for col in present_columns:
        # Non-dict entries (None/NaN) become empty records -> all-NaN row,
        # instead of crashing the normalisation.
        records = [value if isinstance(value, dict) else {} for value in df[col]]
        flattened = pd.json_normalize(records)
        flattened.columns = [f'{col}_{subcol}' for subcol in flattened.columns]
        # json_normalize numbers rows 0..n-1; re-attach the caller's index so
        # the axis=1 concat below lines rows up by position, not by label.
        flattened.index = df.index
        flattened_dfs.append(flattened)

    df_flattened = pd.concat([df.drop(columns=present_columns)] + flattened_dfs, axis=1)

    return df_flattened

def extract_hit_level_data(df):
    """Build a hit-level DataFrame (one row per hit) from session-level data.

    Parameters
    ----------
    df : pandas.DataFrame
        Session-level frame with the columns ``hits`` (a sequence of hit
        dicts per session), ``fullVisitorId``, ``visitId`` and ``date``
        (parsed with the ``%Y%m%d`` format).

    Returns
    -------
    pandas.DataFrame
        One row per hit with the fixed column set listed in
        ``columns_to_keep``. ``session_id`` is the 0-based position of the
        parent session in ``df``. Columns that no hit provides are present
        and filled with NaN (``False`` for the boolean flags). Sessions
        without hits contribute no rows.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the required session-level columns.
    """
    # NOTE(review): 'item.productSKU' may not match the key casing in the hit
    # records; if it does not, the column is simply all-NaN — confirm against
    # the source schema.
    columns_to_keep = [
        'session_id', 'fullVisitorId', 'visitId', 'date',
        'hitNumber', 'time', 'hour', 'minute',
        'isEntrance', 'isExit', 'page.pagePath', 'page.pageTitle',
        'eventInfo.eventCategory', 'eventInfo.eventAction', 'eventInfo.eventLabel',
        'transaction.transactionId', 'transaction.transactionRevenue',
        'item.productName', 'item.productCategory', 'item.productSKU', 'item.itemRevenue'
    ]

    # Collect every hit dict together with the position of its session.
    # (Exploding a Series of per-session DataFrames yields column labels,
    # not hit rows, so the hits are gathered explicitly instead.)
    session_positions = []
    hit_records = []
    for position, session_hits in enumerate(df['hits']):
        if not isinstance(session_hits, (list, tuple, np.ndarray)):
            continue  # missing / malformed hits entry: no rows for this session
        for hit in session_hits:
            if isinstance(hit, dict):
                session_positions.append(position)
                hit_records.append(hit)

    # Flatten nested hit structs ('page', 'eventInfo', ...) into dotted columns.
    hits_data = pd.json_normalize(hit_records)
    hits_data['session_id'] = session_positions

    # Add session-level identifiers by positional lookup, which keeps them
    # aligned with the hits regardless of df's index labels.
    row_lookup = np.asarray(session_positions, dtype=int)
    for id_col in ('fullVisitorId', 'visitId', 'date'):
        hits_data[id_col] = df[id_col].to_numpy()[row_lookup]
    hits_data['date'] = pd.to_datetime(hits_data['date'], format='%Y%m%d')

    # reindex (rather than plain selection) so optional columns that are
    # absent from the data do not raise KeyError.
    hits_df = hits_data.reindex(columns=columns_to_keep)

    # Convert numeric columns; unparseable values become NaN.
    numeric_columns = ['time', 'hour', 'minute', 'transaction.transactionRevenue', 'item.itemRevenue']
    for col in numeric_columns:
        hits_df[col] = pd.to_numeric(hits_df[col], errors='coerce')

    # Convert boolean columns; a missing flag means False (a bare
    # astype(bool) would turn NaN into True).
    boolean_columns = ['isEntrance', 'isExit']
    for col in boolean_columns:
        flag = hits_df[col]
        hits_df[col] = flag.notna() & flag.astype(bool)

    return hits_df

def clean_data(df):
    """Produce a model-ready session-level frame plus a hit-level frame.

    Steps applied to the session-level data:

    1. Flatten the nested struct columns via ``flatten_nested_columns``.
    2. Drop ``visitorId`` when present and parse ``date`` (``%Y%m%d``).
    3. Copy ``geoNetwork_country`` to ``country`` when it exists.
    4. Median-impute and standard-scale numeric columns.
    5. Stringify, most-frequent-impute and label-encode all remaining
       non-datetime columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw session-level frame; it is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_cleaned, hit_level_df)`` where ``hit_level_df`` comes from
        ``extract_hit_level_data(df)``.

    Notes
    -----
    NOTE(review): identifier columns such as ``fullVisitorId`` and
    ``visitId`` are encoded/scaled like any other feature, so they no longer
    match the raw identifiers kept in ``hit_level_df`` — confirm whether a
    join between the two frames is needed downstream.
    """
    # Flatten nested columns
    df_cleaned = flatten_nested_columns(df)
    logger.info(f"Flattened DataFrame shape: {df_cleaned.shape}")

    # Drop the empty 'visitorId' column if it exists
    if 'visitorId' in df_cleaned.columns:
        df_cleaned = df_cleaned.drop('visitorId', axis=1)
        logger.info("Dropped empty 'visitorId' column")

    # Convert 'date' column to datetime
    df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], format='%Y%m%d')

    # Extract country from geoNetwork (skipped when the field is unavailable)
    if 'geoNetwork_country' in df_cleaned.columns:
        df_cleaned['country'] = df_cleaned['geoNetwork_country']

    numeric_columns = list(df_cleaned.select_dtypes(include=[np.number]).columns)
    categorical_columns = list(
        df_cleaned.select_dtypes(exclude=[np.number, 'datetime64']).columns
    )

    # Stringify categorical values but keep missing entries as NaN, so the
    # most-frequent imputer below actually sees them. A blanket astype(str)
    # would turn them into the literal categories 'None'/'nan'.
    for col in categorical_columns:
        values = df_cleaned[col]
        observed = values.notna()
        as_text = values.astype(str)
        if observed.any() and not observed.all():
            as_text = as_text.astype(object).where(observed, np.nan)
        # Fully-missing columns keep their placeholder text: there is no mode
        # to impute, and SimpleImputer would otherwise drop the column.
        df_cleaned[col] = as_text

    # A numeric column with no observed value has no median and would be
    # dropped by SimpleImputer (breaking the assignment); use a constant 0.
    empty_numeric = [col for col in numeric_columns if df_cleaned[col].isna().all()]
    if empty_numeric:
        df_cleaned[empty_numeric] = 0.0

    # Impute missing values
    if numeric_columns:
        numeric_imputer = SimpleImputer(strategy='median')
        df_cleaned[numeric_columns] = numeric_imputer.fit_transform(df_cleaned[numeric_columns])

    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        df_cleaned[categorical_columns] = categorical_imputer.fit_transform(
            df_cleaned[categorical_columns]
        )

    # Feature scaling (zero mean, unit variance per numeric column)
    if numeric_columns:
        scaler = StandardScaler()
        df_cleaned[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])

    # Handle categorical variables: integer codes, fitted per column
    for col in categorical_columns:
        df_cleaned[col] = LabelEncoder().fit_transform(df_cleaned[col])

    logger.info(f"Final cleaned session-level DataFrame shape: {df_cleaned.shape}")

    # Extract hit-level data from the raw (unencoded) input
    hit_level_df = extract_hit_level_data(df)
    logger.info(f"Hit-level DataFrame shape: {hit_level_df.shape}")

    return df_cleaned, hit_level_df

# Usage
# df_cleaned, hit_level_df = clean_data(df)
# print(df_cleaned.head())
# print(hit_level_df.head())