In [None]:
# Import necessary libraries
import json
import logging
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up credentials.
# The key-file location is taken from GOOGLE_APPLICATION_CREDENTIALS when that
# variable is set, so the notebook is not tied to one machine's home directory;
# the original absolute path is kept as the fallback for backward compatibility.
credentials_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/ttanaka/Downloads/predictive-behavior-analytics-b509bad93e58.json',
)
credentials = service_account.Credentials.from_service_account_file(credentials_path)

# Set the correct project ID
project_id = "predictive-behavior-analytics"

# Log the project ID being used
logger.info(f"Using project ID: {project_id}")

# Create a BigQuery client
client = bigquery.Client(credentials=credentials, project=project_id)

# BigQuery query with selected features relevant for Sales Prediction.
# Every struct field is aliased to `<struct>_<field>` because the cleaning and
# feature-engineering cells read the columns under those names
# (totals_timeOnSite, device_deviceCategory, ...); without an alias BigQuery
# names the column after the leaf field only. totals.visits and totals.bounces
# are selected as well since the feature-engineering cell reads them.
# NOTE(review): `hits` is a repeated record, so a full year of sessions is
# presumably a large download — consider narrowing _TABLE_SUFFIX while iterating.
query = """
SELECT
  CONCAT(fullVisitorId, CAST(visitId AS STRING)) AS session_id,
  date,
  totals.visits AS totals_visits,
  totals.bounces AS totals_bounces,
  totals.timeOnSite AS totals_timeOnSite,
  totals.pageviews AS totals_pageviews,
  totals.transactions AS totals_transactions,
  totals.transactionRevenue AS totals_transactionRevenue,
  trafficSource.source AS trafficSource_source,
  trafficSource.medium AS trafficSource_medium,
  device.deviceCategory AS device_deviceCategory,
  geoNetwork.country AS geoNetwork_country,
  hits
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
  _TABLE_SUFFIX BETWEEN '20160801' AND '20170731'
"""

# Execute the query and load data into a DataFrame
logger.info("Executing BigQuery and loading data...")
df = client.query(query).to_dataframe()
logger.info(f"Data loaded. Shape: {df.shape}")



INFO:__main__:Using project ID: predictive-behavior-analytics
INFO:__main__:Executing BigQuery and loading data...
I0000 00:00:1724000928.014292 2043395 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [None]:
# Data Cleaning functions
def safe_json_loads(x):
    """Decode ``x`` as JSON when it is a string; pass any other value through.

    Args:
        x: A JSON document as ``str``, or a value that is already decoded
            (dict, list, array, ``None``, ...).

    Returns:
        The decoded object for a valid JSON string, ``{}`` for a string that is
        not valid JSON, and ``x`` itself (same object) for non-string input.
    """
    # Non-strings are assumed to be decoded already and are returned untouched.
    if not isinstance(x, str):
        return x
    try:
        return json.loads(x)
    except json.JSONDecodeError:
        # Best effort: a malformed payload is treated as an empty record.
        return {}

def flatten_nested_columns(df):
    """Expand the known nested record columns into flat ``<column>_<field>`` columns.

    For each of ``totals``, ``trafficSource``, ``device`` and ``geoNetwork`` that
    is present, every cell is decoded with ``safe_json_loads``, the records are
    flattened with ``pd.json_normalize``, and the nested column is replaced by the
    resulting columns.

    Args:
        df: Session-level DataFrame. It is not modified; a new frame is returned
            whenever a column is flattened.

    Returns:
        The DataFrame with the nested columns expanded. When none of the nested
        columns is present, ``df`` itself is returned unchanged.
    """
    nested_columns = ['totals', 'trafficSource', 'device', 'geoNetwork']
    for col in nested_columns:
        if col not in df.columns:
            continue
        try:
            decoded = [safe_json_loads(value) for value in df[col]]
            if not any(isinstance(record, dict) for record in decoded):
                # Nothing to expand; dropping the column here would lose its data.
                logger.warning(f"Column {col} holds no nested records; left unflattened")
                continue

            # Rows that are missing or not a record become {} so they produce an
            # all-NaN row instead of aborting the flattening of the whole column.
            records = [record if isinstance(record, dict) else {} for record in decoded]
            flattened = pd.json_normalize(records)
            flattened.columns = [f'{col}_{subcol}' for subcol in flattened.columns]

            # concat(axis=1) aligns on index labels, so the flattened rows must
            # carry the caller's index rather than a fresh 0..n-1 range.
            flattened.index = df.index
            df = pd.concat([df.drop(col, axis=1), flattened], axis=1)
        except Exception as e:
            logger.warning(f"Error flattening column {col}: {str(e)}")
    return df

def _coerce_hits(hits):
    """Return ``hits`` as a list of hit dicts, or ``[]`` when it holds no records."""
    if isinstance(hits, dict):
        return [hits] if hits else []
    if hits is None or isinstance(hits, (str, bytes)) or not hasattr(hits, '__iter__'):
        return []
    return [hit for hit in hits if isinstance(hit, dict)]


def extract_hit_level_data(df):
    """Build one row per hit from the per-session ``hits`` column.

    Each session's ``hits`` value is decoded with ``safe_json_loads``, all hit
    records are flattened in a single ``pd.json_normalize`` call (nested fields
    become dotted names such as ``page.pagePath``), and every hit row is tagged
    with its session's id and date.

    Args:
        df: Session-level DataFrame with a ``hits`` column and a ``date`` column
            (parsed with the ``%Y%m%d`` format). When a ``session_id`` column is
            present it identifies the session; otherwise the row index is used.

    Returns:
        A DataFrame with the columns listed in ``columns_to_keep``. Fields that
        no hit provides are present and filled with NaN (``False`` for the
        boolean flags). If anything fails, the error is logged and an empty
        DataFrame with the same columns is returned.
    """
    columns_to_keep = [
        'session_id', 'date',
        'hitNumber', 'time', 'hour', 'minute',
        'isEntrance', 'isExit', 'page.pagePath', 'page.pageTitle',
        'eventInfo.eventCategory', 'eventInfo.eventAction', 'eventInfo.eventLabel',
        'transaction.transactionId', 'transaction.transactionRevenue',
        'item.productName', 'item.productCategory', 'item.productSKU', 'item.itemRevenue'
    ]
    numeric_columns = ['time', 'hour', 'minute', 'transaction.transactionRevenue', 'item.itemRevenue']
    boolean_columns = ['isEntrance', 'isExit']

    try:
        # One list of hit records per session; sessions without usable hits give [].
        per_session_hits = [_coerce_hits(safe_json_loads(raw)) for raw in df['hits']]
        hit_counts = [len(hits) for hits in per_session_hits]
        hit_records = [hit for hits in per_session_hits for hit in hits]

        # A single normalize call keeps the hit rows in session order, which is
        # what lets the per-session values below be attached by repetition.
        hits_data = pd.json_normalize(hit_records)

        if 'session_id' in df.columns:
            session_keys = df['session_id']
        else:
            session_keys = df.index.to_series()
        hits_data['session_id'] = session_keys.repeat(hit_counts).reset_index(drop=True)
        hits_data['date'] = pd.to_datetime(
            df['date'].repeat(hit_counts).reset_index(drop=True), format='%Y%m%d'
        )

        # reindex (rather than [] selection) so fields absent from every hit are
        # created as NaN instead of raising KeyError.
        hits_df = hits_data.reindex(columns=columns_to_keep)

        for col in numeric_columns:
            hits_df[col] = pd.to_numeric(hits_df[col], errors='coerce')

        for col in boolean_columns:
            # NaN is truthy, so a plain astype(bool) would mark missing flags True.
            hits_df[col] = hits_df[col].notna() & hits_df[col].astype(bool)

        return hits_df
    except Exception as e:
        logger.error(f"Error in extract_hit_level_data: {str(e)}")
        return pd.DataFrame(columns=columns_to_keep)

def clean_data(df):
    """Clean the session-level data and extract the hit-level table.

    Steps, in order: flatten nested columns, parse ``date``, copy
    ``geoNetwork_country`` to ``country`` when present, coerce dtypes, drop
    columns that are entirely missing, impute (median for numeric columns, most
    frequent value for the others), standardize numeric columns and label-encode
    the non-numeric ones. ``session_id`` and ``hits`` are passed through
    untouched: the first is an identifier, the second is the raw nested payload.

    Args:
        df: Raw session-level DataFrame. It is not modified.

    Returns:
        Tuple ``(df_cleaned, hit_level_df)``: the cleaned session-level frame and
        the result of ``extract_hit_level_data(df)`` on the raw input.
    """
    # Work on a copy: when no nested column is present flatten_nested_columns
    # returns its argument as-is, and the assignments below would then overwrite
    # the caller's frame — including the raw `hits` needed for hit extraction.
    df_cleaned = flatten_nested_columns(df.copy())
    logger.info(f"Flattened DataFrame shape: {df_cleaned.shape}")

    df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], format='%Y%m%d')

    if 'geoNetwork_country' in df_cleaned.columns:
        df_cleaned['country'] = df_cleaned['geoNetwork_country']

    # Columns that must keep their raw values (join key and nested payload).
    passthrough_columns = {'session_id', 'hits'}

    numeric_columns = [
        col for col in df_cleaned.select_dtypes(include=[np.number]).columns
        if col not in passthrough_columns
    ]
    categorical_columns = [
        col for col in df_cleaned.select_dtypes(exclude=[np.number, 'datetime64']).columns
        if col not in passthrough_columns
    ]

    logger.info(f"Number of numeric columns: {len(numeric_columns)}")
    logger.info(f"Number of categorical columns: {len(categorical_columns)}")

    for col in numeric_columns:
        # float64 so nullable-integer columns expose missing values as NaN.
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').astype('float64')
    for col in categorical_columns:
        values = df_cleaned[col]
        # Keep missing entries missing: a blanket astype(str) would turn them into
        # the literal 'nan'/'None', hiding them from the imputer and the
        # all-NaN check below.
        df_cleaned[col] = values.astype(str).where(values.notna(), np.nan)

    inspected = df_cleaned.drop(columns=list(passthrough_columns), errors='ignore')
    all_nan_columns = inspected.columns[inspected.isna().all()].tolist()
    if all_nan_columns:
        logger.warning(f"Columns with all NaN values: {all_nan_columns}")
        df_cleaned = df_cleaned.drop(columns=all_nan_columns)
        numeric_columns = [col for col in numeric_columns if col not in all_nan_columns]
        categorical_columns = [col for col in categorical_columns if col not in all_nan_columns]

    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    if numeric_columns:
        try:
            logger.info(f"Shape of numeric data before imputation: {df_cleaned[numeric_columns].shape}")
            imputed_numeric = numeric_imputer.fit_transform(df_cleaned[numeric_columns])
            logger.info(f"Shape of imputed numeric data: {imputed_numeric.shape}")
            df_cleaned[numeric_columns] = imputed_numeric
            logger.info("Numeric imputation successful")
        except Exception as e:
            logger.error(f"Error during numeric imputation: {str(e)}")
            logger.info("Numeric columns:")
            logger.info(numeric_columns)

    if categorical_columns:
        try:
            df_cleaned[categorical_columns] = categorical_imputer.fit_transform(df_cleaned[categorical_columns])
            logger.info("Categorical imputation successful")
        except Exception as e:
            logger.error(f"Error during categorical imputation: {str(e)}")
            logger.info("Categorical columns:")
            logger.info(categorical_columns)

    # NOTE(review): every numeric column is standardized and every remaining
    # non-numeric column is label-encoded, so later comparisons against raw
    # values (e.g. `> 0`, `== 'mobile'`) no longer carry their original meaning —
    # confirm which columns are really meant to be transformed here.
    if numeric_columns:
        scaler = StandardScaler()
        df_cleaned[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])

    le = LabelEncoder()
    for col in categorical_columns:
        df_cleaned[col] = le.fit_transform(df_cleaned[col].astype(str))

    logger.info(f"Final cleaned session-level DataFrame shape: {df_cleaned.shape}")

    # The raw input is used here, not df_cleaned, so `hits` is still intact.
    hit_level_df = extract_hit_level_data(df)
    logger.info(f"Hit-level DataFrame shape: {hit_level_df.shape}")

    return df_cleaned, hit_level_df



In [None]:
# Feature Engineering
def _quantile_segments(values, labels):
    """Bucket ``values`` (NaN counted as 0) into ``len(labels)`` quantile bins.

    Uses ``pd.qcut`` directly when the quantile edges are distinct. Heavily tied
    data (for example a column that is mostly zero) produces duplicate edges,
    which ``qcut`` rejects with ``ValueError``; in that case the values are
    ranked with ``method='first'`` so every rank is distinct and the bins are
    well defined. Ties are then split by row order.
    """
    filled = values.fillna(0)
    try:
        return pd.qcut(filled, q=len(labels), labels=labels)
    except ValueError:
        return pd.qcut(filled.rank(method='first'), q=len(labels), labels=labels)


def engineer_features(df_cleaned, hit_level_df):
    """Derive modeling features from the session-level and hit-level tables.

    Adds calendar features from ``date``, engagement/device/traffic/geography
    flags, per-session aggregates of the hit-level table (joined on
    ``session_id``), ratio features and two quantile-based segments.

    Args:
        df_cleaned: Session-level DataFrame with a datetime ``date`` column and
            the ``totals_*``, ``device_deviceCategory``, ``trafficSource_medium``
            and ``geoNetwork_country`` columns read below. Not modified.
        hit_level_df: Hit-level DataFrame keyed by ``session_id``. When it is
            empty or has no ``session_id`` column the hit-level features are
            filled with NaN instead of failing.

    Returns:
        A new DataFrame: a copy of ``df_cleaned`` with the engineered columns added.

    Raises:
        KeyError: if ``df_cleaned`` lacks one of the required columns.
        ValueError: from ``pd.qcut`` if a segment cannot be built even after the
            rank-based fallback (e.g. fewer rows than bins).
    """
    logger.info("Starting feature engineering")

    required_columns = [
        'date', 'totals_timeOnSite', 'totals_pageviews', 'totals_bounces',
        'totals_visits', 'device_deviceCategory', 'trafficSource_medium',
        'geoNetwork_country',
    ]
    missing_columns = [col for col in required_columns if col not in df_cleaned.columns]
    if missing_columns:
        # Fail once, up front, with the full list rather than on the first lookup.
        raise KeyError(f"engineer_features is missing required columns: {missing_columns}")

    df_engineered = df_cleaned.copy()

    # Create time-based features
    df_engineered['day_of_week'] = df_engineered['date'].dt.dayofweek
    df_engineered['is_weekend'] = df_engineered['day_of_week'].isin([5, 6]).astype(int)
    df_engineered['month'] = df_engineered['date'].dt.month
    df_engineered['quarter'] = df_engineered['date'].dt.quarter

    # Create engagement features
    # NOTE(review): the `> 0` and `== '<label>'` tests in this function assume raw
    # values. clean_data in this notebook standardizes numeric columns and
    # label-encodes the others, in which case these flags do not mean what their
    # names say — confirm which representation is intended to reach this point.
    df_engineered['session_duration_seconds'] = df_engineered['totals_timeOnSite']
    df_engineered['pageviews_per_session'] = df_engineered['totals_pageviews']
    df_engineered['is_bounce'] = (df_engineered['totals_bounces'] > 0).astype(int)

    # Create device type features
    df_engineered['is_mobile'] = (df_engineered['device_deviceCategory'] == 'mobile').astype(int)
    df_engineered['is_tablet'] = (df_engineered['device_deviceCategory'] == 'tablet').astype(int)
    df_engineered['is_desktop'] = (df_engineered['device_deviceCategory'] == 'desktop').astype(int)

    # Create traffic source features
    df_engineered['is_organic_search'] = (df_engineered['trafficSource_medium'] == 'organic').astype(int)
    df_engineered['is_paid_search'] = (df_engineered['trafficSource_medium'] == 'cpc').astype(int)
    df_engineered['is_referral'] = (df_engineered['trafficSource_medium'] == 'referral').astype(int)

    # Create geographical features
    df_engineered['is_us'] = (df_engineered['geoNetwork_country'] == 'United States').astype(int)

    # Process hit-level data
    logger.info("Processing hit-level data")
    hit_feature_names = [
        'total_hits', 'avg_time_per_hit', 'max_time_per_hit',
        'num_entrance_pages', 'num_exit_pages', 'num_unique_events',
        'num_transactions', 'total_revenue', 'num_unique_products_viewed'
    ]

    if hit_level_df.empty or 'session_id' not in hit_level_df.columns:
        logger.warning("No hit-level data available; hit-level features are set to NaN")
        for feature_name in hit_feature_names:
            df_engineered[feature_name] = np.nan
    else:
        hit_level_features = hit_level_df.groupby('session_id').agg({
            'time': ['count', 'mean', 'max'],
            'isEntrance': 'sum',
            'isExit': 'sum',
            'eventInfo.eventCategory': 'nunique',
            'transaction.transactionId': 'nunique',
            'transaction.transactionRevenue': 'sum',
            'item.productName': 'nunique',
        })
        # Names follow the order of the aggregation spec above, one per output column.
        hit_level_features.columns = hit_feature_names

        df_engineered = df_engineered.merge(hit_level_features, left_on='session_id', right_index=True, how='left')

    # Create ratio features (a zero visit count is replaced by 1 to avoid dividing by zero)
    visits = df_engineered['totals_visits'].replace(0, 1)
    df_engineered['avg_pageviews_per_session'] = df_engineered['pageviews_per_session'] / visits
    df_engineered['conversion_rate'] = df_engineered['num_transactions'] / visits
    df_engineered['avg_revenue_per_session'] = df_engineered['total_revenue'] / visits

    # Create user segments
    df_engineered['user_value_segment'] = _quantile_segments(
        df_engineered['total_revenue'], ['Low', 'Medium', 'High', 'VIP']
    )
    df_engineered['engagement_segment'] = _quantile_segments(
        df_engineered['total_hits'], ['Low', 'Medium', 'High']
    )

    logger.info("Feature engineering completed successfully")
    logger.info(f"Engineered DataFrame shape: {df_engineered.shape}")

    return df_engineered



In [None]:
# Main execution
if __name__ == "__main__":
    # Derive the purchase label from the raw query result, before any cleaning.
    # clean_data median-imputes and standardizes every numeric column, so testing
    # the cleaned `totals_transactions` against 0 would no longer mean
    # "at least one transaction". A missing count is treated as no purchase.
    raw_transactions = pd.to_numeric(df['totals_transactions'], errors='coerce')
    made_purchase = (raw_transactions.fillna(0) > 0).astype(int)

    # Clean the data
    df_cleaned, hit_level_df = clean_data(df)

    # Engineer features
    df_engineered = engineer_features(df_cleaned, hit_level_df)

    # Create target variable: attach the raw-data label by row label and fail
    # loudly if any engineered row has no matching raw row.
    aligned_target = made_purchase.reindex(df_engineered.index)
    if aligned_target.isna().any():
        raise ValueError("made_purchase could not be aligned with the engineered rows")
    df_engineered['made_purchase'] = aligned_target.astype(int).to_numpy()

    # Select features for modeling
    # NOTE(review): 'conversion_rate' and 'avg_revenue_per_session' are computed
    # in engineer_features from the per-session transaction count and revenue,
    # i.e. from the same signal as the target — this looks like target leakage;
    # confirm before training on them.
    features_for_modeling = [
        'session_duration_seconds', 'pageviews_per_session', 'is_bounce',
        'is_mobile', 'is_tablet', 'is_desktop',
        'is_organic_search', 'is_paid_search', 'is_referral',
        'is_us', 'total_hits', 'avg_time_per_hit', 'max_time_per_hit',
        'num_entrance_pages', 'num_exit_pages', 'num_unique_events',
        'avg_pageviews_per_session', 'conversion_rate', 'avg_revenue_per_session',
        'is_weekend', 'month', 'quarter'
    ]

    # Prepare final dataset for modeling
    X = df_engineered[features_for_modeling]
    y = df_engineered['made_purchase']

    logger.info(f"Final dataset shape: X: {X.shape}, y: {y.shape}")
    logger.info("Data preparation completed. Ready for modeling.")

    # Optional: Save the prepared dataset
    X.to_csv('prepared_features.csv', index=False)
    y.to_csv('target_variable.csv', index=False)
    logger.info("Prepared dataset saved to CSV files.")