In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import numpy as np
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up credentials
credentials = service_account.Credentials.from_service_account_file(
    '/Users/ttanaka/Downloads/predictive-behavior-analytics-b509bad93e58.json'
)

# Set the correct project ID
project_id = "predictive-behavior-analytics"

# Log the project ID being used
logger.info(f"Using project ID: {project_id}")

# Create a BigQuery client
client = bigquery.Client(credentials=credentials, project=project_id)

# Your BigQuery query
query = """
SELECT
  *
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
LIMIT 2500
"""

# Execute the query and load data into a DataFrame
logger.info("Executing BigQuery and loading data...")
df = client.query(query).to_dataframe()
logger.info(f"Data loaded. Shape: {df.shape}")

import json

# Print information about the DataFrame
logger.info("\nDataFrame info:")
df.info()

# Print memory usage
logger.info(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


INFO:__main__:Using project ID: predictive-behavior-analytics
INFO:__main__:Executing BigQuery and loading data...
I0000 00:00:1723995332.202424 1987039 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
INFO:__main__:Data loaded. Shape: (2500, 16)
INFO:__main__:
DataFrame info:
INFO:__main__:
Memory usage: 6.97 MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   visitorId             0 non-null      Int64 
 1   visitNumber           2500 non-null   Int64 
 2   visitId               2500 non-null   Int64 
 3   visitStartTime        2500 non-null   Int64 
 4   date                  2500 non-null   object
 5   totals                2500 non-null   object
 6   trafficSource         2500 non-null   object
 7   device                2500 non-null   object
 8   geoNetwork            2500 non-null   object
 9   customDimensions      2500 non-null   object
 10  hits                  2500 non-null   object
 11  fullVisitorId         2500 non-null   object
 12  userId                0 non-null      object
 13  clientId              0 non-null      object
 14  channelGrouping       2500 non-null   object
 15  socialEngagementType  2500 non-null   

In [3]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import logging
import json
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up credentials
credentials = service_account.Credentials.from_service_account_file(
    '/Users/ttanaka/Downloads/predictive-behavior-analytics-b509bad93e58.json'
)

# Set the correct project ID
project_id = "predictive-behavior-analytics"

# Log the project ID being used
logger.info(f"Using project ID: {project_id}")

# Create a BigQuery client
client = bigquery.Client(credentials=credentials, project=project_id)

# Your BigQuery query
query = """
SELECT
  *
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
LIMIT 10000
"""


INFO:__main__:Using project ID: predictive-behavior-analytics


In [4]:
# Data Cleaning functions
def safe_json_loads(x):
    try:
        return json.loads(x) if isinstance(x, str) else x
    except json.JSONDecodeError:
        return {}

def flatten_nested_columns(df):
    nested_columns = ['totals', 'trafficSource', 'device', 'geoNetwork']
    for col in nested_columns:
        if col in df.columns:
            try:
                flattened = pd.json_normalize(df[col].apply(safe_json_loads))
                flattened.columns = [f'{col}_{subcol}' for subcol in flattened.columns]
                df = pd.concat([df.drop(col, axis=1), flattened], axis=1)
            except Exception as e:
                logger.warning(f"Error flattening column {col}: {str(e)}")
    return df

def extract_hit_level_data(df):
    try:
        hits_data = df.apply(lambda row: pd.json_normalize(safe_json_loads(row['hits'])), axis=1).explode().reset_index()
        hits_data = hits_data.rename(columns={'index': 'session_id'})
        
        for col in ['fullVisitorId', 'visitId', 'date']:
            hits_data[col] = df[col].repeat(df['hits'].apply(lambda x: len(safe_json_loads(x)))).reset_index(drop=True)
        
        hits_data['date'] = pd.to_datetime(hits_data['date'], format='%Y%m%d')
        
        columns_to_keep = [
            'session_id', 'fullVisitorId', 'visitId', 'date',
            'hitNumber', 'time', 'hour', 'minute',
            'isEntrance', 'isExit', 'page.pagePath', 'page.pageTitle',
            'eventInfo.eventCategory', 'eventInfo.eventAction', 'eventInfo.eventLabel',
            'transaction.transactionId', 'transaction.transactionRevenue',
            'item.productName', 'item.productCategory', 'item.productSKU', 'item.itemRevenue'
        ]
        
        hits_df = hits_data[columns_to_keep].copy()
        
        numeric_columns = ['time', 'hour', 'minute', 'transaction.transactionRevenue', 'item.itemRevenue']
        for col in numeric_columns:
            if col in hits_df.columns:
                hits_df[col] = pd.to_numeric(hits_df[col], errors='coerce')
        
        boolean_columns = ['isEntrance', 'isExit']
        for col in boolean_columns:
            if col in hits_df.columns:
                hits_df[col] = hits_df[col].astype(bool)
        
        return hits_df
    except Exception as e:
        logger.error(f"Error in extract_hit_level_data: {str(e)}")
        return pd.DataFrame()

def clean_data(df):
    df_cleaned = flatten_nested_columns(df)
    logger.info(f"Flattened DataFrame shape: {df_cleaned.shape}")

    if 'visitorId' in df_cleaned.columns and df_cleaned['visitorId'].isna().all():
        df_cleaned = df_cleaned.drop('visitorId', axis=1)
        logger.info("Dropped empty 'visitorId' column")

    df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], format='%Y%m%d')

    if 'geoNetwork_country' in df_cleaned.columns:
        df_cleaned['country'] = df_cleaned['geoNetwork_country']

    numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
    categorical_columns = df_cleaned.select_dtypes(exclude=[np.number, 'datetime64']).columns

    logger.info(f"Number of numeric columns: {len(numeric_columns)}")
    logger.info(f"Number of categorical columns: {len(categorical_columns)}")

    for col in df_cleaned.columns:
        if col in numeric_columns:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
        elif col in categorical_columns:
            df_cleaned[col] = df_cleaned[col].astype(str)

    all_nan_columns = df_cleaned.columns[df_cleaned.isna().all()].tolist()
    if all_nan_columns:
        logger.warning(f"Columns with all NaN values: {all_nan_columns}")
        df_cleaned = df_cleaned.drop(columns=all_nan_columns)
        numeric_columns = [col for col in numeric_columns if col not in all_nan_columns]
        categorical_columns = [col for col in categorical_columns if col not in all_nan_columns]

    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    try:
        logger.info(f"Shape of numeric data before imputation: {df_cleaned[numeric_columns].shape}")
        imputed_numeric = numeric_imputer.fit_transform(df_cleaned[numeric_columns])
        logger.info(f"Shape of imputed numeric data: {imputed_numeric.shape}")
        df_cleaned[numeric_columns] = imputed_numeric
        logger.info("Numeric imputation successful")
    except Exception as e:
        logger.error(f"Error during numeric imputation: {str(e)}")
        logger.info("Numeric columns:")
        logger.info(numeric_columns.tolist())

    try:
        df_cleaned[categorical_columns] = categorical_imputer.fit_transform(df_cleaned[categorical_columns])
        logger.info("Categorical imputation successful")
    except Exception as e:
        logger.error(f"Error during categorical imputation: {str(e)}")
        logger.info("Categorical columns:")
        logger.info(categorical_columns.tolist())

    scaler = StandardScaler()
    df_cleaned[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])

    le = LabelEncoder()
    for col in categorical_columns:
        df_cleaned[col] = le.fit_transform(df_cleaned[col].astype(str))

    logger.info(f"Final cleaned session-level DataFrame shape: {df_cleaned.shape}")

    hit_level_df = extract_hit_level_data(df)
    logger.info(f"Hit-level DataFrame shape: {hit_level_df.shape}")

    return df_cleaned, hit_level_df



In [5]:
# Feature Engineering function (corrected)
def engineer_features(df_cleaned, hit_level_df):
    logger.info("Starting feature engineering")
    
    required_columns = ['date', 'totals_timeOnSite', 'totals_pageviews', 'totals_bounces', 'device_deviceCategory', 'trafficSource_medium', 'geoNetwork_country', 'totals_visits']
    if not all(col in df_cleaned.columns for col in required_columns):
        missing_cols = [col for col in required_columns if col not in df_cleaned.columns]
        logger.error(f"Missing required columns in df_cleaned: {missing_cols}")
        raise ValueError(f"Missing required columns in df_cleaned: {missing_cols}")

    df_engineered = df_cleaned.copy()
    
    try:
        df_engineered['day_of_week'] = df_engineered['date'].dt.dayofweek
        df_engineered['is_weekend'] = df_engineered['day_of_week'].isin([5, 6]).astype(int)
        df_engineered['month'] = df_engineered['date'].dt.month
        df_engineered['quarter'] = df_engineered['date'].dt.quarter
        
        df_engineered['session_duration_seconds'] = df_engineered['totals_timeOnSite']
        df_engineered['pageviews_per_session'] = df_engineered['totals_pageviews']
        df_engineered['is_bounce'] = (df_engineered['totals_bounces'] > 0).astype(int)
        
        df_engineered['is_mobile'] = (df_engineered['device_deviceCategory'] == 'mobile').astype(int)
        df_engineered['is_tablet'] = (df_engineered['device_deviceCategory'] == 'tablet').astype(int)
        df_engineered['is_desktop'] = (df_engineered['device_deviceCategory'] == 'desktop').astype(int)
        
        df_engineered['is_organic_search'] = (df_engineered['trafficSource_medium'] == 'organic').astype(int)
        df_engineered['is_paid_search'] = (df_engineered['trafficSource_medium'] == 'cpc').astype(int)
        df_engineered['is_referral'] = (df_engineered['trafficSource_medium'] == 'referral').astype(int)
        
        df_engineered['is_us'] = (df_engineered['geoNetwork_country'] == 'United States').astype(int)
        
        logger.info("Processing hit-level data")
        hit_level_features = hit_level_df.groupby('fullVisitorId').agg({
            'time': ['count', 'mean', 'max'],
            'isEntrance': 'sum',
            'isExit': 'sum',
            'eventInfo.eventCategory': 'nunique',
            'transaction.transactionId': 'nunique',
            'transaction.transactionRevenue': 'sum',
            'item.productName': 'nunique',
        })
        
        hit_level_features.columns = [
            'total_hits', 'avg_time_per_hit', 'max_time_per_hit',
            'num_entrance_pages', 'num_exit_pages', 'num_unique_events',
            'num_transactions', 'total_revenue', 'num_unique_products_viewed'
        ]
        
        df_engineered = df_engineered.merge(hit_level_features, left_on='fullVisitorId', right_index=True, how='left')
        
        df_engineered['avg_pageviews_per_session'] = df_engineered['pageviews_per_session'] / df_engineered['totals_visits'].replace(0, 1)
        df_engineered['conversion_rate'] = df_engineered['num_transactions'] / df_engineered['totals_visits'].replace(0, 1)
        df_engineered['avg_revenue_per_session'] = df_engineered['total_revenue'] / df_engineered['totals_visits'].replace(0, 1)
        
        df_engineered['user_value_segment'] = pd.qcut(df_engineered['total_revenue'].fillna(0), q=4, labels=['Low', 'Medium', 'High', 'VIP'])
        df_engineered['engagement_segment'] = pd.qcut(df_engineered['total_hits'].fillna(0), q=3, labels=['Low', 'Medium', 'High'])
        
        logger.info("Feature engineering completed successfully")
        
        # Save the engineered DataFrame to a CSV file
        save_path = "df_engineered.csv"
        df_engineered.to_csv(save_path, index=False)
        logger.info(f"Engineered DataFrame saved to {save_path}")
        
        return df_engineered
    
    except Exception as e:
        logger.error(f"Error during feature engineering: {str(e)}")
        raise

# Usage example:
# df_cleaned, hit_level_df = clean_data(df)  # Assuming these functions are already defined
# df_engineered = engineer_features(df_cleaned, hit_level_df)


In [6]:
try:
    save_path = "df_engineered.csv"
    df_engineered.to_csv(save_path, index=False)
    logger.info(f"Engineered DataFrame saved to {save_path}")
except Exception as e:
    logger.error(f"Failed to save the DataFrame: {str(e)}")
    raise


ERROR:__main__:Failed to save the DataFrame: name 'df_engineered' is not defined


NameError: name 'df_engineered' is not defined

In [None]:
# Optimize data types
df_cleaned = optimize_dtypes(df_cleaned)
hit_level_df = optimize_dtypes(hit_level_df)

# Log the final memory usage
logger.info(f"Optimized session-level DataFrame memory usage: {df_cleaned.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
logger.info(f"Optimized hit-level DataFrame memory usage: {hit_level_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


In [None]:
# Feature Selection function
def correlation_analysis(df, threshold=0.8):
    corr_matrix = df.corr(method='spearman')
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
    plt.title('Feature Correlation Heatmap')
    plt.show()
    
    high_corr_vars = np.where(np.abs(corr_matrix) > threshold)
    high_corr_vars = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr_vars) if x != y and x < y]
    
    return high_corr_vars



In [None]:
# Main execution
try:
    # Execute the query
    df = client.query(query).to_dataframe()
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    print("Original dataframe columns:")
    print(df.columns)

    # Data Cleaning
    df_cleaned, hit_level_df = clean_data(df)
    print("\nCleaned dataframe columns:")
    print(df_cleaned.columns)

    # Feature Engineering
    print("\nStarting feature engineering...")
    df_engineered = engineer_features(df_cleaned, hit_level_df)
    print("Feature engineering completed.")
    print("\nEngineered dataframe columns:")
    print(df_engineered.columns)

    # Feature Selection
    print("\nStarting correlation analysis...")
    high_corr_features = correlation_analysis(df_engineered)
    print("Correlation analysis completed.")
    print("\nHighly correlated feature pairs:")
    for feat1, feat2 in high_corr_features:
        print(f"{feat1} - {feat2}")

except Exception as e:
    print(f"An error occurred during the process: {str(e)}")
    import traceback
    traceback.print_exc()

# Main execution
try:
    # Data Cleaning
    df_cleaned, hit_level_df = clean_data(df)
    print("\nCleaned dataframe columns:")
    print(df_cleaned.columns)

    # Feature Engineering
    print("\nStarting feature engineering...")
    df_engineered = engineer_features(df_cleaned, hit_level_df)
    print("Feature engineering completed.")
    print("\nEngineered dataframe columns:")
    print(df_engineered.columns)

    # Feature Selection
    print("\nStarting correlation analysis...")
    high_corr_features = correlation_analysis(df_engineered)
    print("Correlation analysis completed.")
    print("\nHighly correlated feature pairs:")
    for feat1, feat2 in high_corr_features:
        print(f"{feat1} - {feat2}")

except Exception as e:
    print(f"An error occurred during the process: {str(e)}")
    import traceback
    traceback.print_exc()