<a href="https://colab.research.google.com/github/tomomitanaka00/Blog-SQL/blob/main/Revenue_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Colab-only upload step; the result of files.upload() is kept in `uploaded`.
# NOTE(review): the recorded cell output shows the upload being saved under a
# "... (3).json" name, while a later cell loads the service-account key from
# the un-suffixed filename — confirm the intended key file is the one read.
from google.colab import files
uploaded = files.upload()

Saving predictive-behavior-analytics-b509bad93e58.json to predictive-behavior-analytics-b509bad93e58 (3).json


In [25]:
!pip install dask[dataframe]




In [26]:
import gc
import json
import logging
import os
from datetime import datetime, timedelta

import dask.dataframe as dd
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set up logging
# INFO level and above, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set up credentials and BigQuery client
# The service-account key is read from a JSON file addressed by a relative
# path, i.e. it must be present in the current working directory.
# `client` is the module-level handle that fetch_and_process_data queries with.
credentials = service_account.Credentials.from_service_account_file('predictive-behavior-analytics-b509bad93e58.json')
project_id = "predictive-behavior-analytics"
client = bigquery.Client(credentials=credentials, project=project_id)

def process_chunks(df, chunk_size=1000):
    """Lazily clean ``df`` one positional row slice at a time.

    Args:
        df: Frame to process; sliced with ``iloc``.
        chunk_size: Maximum number of rows per slice.

    Yields:
        The result of ``clean_and_engineer_data`` for each consecutive slice,
        in row order.
    """
    slice_starts = range(0, len(df), chunk_size)
    yield from (
        clean_and_engineer_data(df.iloc[begin:begin + chunk_size])
        for begin in slice_starts
    )


def fetch_and_process_data(start_date, end_date, chunk_size=7, output_dir='processed_data'):
    """Fetch GA sample sessions window by window and store a 10% sample of each.

    The date range is walked in consecutive windows of ``chunk_size`` days.
    For every window one BigQuery query is issued through the module-level
    ``client``, a fixed-seed 10% row sample of the result is taken, and the
    sample is written to ``<output_dir>/processed_chunk_<n>.parquet``.

    Args:
        start_date: First day to fetch, formatted ``YYYYMMDD``.
        end_date: Last day to fetch (inclusive), formatted ``YYYYMMDD``.
        chunk_size: Number of days covered by each query window; must be > 0.
        output_dir: Directory receiving the Parquet files; created if missing.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if a date string
            does not match ``%Y%m%d``.
    """
    # A non-positive window never moves ``current_start`` forward, which would
    # keep issuing queries forever; reject it before any I/O happens.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive number of days, got {chunk_size!r}")

    start = datetime.strptime(start_date, '%Y%m%d')
    end = datetime.strptime(end_date, '%Y%m%d')

    os.makedirs(output_dir, exist_ok=True)

    current_start = start
    chunk_number = 1

    while current_start <= end:
        # Clamp the last window so it never runs past ``end``.
        current_end = min(current_start + timedelta(days=chunk_size - 1), end)

        first_suffix = current_start.strftime('%Y%m%d')
        last_suffix = current_end.strftime('%Y%m%d')

        query = f"""
        SELECT
          CONCAT(fullVisitorId, CAST(visitId AS STRING)) AS session_id,
          date,
          totals.timeOnSite,
          totals.pageviews,
          totals.transactions,
          totals.transactionRevenue,  # Ensure this field is included
          trafficSource.source,
          trafficSource.medium,
          device.deviceCategory,
          geoNetwork.country,
          hits
        FROM
          `bigquery-public-data.google_analytics_sample.ga_sessions_*`
        WHERE
          _TABLE_SUFFIX BETWEEN '{first_suffix}' AND '{last_suffix}'
        """

        logger.info(f"Fetching data from {current_start.strftime('%Y-%m-%d')} to {current_end.strftime('%Y-%m-%d')}")

        df_chunk = client.query(query).to_dataframe()
        logger.info(f"Fetched chunk with shape: {df_chunk.shape}")

        # Reduce the dataset to 10% for processing efficiency
        df_chunk = df_chunk.sample(frac=0.1, random_state=42)

        # Save processed chunk to disk
        output_file = os.path.join(output_dir, f'processed_chunk_{chunk_number}.parquet')
        df_chunk.to_parquet(output_file)
        logger.info(f"Saved processed chunk to {output_file}")

        # Release the chunk before the next window is fetched so that at most
        # one window's worth of rows is held in memory at a time.
        del df_chunk
        gc.collect()

        current_start = current_end + timedelta(days=1)
        chunk_number += 1

def prepare_data_for_modeling(df):
    """Split ``df`` into a feature frame and a log-revenue target.

    NOTE(review): a later cell defines a function with this same name; once
    that cell has run, that definition replaces this one.

    Args:
        df: pandas or Dask DataFrame containing ``totals_transactionRevenue``
            and optionally ``log_revenue``.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is ``df`` without the two
        revenue columns, ``y`` is the ``log_revenue`` column and
        ``feature_names`` lists the columns of ``X``. Dask inputs are
        materialised to pandas.

    Raises:
        KeyError: If ``totals_transactionRevenue`` is missing.

    Side effects:
        When ``log_revenue`` is absent it is added to ``df`` in place, and
        zeros in ``totals_transactionRevenue`` are replaced by ``1e-6``.
    """
    logger.info("Preparing data for revenue prediction...")

    # Check if necessary columns exist
    if 'totals_transactionRevenue' not in df.columns:
        logger.error("The required 'totals_transactionRevenue' column is missing from the dataset.")
        raise KeyError("Required 'totals_transactionRevenue' column not found.")

    if 'log_revenue' not in df.columns:
        logger.warning("The 'log_revenue' column is missing. Attempting to create it.")
        df['totals_transactionRevenue'] = df['totals_transactionRevenue'].replace(0, 1e-6)
        df['log_revenue'] = np.log1p(df['totals_transactionRevenue'])

    y = df['log_revenue']
    X = df.drop(['log_revenue', 'totals_transactionRevenue'], axis=1)

    # Only lazy (Dask) collections need computing; pandas objects have no
    # ``compute`` method and are used as they are.
    if not isinstance(X, pd.DataFrame):
        X = X.compute()
    if not isinstance(y, pd.Series):
        y = y.compute()

    return X, y, list(X.columns)



# Function to load and combine processed chunks using Dask
def load_and_combine_chunks(directory='processed_data'):
    """Open every ``*.parquet`` file under ``directory`` with ``dd.read_parquet``.

    Args:
        directory: Folder holding the Parquet chunk files.

    Returns:
        Whatever ``dd.read_parquet`` returns for the ``<directory>/*.parquet``
        glob pattern.
    """
    parquet_pattern = os.path.join(directory, '*.parquet')
    return dd.read_parquet(parquet_pattern)

def safe_json_loads(x):
    """Decode ``x`` as JSON when it is a string; pass other values through.

    Relies on the module-level ``import json``.

    Args:
        x: Any value; only ``str`` instances are parsed.

    Returns:
        The decoded JSON value for a valid JSON string, an empty dict for a
        string that is not valid JSON, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    try:
        return json.loads(x)
    except json.JSONDecodeError:
        # Malformed JSON is treated as "no fields" instead of failing the row.
        return {}

def flatten_nested_columns(df):
    """Expand the known nested columns of ``df`` into flat ``<col>_<field>`` columns.

    For each of ``totals``, ``trafficSource``, ``device`` and ``geoNetwork``
    that is present, every cell is passed through ``safe_json_loads`` and the
    resulting records are normalised into columns that replace the original.

    Args:
        df: pandas DataFrame; it is not modified, a new frame is returned when
            any column is flattened.

    Returns:
        The frame with nested columns replaced, keeping the row index of ``df``.
        A column whose flattening fails is left as it was and a warning is
        logged.
    """
    nested_columns = ['totals', 'trafficSource', 'device', 'geoNetwork']
    for col in nested_columns:
        if col not in df.columns:
            continue
        try:
            parsed = df[col].apply(safe_json_loads)
            # Cells that are not dicts (e.g. missing values) contribute no
            # fields instead of aborting the flattening of the whole column.
            records = [cell if isinstance(cell, dict) else {} for cell in parsed]
            flattened = pd.json_normalize(records)
            flattened.columns = [f'{col}_{subcol}' for subcol in flattened.columns]
            # The normalised frame is numbered 0..n-1; give it the source index
            # so the column-wise concat lines rows up one-to-one rather than
            # aligning on mismatched labels.
            flattened.index = df.index
            df = pd.concat([df.drop(col, axis=1), flattened], axis=1)
        except Exception as e:
            # Best effort: keep the pipeline running and report the column.
            logger.warning(f"Error flattening column {col}: {str(e)}")
    return df

def optimize_dtypes(df):
    """Shrink column dtypes of ``df`` in place and return it.

    * ``object`` columns become ``category`` when their values allow it.
    * ``float64`` columns become ``float32``.
    * ``int64`` columns become ``int32`` only when every value fits the int32
      range; otherwise they stay ``int64`` so no value is corrupted by
      overflow.

    Args:
        df: pandas DataFrame; modified in place.

    Returns:
        The same ``df`` object.
    """
    int32_limits = np.iinfo(np.int32)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'object':
            try:
                df[col] = df[col].astype('category')
            except TypeError:
                # Values that cannot be turned into categories are left as-is.
                pass
        elif col_dtype == 'float64':
            # Convert the column to float32 directly
            df[col] = df[col].astype('float32')
        elif col_dtype == 'int64':
            values = df[col]
            # An empty column has no min/max; it can always be downcast.
            fits_int32 = values.empty or (
                values.min() >= int32_limits.min and values.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = values.astype('int32')
    return df




In [27]:

def clean_and_engineer_data(df):
    """Clean one frame of session rows and add calendar features.

    Steps, in order: flatten nested columns, drop columns holding lists or
    arrays, downcast dtypes, parse ``date`` (``%Y%m%d``), drop all-NaN
    columns, impute missing values, and derive ``day_of_week``,
    ``is_weekend``, ``month`` and ``quarter`` from ``date``.

    Args:
        df: pandas DataFrame that contains a ``date`` column.

    Returns:
        The cleaned frame with the four calendar columns appended.

    Raises:
        KeyError: If ``date`` is missing.
    """
    logger.info("Starting data cleaning and feature engineering...")

    # Flatten any nested columns
    df_cleaned = flatten_nested_columns(df)
    logger.info(f"Flattened DataFrame shape: {df_cleaned.shape}")

    # Identify columns with complex data types (arrays, lists, etc.).
    # Only the first 100 rows of each column are inspected.
    complex_columns = []
    for col in df_cleaned.columns:
        sample = df_cleaned[col].head(100)
        if sample.apply(lambda x: isinstance(x, (list, np.ndarray))).any():
            logger.warning(f"Column '{col}' contains complex data types. Dropping this column.")
            complex_columns.append(col)

    # Drop complex columns
    df_cleaned = df_cleaned.drop(columns=complex_columns)

    # Optimize data types to reduce memory usage
    df_cleaned = optimize_dtypes(df_cleaned)

    df_cleaned['date'] = pd.to_datetime(df_cleaned['date'], format='%Y%m%d')
    numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
    categorical_columns = df_cleaned.select_dtypes(exclude=[np.number, 'datetime64']).columns

    # Handle all-NaN columns: they have neither a mean nor a mode to impute with.
    all_nan_columns = df_cleaned.columns[df_cleaned.isna().all()].tolist()
    if all_nan_columns:
        df_cleaned = df_cleaned.drop(columns=all_nan_columns)
        numeric_columns = [col for col in numeric_columns if col not in all_nan_columns]
        categorical_columns = [col for col in categorical_columns if col not in all_nan_columns]

    # Imputation
    # NOTE(review): the mean/mode are computed from the rows passed to this
    # call only, and every numeric column is imputed, including any revenue
    # column — confirm that this is intended.
    for col in numeric_columns:
        df_cleaned[col] = df_cleaned[col].astype('float32')  # Ensure the column is float
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())  # Fill NaN with mean

    for col in categorical_columns:
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mode().iloc[0])

    # Feature engineering (dayofweek: Monday=0 ... Sunday=6, so 5/6 = weekend)
    df_cleaned['day_of_week'] = df_cleaned['date'].dt.dayofweek
    df_cleaned['is_weekend'] = df_cleaned['day_of_week'].isin([5, 6]).astype(int)
    df_cleaned['month'] = df_cleaned['date'].dt.month
    df_cleaned['quarter'] = df_cleaned['date'].dt.quarter

    logger.info(f"Final cleaned and engineered DataFrame shape: {df_cleaned.shape}")
    return df_cleaned

In [28]:
# Names used by prepare_data_for_modeling below.
# NOTE(review): each of these is already imported by the earlier setup cell;
# presumably repeated so this cell can be run on its own — confirm.
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
import logging

# Rebinds the module-level `logger` name used throughout the notebook.
logger = logging.getLogger(__name__)

def prepare_data_for_modeling(df):
    """Build the feature matrix and log-revenue target, then select features.

    Numeric and object/category columns of ``df`` (minus identifiers and
    revenue columns) are used as candidate features. Missing numeric values
    are filled with the column mean, missing categorical values with the most
    frequent value, categoricals are integer-encoded, and ``RFECV`` with a
    30-tree random forest picks the final feature subset.

    Args:
        df: pandas or Dask DataFrame. Must contain ``log_revenue`` or a raw
            revenue column from which it can be derived.

    Returns:
        ``(X_selected, y, selected_columns)``: the selected feature matrix as
        returned by ``RFECV.fit_transform``, the ``log_revenue`` target and
        the names of the retained features.

    Raises:
        ValueError: If no candidate feature column remains.
        KeyError: If neither ``log_revenue`` nor a raw revenue column exists.

    Side effects:
        When ``log_revenue`` is absent it is added to ``df`` in place, and
        zeros in the raw revenue column are replaced by ``1e-6``.
    """
    logger.info("Preparing data for revenue prediction...")

    # Define potential feature columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Exclude identifiers and every revenue column so the target cannot leak
    # into the features. The fetch query in this file selects
    # `totals.transactionRevenue` without an alias, so the column may reach
    # this function without the `totals_` prefix; both spellings are covered.
    exclude_columns = {
        'session_id', 'date', 'log_revenue',
        'totals_transactionRevenue', 'totals_totalTransactionRevenue',
        'transactionRevenue', 'totalTransactionRevenue',
    }
    numeric_features = [col for col in numeric_columns if col not in exclude_columns]
    categorical_features = [col for col in categorical_columns if col not in exclude_columns]
    feature_columns = numeric_features + categorical_features

    logger.info(f"Using the following features: {feature_columns}")

    if not feature_columns:
        raise ValueError("No valid features found in the DataFrame")

    X = df[feature_columns]

    # Check if the 'log_revenue' column exists
    if 'log_revenue' not in df.columns:
        logger.warning("The column 'log_revenue' is missing from the DataFrame. Attempting to create it.")
        revenue_column = next(
            (name for name in ('totals_transactionRevenue', 'transactionRevenue') if name in df.columns),
            None,
        )
        if revenue_column is None:
            logger.error("Both 'log_revenue' and 'totals_transactionRevenue' columns are missing. Cannot proceed with modeling.")
            raise KeyError("Required revenue columns not found.")
        df[revenue_column] = df[revenue_column].replace(0, 1e-6)
        df['log_revenue'] = np.log1p(df[revenue_column])

    y = df['log_revenue']

    # Materialise lazy (Dask) inputs; pandas objects have no ``compute`` and
    # are copied instead so the imputation below never writes into ``df``.
    X = X.copy() if isinstance(X, pd.DataFrame) else X.compute()
    if not isinstance(y, pd.Series):
        y = y.compute()

    # Handle missing values — only for columns that are actually in X.
    for col in numeric_features:
        X[col] = X[col].fillna(X[col].mean())

    for col in categorical_features:
        top_values = X[col].value_counts().nlargest(1)
        if not top_values.empty:  # an all-missing column has no most-common value
            X[col] = X[col].fillna(top_values.index[0])
        # Encode categorical variables as integer codes (missing -> -1).
        X[col] = pd.Categorical(X[col]).codes

    # Recursive feature elimination with cross-validation chooses how many
    # features to keep.
    rfe_selector = RFECV(estimator=RandomForestRegressor(n_estimators=30, random_state=42),
                         step=1, cv=3, scoring='neg_mean_squared_error')
    X_selected = rfe_selector.fit_transform(X, y)

    # Update feature_columns to reflect only the selected features
    selected_columns = pd.Series(feature_columns)[rfe_selector.get_support()].tolist()

    logger.info(f"Final feature matrix shape: {X_selected.shape}")

    return X_selected, y, selected_columns

In [29]:
def train_and_evaluate_random_forest(X, y):
    """Tune a random forest with a randomized search and score it on a hold-out set.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        dict with keys ``model`` (best estimator found), ``mse``, ``rmse``,
        ``mae`` and ``r2``, all measured on the 20% hold-out split.
    """
    logger.info("Training and evaluating Random Forest model...")

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Candidate hyperparameter values sampled by the search.
    search_space = {
        'n_estimators': [30, 40, 50],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # 10 sampled configurations, 5-fold cross-validation each.
    tuner = RandomizedSearchCV(
        RandomForestRegressor(random_state=42),
        param_distributions=search_space,
        n_iter=10,
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=2,
    )
    tuner.fit(X_train, y_train)

    champion = tuner.best_estimator_
    holdout_predictions = champion.predict(X_test)

    # Hold-out metrics.
    mse = mean_squared_error(y_test, holdout_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, holdout_predictions)
    r2 = r2_score(y_test, holdout_predictions)

    logger.info(f"Random Forest - RMSE: {rmse:.3f}, R²: {r2:.3f}")

    return {
        'model': champion,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2
    }

In [None]:
# Imports for the training / entry-point cell.
# NOTE(review): every name except `joblib` is also imported by earlier cells.
import logging
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Set up logging
# Same level and format string as the logging setup in the earlier setup cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def train_and_evaluate_random_forest(X, y):
    """Fit a randomized-search-tuned random forest and report hold-out metrics.

    This redefines the function of the same name from the previous cell.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        dict holding the best estimator under ``model`` and its hold-out
        ``mse``, ``rmse``, ``mae`` and ``r2``.
    """
    logger.info("Training and evaluating Random Forest model...")

    # Keep 20% of the rows aside for evaluation (seeded split).
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    base_forest = RandomForestRegressor(random_state=42)

    # Grid of values the randomized search draws from.
    candidate_params = {
        'n_estimators': [30, 40, 50],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    search_options = dict(n_iter=10, cv=5, random_state=42, n_jobs=-1, verbose=2)

    searcher = RandomizedSearchCV(base_forest, param_distributions=candidate_params, **search_options)
    searcher.fit(X_train, y_train)

    # Evaluate the best configuration on the held-out rows.
    tuned_forest = searcher.best_estimator_
    predicted = tuned_forest.predict(X_test)

    mse = mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    results = dict(model=tuned_forest, mse=mse, rmse=rmse, mae=mae, r2=r2)

    logger.info(f"Random Forest - RMSE: {rmse:.3f}, R²: {r2:.3f}")

    return results

# Main execution
# End-to-end run: fetch -> load -> clean -> prepare features -> train -> save.
if __name__ == "__main__":
    # Date range in YYYYMMDD form, both ends inclusive.
    start_date = '20160801'
    end_date = '20170731'
    chunk_size = 1  # Fetch 1 day of data at a time to reduce memory usage

    # One query and one Parquet file per window; `chunk_size` here is a number
    # of days (third positional parameter of fetch_and_process_data).
    logger.info("Starting data fetch and processing...")
    fetch_and_process_data(start_date, end_date, chunk_size)
    logger.info("Finished fetching and processing all data.")

    logger.info("Loading and processing data using Dask...")
    ddf = load_and_combine_chunks()  # Load all chunks at once with Dask

    # Convert Dask DataFrame to pandas DataFrame
    df = ddf.compute()

    # Process data in chunks
    # process_chunks is called with its default chunk_size (rows, not days).
    processed_chunks = list(process_chunks(df))
    final_df = pd.concat(processed_chunks)

    # Optionally, save the processed data
    final_df.to_parquet('cleaned_engineered_data.parquet', index=False)

    # Prepare data for modeling
    # NOTE(review): final_df is a pandas DataFrame at this point; make sure
    # prepare_data_for_modeling accepts pandas input and not only Dask.
    X, y, selected_columns = prepare_data_for_modeling(final_df)

    # Train and evaluate Random Forest model
    results = train_and_evaluate_random_forest(X, y)

    # Print final results
    logger.info("Random Forest Model Results:")
    logger.info(f"RMSE: {results['rmse']:.3f}")
    logger.info(f"MAE: {results['mae']:.3f}")
    logger.info(f"R²: {results['r2']:.3f}")

    # Optionally, you can save the model
    joblib.dump(results['model'], 'random_forest_model.joblib')
    logger.info("Random Forest model saved as 'random_forest_model.joblib'")

    logger.info("Revenue prediction analysis with Random Forest completed successfully.")