In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn

In [32]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from datetime import datetime

### DATA CLEANING

#### MERGE ALL DATA

In [33]:
def merge_csv_files(directory_path):
    # Get all CSV files in the directory
    csv_files = list(Path(directory_path).glob('*.csv'))
    
    # Initialize list to store dataframes
    dfs = []
    
    # Read each CSV file and add filename as a column
    for file in csv_files:
        df = pd.read_csv(file, header=0)
        filename = file.stem  # Get filename without extension
        df['Event Code'] = filename
        dfs.append(df)
    
    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    # Filter out "Booker not confirmed" rows
    merged_df = merged_df[merged_df['Attendee Status'] != 'Booker not attending']
    
    # Export merged dataframe
    output_path = os.path.join("/workspaces/collaborative_app_dev/Data/Cleaned Data", 'merged_data.csv')
    merged_df.to_csv(output_path, index=False)
    
    return output_path

In [None]:
# Usage
directory = "/workspaces/collaborative_app_dev/Data/Raw Data"
output_file = merge_csv_files(directory)

df = pd.read_csv("/workspaces/collaborative_app_dev/Data/Cleaned Data/merged_data.csv")
print(len(df))

#### GROUP TICKETS BY DATE CREATED BASED ON EVENT

In [35]:
def create_event_dates_dict():
    # Dictionary mapping event codes to their dates and target audience
    return {
        'D19': {'date': '2019-11-19', 'audience': 'IT Managers'},
        'D21': {'date': '2021-12-09', 'audience': 'IT Managers'},
        'D24': {'date': '2024-10-03', 'audience': 'IT Managers'},
        'GP21': {'date': '2021-04-22', 'audience': 'Property Managers'},
        'GP24': {'date': '2024-09-11', 'audience': 'Property Managers'},
        'MSE21': {'date': '2021-03-24', 'audience': 'Education property managers'},
        'NP21': {'date': '2021-11-09', 'audience': 'Property Managers'},
        'NP24': {'date': '2024-11-06', 'audience': 'Property Managers'},
        'SRM22': {'date': '2022-06-15', 'audience': 'Education Managers'},
        'SRM23': {'date': '2023-06-08', 'audience': 'Education Managers'}
    }

In [36]:
def analyze_registrations(merged_csv_path):
    # Read the merged CSV
    df = pd.read_csv(merged_csv_path)
    
    # Convert date_created to datetime if it's not already
    df['Created Date'] = pd.to_datetime(df['Created Date'])
    
    # Group by date and event (Event Code) and count registrations
    registration_counts = df.groupby([
        df['Created Date'].dt.date,
        'Event Code'
    ]).size().reset_index(name='registration_count')
    
    # Sort by date and event
    registration_counts = registration_counts.sort_values(['Created Date', 'Event Code'])

#-----------------------------------------------------------------------------------
    # Calculate cumulative registrations for each event separately
    registration_counts['cumulative_registrations'] = registration_counts.groupby('Event Code')['registration_count'].cumsum()
    

    # Add event dates and target audience
    event_dates = create_event_dates_dict()
    
    # Add event date and target audience columns
    registration_counts['Event date'] = registration_counts['Event Code'].map(
        {k: pd.to_datetime(v['date']) for k, v in event_dates.items()}
    )

    # Remove registrations after event date
    registration_counts = registration_counts[registration_counts['Created Date'] <= registration_counts['Event date']]
    
    registration_counts['Target audience'] = registration_counts['Event Code'].map(
        {k: v['audience'] for k, v in event_dates.items()}
    )

    # Calculate days until event
    registration_counts['Days until event'] = (
        registration_counts['Event date'] - pd.to_datetime(registration_counts['Created Date'])
    ).dt.days


    # Sort by date and event for final output
    registration_counts = registration_counts.sort_values(['Created Date', 'Event Code'])
#--------------------------------------------------------------------------------------


    # Detect promotional spikes
    mean_daily = registration_counts['registration_count'].mean()
    std_daily = registration_counts['registration_count'].std()
    registration_counts['promotional_spike'] = (registration_counts['registration_count'] > 
                                        (mean_daily + 2 * std_daily)).astype(int)
    
    # ---------------------------------------------------------------------------------------

    
    # Export the analysis
    analysis_path = os.path.join(os.path.dirname("/workspaces/collaborative_app_dev/Data/Cleaned Data/"), 'complete_registration_analysis.csv')
    registration_counts.to_csv(analysis_path, index=False)
    
    return analysis_path

In [None]:
analysis_path = analyze_registrations("/workspaces/collaborative_app_dev/Data/Cleaned Data/merged_data.csv")

df = pd.read_csv(analysis_path)
df.head(10)

### FEATURE ENGINEERING AND EXTRACTION

In [38]:
data = pd.read_csv("/workspaces/collaborative_app_dev/Data/Cleaned Data/complete_registration_analysis.csv")

In [None]:
data.head()

In [None]:
data['Event date']

In [70]:
def extract_features(df):
    """Extract features with enhanced pattern detection"""
    # days = week_number * 7 #SHOULD BE EVENT DATE - CURRENT DATE

    # Calculate days until event
    days = (df['Event date'] - pd.to_datetime(datetime.now())).dt.days

    features = []
    targets = []

    for event_code in df['Event Code'].unique():
        event_df = df[df['Event Code'] == event_code].copy()
        #event_df['days_from_start'] = (event_df['date'] - event_df['date'].min()).dt.days
        
        if event_df['Days until event'].max() < days:
            continue
        
        #### LOOK AT THIS!!!!
        registrations_at_point = event_df[event_df['Days until event'] >= days]
        
        if len(registrations_at_point) == 0:
            continue


        # Enhanced feature extraction
        recent_registrations = registrations_at_point.tail(7)['registration_count'].mean()
        early_registrations = registrations_at_point.head(7)['registration_count'].mean()

        #--------------------------------------------------------------------------------------------
        # Calculate days since last spike
        spike_mask = registrations_at_point['promotional_spike'] == 1
        if any(spike_mask):
            # Get the most recent spike's days until event
            last_spike_days = registrations_at_point[spike_mask]['Days until event'].max()
            # Calculate the difference between current point and last spike
            days_since_spike = registrations_at_point['Days until event'].max() - last_spike_days
        else:
            # If no spikes, use 0 to indicate no prior spikes have occurred
            days_since_spike = 0
        #---------------------------------------------------------------------------------------------


        feature_dict = {
            'current_registrations': registrations_at_point['cumulative_registrations'].max(),
            'avg_daily_rate': registrations_at_point['registration_count'].mean(),
            'recent_velocity': recent_registrations,
            'early_velocity': early_registrations,
            'registration_acceleration': (recent_registrations - early_registrations) / 7,
            'days_active': len(registrations_at_point),
            'peak_daily_registrations': registrations_at_point['registration_count'].max(),
            'registration_volatility': registrations_at_point['registration_count'].std(),
            #-----------------------------------------------------------------------------
            'spike_count': registrations_at_point['promotional_spike'].sum(),
            'days_since_last_spike': days_since_spike,
            'event_start_date': registrations_at_point['Event date'],
            'target_audience': registrations_at_point['Target audience'],
            'event_code': registrations_at_point['Event Code'],
            }


        features.append(feature_dict)
        targets.append(event_df['cumulative_registrations'].max())

    return pd.DataFrame(features), np.array(targets)

In [71]:
current_date = pd.to_datetime('today')

features, targets = extract_features(df=data)

  days = (df['Event date'] - df['Created Date']).dt.days


TypeError: unsupported operand type(s) for -: 'Timestamp' and 'str'

In [None]:
features.head(10)

In [None]:
print(targets)

### TRAINING

### PREDICTION