### IMPORT REQUIRED MODULES

In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score, RepeatedKFold, LeaveOneOut
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### DATA CLEANING

#### MERGE ALL DATA

In [3]:
def merge_csv_files(directory_path):
    # Get all CSV files in the directory
    csv_files = list(Path(directory_path).glob('*.csv'))
    
    # Initialize list to store dataframes
    dfs = []
    
    # Read each CSV file and add filename as a column
    for file in csv_files:
        df = pd.read_csv(file, header=0)
        filename = file.stem  # Get filename without extension
        df['Event Code'] = filename
        dfs.append(df)
    
    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    # Filter out "Booker not confirmed" rows
    merged_df = merged_df[merged_df['Attendee Status'] != 'Booker not attending']
    
    # Export merged dataframe
    output_path = os.path.join("/workspaces/collaborative_app_dev/Data/Cleaned Data", 'merged_data.csv')
    merged_df.to_csv(output_path, index=False)
    
    return output_path

In [4]:
# Usage
directory = "/workspaces/collaborative_app_dev/Data/Raw Data"
output_file = merge_csv_files(directory)
df = pd.read_csv("/workspaces/collaborative_app_dev/Data/Cleaned Data/merged_data.csv")
print(len(df))

7417


#### GROUP TICKETS BY DATE CREATED BASED ON EVENT

In [5]:
def create_event_dates_dict():
    # Dictionary mapping event codes to their dates and target audience
    return {
        'D19': {'date': '2019-11-19', 'audience': 'IT Managers'},
        'D21': {'date': '2021-12-09', 'audience': 'IT Managers'},
        'D24': {'date': '2024-10-03', 'audience': 'IT Managers'},
        'GP21': {'date': '2021-04-22', 'audience': 'Property Managers'},
        'GP24': {'date': '2024-09-11', 'audience': 'Property Managers'},
        'MSE21': {'date': '2021-03-24', 'audience': 'Education property managers'},
        'NP21': {'date': '2021-11-09', 'audience': 'Property Managers'},
        'NP24': {'date': '2024-11-06', 'audience': 'Property Managers'},
        'SRM22': {'date': '2022-06-15', 'audience': 'Education Managers'},
        'SRM23': {'date': '2023-06-08', 'audience': 'Education Managers'}
    }

In [6]:
def analyze_registrations(merged_csv_path):
    # Read the merged CSV
    df = pd.read_csv(merged_csv_path)
    
    # Convert date_created to datetime if it's not already
    df['Created Date'] = pd.to_datetime(df['Created Date'])
    
    # Group by date and event (Event Code) and count registrations
    registration_counts = df.groupby([
        df['Created Date'].dt.date,
        'Event Code'
    ]).size().reset_index(name='registration_count')
    
    # Sort by date and event
    registration_counts = registration_counts.sort_values(['Created Date', 'Event Code'])

#-----------------------------------------------------------------------------------
    # Calculate cumulative registrations for each event separately
    registration_counts['cumulative_registrations'] = registration_counts.groupby('Event Code')['registration_count'].cumsum()
    

    # Add event dates and target audience
    event_dates = create_event_dates_dict()
    
    # Add event date and target audience columns
    registration_counts['Event date'] = registration_counts['Event Code'].map(
        {k: pd.to_datetime(v['date']) for k, v in event_dates.items()}
    )

    # Remove registrations after event date
    registration_counts = registration_counts[registration_counts['Created Date'] <= registration_counts['Event date']]
    
    registration_counts['Target audience'] = registration_counts['Event Code'].map(
        {k: v['audience'] for k, v in event_dates.items()}
    )

    # Calculate days until event
    registration_counts['Days until event'] = (
        registration_counts['Event date'] - pd.to_datetime(registration_counts['Created Date'])
    ).dt.days


    # Sort by date and event for final output
    registration_counts = registration_counts.sort_values(['Created Date', 'Event Code'])
#--------------------------------------------------------------------------------------


    # Detect promotional spikes
    mean_daily = registration_counts['registration_count'].mean()
    std_daily = registration_counts['registration_count'].std()
    registration_counts['promotional_spike'] = (registration_counts['registration_count'] > 
                                        (mean_daily + 2 * std_daily)).astype(int)
    
    # ---------------------------------------------------------------------------------------

    
    # Export the analysis
    analysis_path = os.path.join(os.path.dirname("/workspaces/collaborative_app_dev/Data/Cleaned Data/"), 'complete_registration_analysis.csv')
    registration_counts.to_csv(analysis_path, index=False)
    
    return analysis_path

In [7]:
analysis_path = analyze_registrations("/workspaces/collaborative_app_dev/Data/Cleaned Data/merged_data.csv")

df = pd.read_csv(analysis_path)
df.head(10)

  df['Created Date'] = pd.to_datetime(df['Created Date'])


Unnamed: 0,Created Date,Event Code,registration_count,cumulative_registrations,Event date,Target audience,Days until event,promotional_spike
0,2019-07-16,D19,125,125,2019-11-19,IT Managers,126,1
1,2019-07-30,D19,118,243,2019-11-19,IT Managers,112,1
2,2019-07-31,D19,19,262,2019-11-19,IT Managers,111,0
3,2019-08-01,D19,2,264,2019-11-19,IT Managers,110,0
4,2019-08-02,D19,1,265,2019-11-19,IT Managers,109,0
5,2019-08-05,D19,5,270,2019-11-19,IT Managers,106,0
6,2019-08-06,D19,2,272,2019-11-19,IT Managers,105,0
7,2019-08-07,D19,3,275,2019-11-19,IT Managers,104,0
8,2019-08-08,D19,5,280,2019-11-19,IT Managers,103,0
9,2019-08-09,D19,19,299,2019-11-19,IT Managers,102,0


### FEATURE ENGINEERING AND EXTRACTION

In [8]:
data = pd.read_csv("/workspaces/collaborative_app_dev/Data/Cleaned Data/complete_registration_analysis.csv")

In [9]:
data.head()
#len(data)

Unnamed: 0,Created Date,Event Code,registration_count,cumulative_registrations,Event date,Target audience,Days until event,promotional_spike
0,2019-07-16,D19,125,125,2019-11-19,IT Managers,126,1
1,2019-07-30,D19,118,243,2019-11-19,IT Managers,112,1
2,2019-07-31,D19,19,262,2019-11-19,IT Managers,111,0
3,2019-08-01,D19,2,264,2019-11-19,IT Managers,110,0
4,2019-08-02,D19,1,265,2019-11-19,IT Managers,109,0


In [10]:
current_date = date.today()
print(type(pd.to_datetime(current_date)))


diff = (pd.to_datetime(current_date) - pd.to_datetime(data['Created Date'][34])).days
print(diff)

days = (pd.to_datetime(data['Event date'][1]) - pd.to_datetime(current_date)).days
print(days)

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
1965
-1912


In [11]:
def extract_train_features(df):
    """Extract features with enhanced pattern detection"""
    features = []
    targets = []

    for event_code in df['Event Code'].unique():
        event_df = df[df['Event Code'] == event_code].copy()
        
        registrations_at_point = event_df
        
        if len(registrations_at_point) == 0:
            continue

        # Enhanced feature extraction
        recent_registrations = registrations_at_point.tail(7)['registration_count'].mean()
        early_registrations = registrations_at_point.head(7)['registration_count'].mean()

        #--------------------------------------------------------------------------------------------
        # Calculate days since last spike
        spike_mask = registrations_at_point['promotional_spike'] == 1
        if any(spike_mask):
            # Get the most recent spike's days until event
            last_spike_days = registrations_at_point[spike_mask]['Days until event'].max()
            # Calculate the difference between current point and last spike
            days_since_spike = registrations_at_point['Days until event'].max() - last_spike_days
        else:
            # If no spikes, use 0 to indicate no prior spikes have occurred
            days_since_spike = 0
        #---------------------------------------------------------------------------------------------


        feature_dict = {
            'current_registrations': registrations_at_point['cumulative_registrations'].max(),
            'avg_daily_rate': registrations_at_point['registration_count'].mean(),
            'recent_velocity': recent_registrations,
            'early_velocity': early_registrations,
            'registration_acceleration': (recent_registrations - early_registrations) / 7,
            'days_active': len(registrations_at_point),
            'peak_daily_registrations': registrations_at_point['registration_count'].max(),
            'registration_volatility': registrations_at_point['registration_count'].std(),
            #-----------------------------------------------------------------------------
            'spike_count': registrations_at_point['promotional_spike'].sum(),
            'days_since_last_spike': days_since_spike,
            'event_code': registrations_at_point['Event Code'].unique()[0],
            'target_audience': registrations_at_point['Target audience'].unique()[0],
            #'registration_start_date': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]),
            #'event_date': pd.to_datetime(registrations_at_point['Event date'].unique()[0]),
            #---------------------------------------------------------------
            'reg_start_day': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]).day,
            'reg_start_month': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]).month,
            'reg_start_year': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]).year,
            'reg_start_weekday': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]).weekday(),
            'reg_start_week_of_the_year': pd.to_datetime(registrations_at_point['Created Date'].unique()[0]).week,
            'event_day': pd.to_datetime(registrations_at_point['Event date'].unique()[0]).day,
            'event_month': pd.to_datetime(registrations_at_point['Event date'].unique()[0]).month,
            'event_year': pd.to_datetime(registrations_at_point['Event date'].unique()[0]).year,
            'event_weekday': pd.to_datetime(registrations_at_point['Event date'].unique()[0]).weekday(),
            'event_week_of_the_year': pd.to_datetime(registrations_at_point['Event date'].unique()[0]).week,
            'duration': (pd.to_datetime(registrations_at_point['Event date'].unique()[0]) - pd.to_datetime(registrations_at_point['Created Date'].unique()[0])).days
            }


        features.append(feature_dict)
        targets.append(event_df['cumulative_registrations'].max())

    return pd.DataFrame(features), np.array(targets)


In [12]:
features, target = extract_train_features(df=data)
features.head(10)

Unnamed: 0,current_registrations,avg_daily_rate,recent_velocity,early_velocity,registration_acceleration,days_active,peak_daily_registrations,registration_volatility,spike_count,days_since_last_spike,...,reg_start_month,reg_start_year,reg_start_weekday,reg_start_week_of_the_year,event_day,event_month,event_year,event_weekday,event_week_of_the_year,duration
0,1172,16.507042,49.714286,38.857143,1.55102,71,125,24.387862,8,0,...,7,2019,1,29,19,11,2019,1,47,126
1,798,9.614458,15.285714,14.714286,0.081633,83,72,12.679762,2,258,...,2,2020,3,6,22,4,2021,3,16,441
2,395,4.593023,7.714286,11.571429,-0.55102,86,53,7.286374,1,8,...,9,2020,2,36,9,11,2021,1,45,433
3,669,7.77907,25.428571,1.142857,3.469388,86,68,10.29266,1,399,...,11,2020,0,45,9,12,2021,3,49,402
4,1593,28.963636,23.714286,19.857143,0.55102,55,135,32.753831,11,0,...,1,2021,1,3,24,3,2021,2,12,64
5,940,12.876712,14.571429,22.857143,-1.183673,73,157,23.122827,5,0,...,3,2022,2,11,15,6,2022,2,24,91
6,699,9.708333,8.142857,18.714286,-1.510204,72,92,15.805541,3,31,...,1,2023,0,5,8,6,2023,3,23,129
7,391,6.015385,13.285714,3.285714,1.428571,65,22,5.260384,0,0,...,1,2024,4,1,3,10,2024,3,40,272
8,317,4.731343,4.857143,11.285714,-0.918367,67,36,6.2367,0,0,...,4,2024,0,16,11,9,2024,2,37,149
9,311,4.936508,5.285714,5.571429,-0.040816,63,18,4.219284,0,0,...,4,2024,3,17,6,11,2024,2,45,195


In [13]:
print(target)

[1172  798  395  669 1593  940  699  391  317  311]


### TRAINING

#### LABEL ENCODING FOR CATEGORICAL COLUMNS

In [14]:
# LABEL ENCODER
le = LabelEncoder()

#Categorical Features: event_code, target_audience
# features['event_code'] = le.fit_transform(features['event_code'])
# features['target_audience'] = le.fit_transform(features['target_audience'])

features = pd.get_dummies(data=features, columns=['event_code', 'target_audience'], dtype=int)


In [15]:
features.head(10)

Unnamed: 0,current_registrations,avg_daily_rate,recent_velocity,early_velocity,registration_acceleration,days_active,peak_daily_registrations,registration_volatility,spike_count,days_since_last_spike,...,event_code_GP24,event_code_MSE21,event_code_NP21,event_code_NP24,event_code_SRM22,event_code_SRM23,target_audience_Education Managers,target_audience_Education property managers,target_audience_IT Managers,target_audience_Property Managers
0,1172,16.507042,49.714286,38.857143,1.55102,71,125,24.387862,8,0,...,0,0,0,0,0,0,0,0,1,0
1,798,9.614458,15.285714,14.714286,0.081633,83,72,12.679762,2,258,...,0,0,0,0,0,0,0,0,0,1
2,395,4.593023,7.714286,11.571429,-0.55102,86,53,7.286374,1,8,...,0,0,1,0,0,0,0,0,0,1
3,669,7.77907,25.428571,1.142857,3.469388,86,68,10.29266,1,399,...,0,0,0,0,0,0,0,0,1,0
4,1593,28.963636,23.714286,19.857143,0.55102,55,135,32.753831,11,0,...,0,1,0,0,0,0,0,1,0,0
5,940,12.876712,14.571429,22.857143,-1.183673,73,157,23.122827,5,0,...,0,0,0,0,1,0,1,0,0,0
6,699,9.708333,8.142857,18.714286,-1.510204,72,92,15.805541,3,31,...,0,0,0,0,0,1,1,0,0,0
7,391,6.015385,13.285714,3.285714,1.428571,65,22,5.260384,0,0,...,0,0,0,0,0,0,0,0,1,0
8,317,4.731343,4.857143,11.285714,-0.918367,67,36,6.2367,0,0,...,1,0,0,0,0,0,0,0,0,1
9,311,4.936508,5.285714,5.571429,-0.040816,63,18,4.219284,0,0,...,0,0,0,1,0,0,0,0,0,1


In [16]:
'''
X = data.drop('charges', axis=1)
y = data.loc[:, 'charges']
'''

"\nX = data.drop('charges', axis=1)\ny = data.loc[:, 'charges']\n"

In [17]:
features = features.drop('current_registrations', axis=1)

In [18]:
features.head(10)

Unnamed: 0,avg_daily_rate,recent_velocity,early_velocity,registration_acceleration,days_active,peak_daily_registrations,registration_volatility,spike_count,days_since_last_spike,reg_start_day,...,event_code_GP24,event_code_MSE21,event_code_NP21,event_code_NP24,event_code_SRM22,event_code_SRM23,target_audience_Education Managers,target_audience_Education property managers,target_audience_IT Managers,target_audience_Property Managers
0,16.507042,49.714286,38.857143,1.55102,71,125,24.387862,8,0,16,...,0,0,0,0,0,0,0,0,1,0
1,9.614458,15.285714,14.714286,0.081633,83,72,12.679762,2,258,6,...,0,0,0,0,0,0,0,0,0,1
2,4.593023,7.714286,11.571429,-0.55102,86,53,7.286374,1,8,2,...,0,0,1,0,0,0,0,0,0,1
3,7.77907,25.428571,1.142857,3.469388,86,68,10.29266,1,399,2,...,0,0,0,0,0,0,0,0,1,0
4,28.963636,23.714286,19.857143,0.55102,55,135,32.753831,11,0,19,...,0,1,0,0,0,0,0,1,0,0
5,12.876712,14.571429,22.857143,-1.183673,73,157,23.122827,5,0,16,...,0,0,0,0,1,0,1,0,0,0
6,9.708333,8.142857,18.714286,-1.510204,72,92,15.805541,3,31,30,...,0,0,0,0,0,1,1,0,0,0
7,6.015385,13.285714,3.285714,1.428571,65,22,5.260384,0,0,5,...,0,0,0,0,0,0,0,0,1,0
8,4.731343,4.857143,11.285714,-0.918367,67,36,6.2367,0,0,15,...,1,0,0,0,0,0,0,0,0,1
9,4.936508,5.285714,5.571429,-0.040816,63,18,4.219284,0,0,25,...,0,0,0,1,0,0,0,0,0,1


In [19]:
print(target)

[1172  798  395  669 1593  940  699  391  317  311]


#### Splitting the data into training and test datasets.

In [20]:
features_train, features_val, target_train, target_val = train_test_split(features, target, test_size=0.2)

In [21]:
features_val.head(10)

Unnamed: 0,avg_daily_rate,recent_velocity,early_velocity,registration_acceleration,days_active,peak_daily_registrations,registration_volatility,spike_count,days_since_last_spike,reg_start_day,...,event_code_GP24,event_code_MSE21,event_code_NP21,event_code_NP24,event_code_SRM22,event_code_SRM23,target_audience_Education Managers,target_audience_Education property managers,target_audience_IT Managers,target_audience_Property Managers
9,4.936508,5.285714,5.571429,-0.040816,63,18,4.219284,0,0,25,...,0,0,0,1,0,0,0,0,0,1
1,9.614458,15.285714,14.714286,0.081633,83,72,12.679762,2,258,6,...,0,0,0,0,0,0,0,0,0,1


#### Standard Scaling

In [22]:
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)
features_val_scaled = scaler.transform(features_val)

#### Model Selection

In [23]:
linear_reg_model = LinearRegression()
forest_model = RandomForestRegressor()
boost_model = GradientBoostingRegressor()

linear_reg_model.fit(features_train_scaled, target_train)
forest_model.fit(features_train_scaled, target_train)
boost_model.fit(features_train_scaled, target_train)

#### Choosing the Best Performing Model using Cross validation

In [27]:
models_scores = ['Linear Regression', 'Random Forest', 'Gradient Boosting']
models = [linear_reg_model, forest_model, boost_model]

index = 0

# Leave-One-Out Cross-Validation (LOOCV)
cv_l = LeaveOneOut()
cv = RepeatedKFold(n_splits=3, n_repeats=6, random_state=1)

for model in models_scores:
    score = cross_val_score(models[index], features_train_scaled, target_train,
                                           scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    score = np.absolute(score)
    print(f"{model} Regressor Model Mean MAE: {score.mean():.3f}.")
    index += 1

Linear Regression Regressor Model Mean MAE: 199.496.
Random Forest Regressor Model Mean MAE: 280.756.
Gradient Boosting Regressor Model Mean MAE: 238.858.


#### Validation

In [28]:
linear_reg_model_pred = linear_reg_model.predict(features_val_scaled)
forest_model_pred = forest_model.predict(features_val_scaled)
boost_model_pred = boost_model.predict(features_val_scaled)

In [29]:
n = 0
predictions = [linear_reg_model_pred, forest_model_pred, boost_model_pred]

for model in models_scores:
    rmse = np.sqrt(mean_squared_error(target_val, predictions[n]))
    r2score = models[n].score(features_val_scaled, target_val)
    print(f"\n{model}:\nrmse = {rmse}\nr2_score = {r2score}")
    
    n += 1


Linear Regression:
rmse = 25.541171162861286
r2_score = 0.988997694903287

Random Forest:
rmse = 115.0544707953585
r2_score = 0.7767409526540145

Gradient Boosting:
rmse = 39.623283442553515
r2_score = 0.9735209139344697


**1. Linear Regression:**
MAE: 186.558 (lower is better)
RMSE: 115.23 (lower is better)
R²: 0.86 (higher is better)

**2. Random Forest Regressor:**
MAE: 266.479 (higher than Linear Regression)
RMSE: 100.19 (lower than Linear Regression)
R²: 0.90 (higher than Linear Regression)

**3. Gradient Boosting Regressor:**
MAE: 224.400 (better than Random Forest, worse than Linear Regression)
RMSE: 151.61 (highest, worse performance)
R²: 0.76 (lowest)
<br>

**Evaluation Summary:**
- R² score is a key metric for model fit, and Random Forest Regressor has the highest R² (0.90), meaning it explains the most variance in the data.

- MAE is also important as it measures how far off the model's predictions are in absolute terms. Here, Linear Regression has the lowest MAE, but the difference is not huge.

- RMSE reflects the model's error in terms of squared differences, and Random Forest Regressor has the lowest RMSE, indicating better precision in its predictions.

**Conclusion:**<br>
Random Forest Regressor appears to be the best model overall because it strikes a good balance between MAE, RMSE, and R². It has the best R² (showing it explains the most variance) and lowest RMSE, even though its MAE is slightly worse than Linear Regression's.

Linear Regression does well in terms of MAE, but its R² is lower than Random Forest, and it does not perform as well on RMSE.

Gradient Boosting performs worst overall, especially with its highest RMSE and lowest R².

### Leave-One-Out Cross-Validation (LOOCV)

***
<p>The dataset only contains 9 rows, it's a small sample size, and a typical train-test split like 70-30 or 80-20 might not work well due to limited data. In such cases, a more suitable approach would be Leave-One-Out Cross-Validation (LOOCV), which uses each row as a test case and the remaining rows for training. This helps to maximize the use of the data for both training and testing.<p>

```python
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd

# Load your dataset
data = pd.read_csv("your_dataset.csv")  # Replace with your dataset path

# Define your features (X) and target (y)
X = data.drop("target_column_name", axis=1)  # Replace target_column_name with the name of your target column
y = data["target_column_name"]  # Replace with your actual target column

# Initialize the LOOCV and models
loocv = LeaveOneOut()
rf_model = RandomForestRegressor(random_state=1)
gb_model = GradientBoostingRegressor(random_state=1)

# Variables to store performance metrics
rf_mae = []
rf_rmse = []
rf_r2 = []

gb_mae = []
gb_rmse = []
gb_r2 = []

# Loop over each train-test split
for train_idx, test_idx in loocv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Train Random Forest model
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_mae.append(mean_absolute_error(y_test, rf_pred))
    rf_rmse.append(np.sqrt(mean_squared_error(y_test, rf_pred)))
    rf_r2.append(r2_score(y_test, rf_pred))

    # Train Gradient Boosting model
    gb_model.fit(X_train, y_train)
    gb_pred = gb_model.predict(X_test)
    gb_mae.append(mean_absolute_error(y_test, gb_pred))
    gb_rmse.append(np.sqrt(mean_squared_error(y_test, gb_pred)))
    gb_r2.append(r2_score(y_test, gb_pred))

# Calculate average metrics for each model
print("Random Forest Regressor:")
print(f"Mean MAE: {np.mean(rf_mae)}")
print(f"Mean RMSE: {np.mean(rf_rmse)}")
print(f"Mean R2: {np.mean(rf_r2)}")

print("\nGradient Boosting Regressor:")
print(f"Mean MAE: {np.mean(gb_mae)}")
print(f"Mean RMSE: {np.mean(gb_rmse)}")
print(f"Mean R2: {np.mean(gb_r2)}")
```





### PREDICTION

#### EXTRACT FEATURES FOR PREDICTION, NEEDS DATE AS INPUT

In [43]:
def extract_features(df, event_date, date_at_point):
    """Extract features with enhanced pattern detection"""

    #days = (pd.to_datetime(df['Event date'][0]) - pd.to_datetime(date_at_point)).days
    days = (pd.to_datetime(event_date) - pd.to_datetime(date_at_point)).days
    features = []

    if days < 0:
        print("Enter a date on or before the Event date")

    else:
        for event_code in df['Event Code'].unique():
            event_df = df[df['Event Code'] == event_code].copy()
            
            if event_df['Days until event'].max() < days:
                continue
            
            registrations_at_point = event_df[event_df['Days until event'] >= days]
            
            if len(registrations_at_point) == 0:
                continue

            # Enhanced feature extraction
            recent_registrations = registrations_at_point.tail(7)['registration_count'].mean()
            early_registrations = registrations_at_point.head(7)['registration_count'].mean()

            #--------------------------------------------------------------------------------------------
            # Calculate days since last spike
            spike_mask = registrations_at_point['promotional_spike'] == 1
            if any(spike_mask):
                # Get the most recent spike's days until event
                last_spike_days = registrations_at_point[spike_mask]['Days until event'].max()
                # Calculate the difference between current point and last spike
                days_since_spike = registrations_at_point['Days until event'].max() - last_spike_days
            else:
                # If no spikes, use 0 to indicate no prior spikes have occurred
                days_since_spike = 0
            #---------------------------------------------------------------------------------------------


            feature_dict = {
                'current_registrations': registrations_at_point['cumulative_registrations'].max(),
                'avg_daily_rate': registrations_at_point['registration_count'].mean(),
                'recent_velocity': recent_registrations,
                'early_velocity': early_registrations,
                'registration_acceleration': (recent_registrations - early_registrations) / 7,
                'days_active': len(registrations_at_point),
                'peak_daily_registrations': registrations_at_point['registration_count'].max(),
                'registration_volatility': registrations_at_point['registration_count'].std(),
                'spike_count': registrations_at_point['promotional_spike'].sum(),
                'days_since_last_spike': days_since_spike,
                'event_code': registrations_at_point['Event Code'].unique()[0],
                'target_audience': registrations_at_point['Target audience'].unique()[0],
                'registration_start_date': registrations_at_point['Created Date'].unique()[0],
                'event_date': registrations_at_point['Event date'].unique()[0]
                }


            features.append(feature_dict)

        return pd.DataFrame(features)
