In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

In [3]:
df = pd.read_csv('before_ts.csv')
df['date'] = pd.to_datetime(df['date']).dt.to_period('M').dt.to_timestamp()
df = df.drop_duplicates(subset=['company', 'date'])
df[df.duplicated(['company', 'date'])]

Unnamed: 0,key,company,date,industry,stage,country,event_type,quarter,is_vc_backed,is_profitable,...,macroeconomic_flag,industry_downturn_flag,geopolitical_flag,regulatory_pressure_flag,total_workforce_est,final_count,final_percentage,funds_raised,quater,reason_category


In [4]:
df.head()

Unnamed: 0,key,company,date,industry,stage,country,event_type,quarter,is_vc_backed,is_profitable,...,macroeconomic_flag,industry_downturn_flag,geopolitical_flag,regulatory_pressure_flag,total_workforce_est,final_count,final_percentage,funds_raised,quater,reason_category
0,#Paid2023/01/27,#Paid,2023-01-01,Marketing,Series B,Canada,Layoff,Q1-2023,1.0,0.0,...,1.0,0.0,0.0,0.0,,19.0,0.17,3.044523,Q1,Economic Conditions
1,&Open2022/11/17,&Open,2022-11-01,Marketing,Series A,Ireland,Layoff,Q4-2022,1.0,0.0,...,0.0,0.0,0.0,0.0,,9.0,0.09,3.555348,Q4,Cost Cutting
2,100 Thieves2022/07/13,100 Thieves,2022-07-01,Consumer,Series C,United States,Layoff,Q3-2022,1.0,0.0,...,0.0,0.0,0.0,0.0,,13.5,0.0,4.787492,Q3,Restructuring
3,100 Thieves2023/01/10,100 Thieves,2023-01-01,Retail,Series C,United States,Layoff,Q1-2023,1.0,0.0,...,0.0,1.0,0.0,0.0,200.0,15.0,0.075,4.787492,Q1,Financial Distress
4,10X Genomics2022/08/04,10X Genomics,2022-08-01,Healthcare,Post-IPO,United States,Layoff,Q3-2022,0.0,0.0,...,1.0,0.0,0.0,0.0,1250.0,100.0,0.08,5.488938,Q3,Economic Conditions


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4192 entries, 0 to 4231
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   key                           4192 non-null   object        
 1   company                       4192 non-null   object        
 2   date                          4192 non-null   datetime64[ns]
 3   industry                      4190 non-null   object        
 4   stage                         4187 non-null   object        
 5   country                       4190 non-null   object        
 6   event_type                    4113 non-null   object        
 7   quarter                       4113 non-null   object        
 8   is_vc_backed                  4106 non-null   float64       
 9   is_profitable                 4050 non-null   float64       
 10  is_ai_pivot                   4113 non-null   float64       
 11  business_model                4107 

In [6]:
companies = df['company'].unique()

full_dates = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='MS')

full_index = pd.MultiIndex.from_product([companies, full_dates], names=['company', 'date'])

full_index_df = (full_index.to_frame(index=False).rename(columns={0: 'company', 1: 'date'}))
full_index_df['date'] = pd.to_datetime(full_index_df['date'])

In [7]:
full_dates

DatetimeIndex(['2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
               '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
               '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01',
               '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
               '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01',
               '2021-11-01', '2021-12-01', '2022-01-01', '2022-02-01',
               '2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01',
               '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01',
               '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01',
               '2023-03-01', '2023-04-01', '2023-05-01', '2023-06-01',
               '2023-07-01', '2023-08-01', '2023-09-01', '2023-10-01',
               '2023-11-01', '2023-12-01', '2024-01-01', '2024-02-01',
               '2024-03-01', '2024-04-01', '2024-05-01', '2024-06-01',
               '2024-07-01', '2024-08-01', '2024-09-01', '2024-10-01',
      

In [8]:
len(companies)*len(full_dates)

199500

In [9]:
len(full_index)

199500

In [10]:
full_index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199500 entries, 0 to 199499
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   company  199500 non-null  object        
 1   date     199500 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 3.0+ MB


In [11]:
company_static_cols = [
    'industry', 'stage', 'country', 'is_vc_backed', 'is_profitable', 'is_ai_pivot', 'business_model', 'total_workforce_est', 'funds_raised']


event_flag_cols = [ 
    # temporal / narrative orientation 
    'forward_looking_flag', 'backward_looking_flag', 'timing_unclear_flag', 'imminent_layoff_flag', 'long_horizon_flag', 
    'uncertainty_flag', 'confirmed_flag',
    # corporate actions 
    'acquisition_flag', 'private_equity_flag', 'post_acquisition_layoff_flag', 'restructuring_flag', 'cost_cutting_flag', 'strategy_shift_flag',
    # operational / strategic causes 
    'automation_ai_flag', 'product_exit_flag', 'market_exit_flag', 'offshoring_flag', 'management_change_flag',
    # workforce dynamics 
    'employee_unrest_flag', 'attrition_flag', 'repeat_layoff_flag', 'department_specific_flag', 'senior_role_impact_flag', 'junior_role_impact_flag',
    # financial stress 
    'revenue_decline_flag', 'profitability_pressure_flag', 'runway_issue_flag',
    # macro / external 
    'macroeconomic_flag', 'industry_downturn_flag', 'geopolitical_flag', 'regulatory_pressure_flag'
]

event_numeric_cols = ['final_count', 'final_percentage']

event_metadata_cols = ['event_type', 'quater', 'reason_category']


In [12]:
company_snapshot = (
    df
    .sort_values('date')
    .groupby('company', as_index=False)
    [company_static_cols]
    .last()
)

base_df = full_index_df.copy()

base_df = base_df.merge(
    company_snapshot,
    on='company',
    how='left'
)

In [13]:
base_df

Unnamed: 0,company,date,industry,stage,country,is_vc_backed,is_profitable,is_ai_pivot,business_model,total_workforce_est,funds_raised
0,#Paid,2020-03-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,3.044523
1,#Paid,2020-04-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,3.044523
2,#Paid,2020-05-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,3.044523
3,#Paid,2020-06-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,3.044523
4,#Paid,2020-07-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,3.044523
...,...,...,...,...,...,...,...,...,...,...,...
199495,xAI,2025-08-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,10.030120
199496,xAI,2025-09-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,10.030120
199497,xAI,2025-10-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,10.030120
199498,xAI,2025-11-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,10.030120


In [14]:
event_cols = (['company', 'date'] + event_flag_cols + event_numeric_cols + event_metadata_cols)

event_df = df[event_cols]

base_df = base_df.merge(event_df, on=['company', 'date'], how='left')
#base_df[event_flag_cols] = base_df[event_flag_cols].astype('float')

In [15]:
base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199500 entries, 0 to 199499
Data columns (total 47 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   company                       199500 non-null  object        
 1   date                          199500 non-null  datetime64[ns]
 2   industry                      199360 non-null  object        
 3   stage                         199220 non-null  object        
 4   country                       199360 non-null  object        
 5   is_vc_backed                  195090 non-null  float64       
 6   is_profitable                 192290 non-null  float64       
 7   is_ai_pivot                   195510 non-null  float64       
 8   business_model                195160 non-null  object        
 9   total_workforce_est           122990 non-null  float64       
 10  funds_raised                  199500 non-null  float64       
 11  forward_looki

In [16]:
base_df

Unnamed: 0,company,date,industry,stage,country,is_vc_backed,is_profitable,is_ai_pivot,business_model,total_workforce_est,...,runway_issue_flag,macroeconomic_flag,industry_downturn_flag,geopolitical_flag,regulatory_pressure_flag,final_count,final_percentage,event_type,quater,reason_category
0,#Paid,2020-03-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,...,,,,,,,,,,
1,#Paid,2020-04-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,...,,,,,,,,,,
2,#Paid,2020-05-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,...,,,,,,,,,,
3,#Paid,2020-06-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,...,,,,,,,,,,
4,#Paid,2020-07-01,Marketing,Series B,Canada,1.0,0.0,0.0,B2B,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199495,xAI,2025-08-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,...,,,,,,,,,,
199496,xAI,2025-09-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,...,0.0,0.0,0.0,0.0,0.0,500.0,0.165,Layoff,Q3,AI Pivot
199497,xAI,2025-10-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,...,,,,,,,,,,
199498,xAI,2025-11-01,AI,Unknown,United States,1.0,0.0,1.0,B2B,1500.0,...,,,,,,,,,,


In [17]:
# print(len(base_df), len(full_index))
# should match full_index row count
assert len(base_df) == len(full_index)

# flags should be sparse
base_df[event_flag_cols].count().sum()


127499

In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# --- 1. CONFIGURATION ---
HALF_LIFE = {
    # ... your dictionary here ...
    'forward_looking_flag': 3,
    'backward_looking_flag': 1,
    'timing_unclear_flag': 2,
    'imminent_layoff_flag': 1,
    'long_horizon_flag': 6,
    'uncertainty_flag': 2,
    'confirmed_flag': 1,
    'acquisition_flag': 6,
    'private_equity_flag': 6,
    'post_acquisition_layoff_flag': 3,
    'restructuring_flag': 4,
    'cost_cutting_flag': 3,
    'strategy_shift_flag': 4,
    'automation_ai_flag': 4,
    'product_exit_flag': 4,
    'market_exit_flag': 4,
    'offshoring_flag': 3,
    'management_change_flag': 3,
    'employee_unrest_flag': 1,
    'attrition_flag': 2,
    'repeat_layoff_flag': 6,
    'department_specific_flag': 2,
    'senior_role_impact_flag': 2,
    'junior_role_impact_flag': 2,
    'revenue_decline_flag': 6,
    'profitability_pressure_flag': 6,
    'runway_issue_flag': 6,
    'macroeconomic_flag': 9,
    'industry_downturn_flag': 9,
    'geopolitical_flag': 9,
    'regulatory_pressure_flag': 9
}

# --- 2. PREPROCESSING ---
# Ensure data is sorted by company and date
base_df = base_df.sort_values(['company', 'date']).reset_index(drop=True)

# CRITICAL FIX: Fill NaNs with 0 to prevent NaN propagation
event_flag_cols = list(HALF_LIFE.keys())
base_df[event_flag_cols] = base_df[event_flag_cols].fillna(0)

# Identify where new companies start (to reset the decay)
# This creates a mask where True = start of a new company
base_df['company_id'] = base_df['company'].astype('category').cat.codes
company_change_mask = base_df['company_id'].diff() != 0
company_change_indices = np.where(company_change_mask)[0]

# --- 3. OPTIMIZED NUMPY FUNCTION ---
def calculate_decay_numpy(values, alpha, reset_indices):
    """
    Vectorized calculation of decay using a linear scan in NumPy.
    values: 1D numpy array of the flag column
    alpha: float, decay factor
    reset_indices: indices where a new company starts (reset signal to 0)
    """
    n = len(values)
    signal = np.zeros(n, dtype=np.float64)
    
    # We loop once through the array (O(N)), much faster than dataframe access
    # We use a simple variable to track the running 'prev' value
    current_val = 0.0
    
    # Iterate through the numpy array directly
    for i in range(n):
        # If we hit a new company index, reset the accumulator
        if i in reset_indices:  # Note: checking set membership is faster, see optimization below
            current_val = 0.0
            
        # Signal_t = Flag_t + alpha * Signal_t-1
        current_val = values[i] + (alpha * current_val)
        signal[i] = current_val
        
    return signal

# Optimized runner
signal_cols = []
reset_indices_set = set(company_change_indices) # Set is O(1) lookup

for flag in tqdm(event_flag_cols):
    if flag not in base_df.columns:
        continue
        
    signal_name = f'signal_{flag.replace("_flag", "")}'
    
    # Calculate Alpha
    alpha = 0.5 ** (1 / HALF_LIFE[flag])
    
    # Pass numpy array to function
    values = base_df[flag].values
    
    # Compute
    base_df[signal_name] = calculate_decay_numpy(values, alpha, reset_indices_set)
    signal_cols.append(signal_name)

# --- 4. SCALING ---
# Only scale if max > 0 (avoid divide by zero)
if signal_cols:
    scaler = MinMaxScaler()
    base_df[signal_cols] = scaler.fit_transform(base_df[signal_cols])

# Verify results
print("NaN Count:", base_df[signal_cols].isna().sum().sum())
print(base_df[signal_cols].describe())

100%|██████████| 31/31 [00:00<00:00, 50.37it/s]


NaN Count: 0
       signal_forward_looking  signal_backward_looking  signal_timing_unclear  \
count           199500.000000             1.995000e+05          199500.000000   
mean                 0.004033             1.978140e-02               0.000631   
std                  0.033797             8.343483e-02               0.019245   
min                  0.000000             0.000000e+00               0.000000   
25%                  0.000000             0.000000e+00               0.000000   
50%                  0.000000             5.921189e-17               0.000000   
75%                  0.000000             8.138021e-06               0.000000   
max                  1.000000             1.000000e+00               1.000000   

       signal_imminent_layoff  signal_long_horizon  signal_uncertainty  \
count           199500.000000        199500.000000       199500.000000   
mean                 0.000963             0.002311            0.000818   
std                  0.022338      

In [19]:
def calculate_material_risk_and_binary(df, 
                                       w1=0.4, w2=0.4, w3=0.2, 
                                       lambda_vc=0.3,
                                       materiality_threshold=0.1):
    """
    1. Calculates 'material_risk_score' (0.0 - 1.0) using your Economic Severity formula.
    2. Creates 'material_layoff_event' (0 or 1) by thresholding that score.
    """
    df = df.copy()
    
    # --- PRE-COMPUTE CONSTANTS ---
    # Global Max Funds (for normalization)
    F_max = df['funds_raised'].max()
    if pd.isna(F_max) or F_max == 0: F_max = 1 

    # Handle missing values for calculation safety
    # If final_count is NaN, it means 0 (no layoff)
    final_count = df['final_count'].fillna(0)
    final_pct = df['final_percentage'].fillna(0)
    workforce = df[['total_workforce_est', 'final_count']].max(axis=1).fillna(1)
    funds = df['funds_raised'].fillna(0)
    is_vc = df['is_vc_backed'].fillna(0).astype(int)

    # --- COMPONENT A: Absolute Scale Impact ---
    # A = log(1 + C) / log(1 + W)
    # Interpretation: Magnitude of cut relative to company size
    comp_A = np.log1p(final_count) / np.log1p(workforce)

    # --- COMPONENT B: Relative Workforce Impact ---
    # R = P / 100
    # Interpretation: % of company let go
    comp_R = final_pct / 100.0

    # --- COMPONENT C: Financial Fragility ---
    # F_risk = 1 - (log(1 + F) / log(1 + F_max))
    # Interpretation: Inverts funding (High funds -> Low risk)
    comp_F_risk = 1 - (np.log1p(funds) / np.log1p(F_max))

    # --- MODIFIER: Capital Buffer ---
    # B = 1 - lambda * V
    # Interpretation: VC backed companies get a risk reduction
    modifier_B = 1 - (lambda_vc * is_vc)

    # --- FINAL FORMULA ---
    # MaterialRisk = B * (w1*A + w2*R + w3*F_risk)
    raw_score = modifier_B * ((w1 * comp_A) + (w2 * comp_R) + (w3 * comp_F_risk))
    
    # MASKING: If no layoff happened (count=0), Severity is 0.
    # (The formula might yield non-zero F_risk even if count is 0, so we must mask)
    is_event = final_count > 0
    df['material_risk_score'] = np.where(is_event, raw_score, 0.0)
    
    # Clip to valid range [0, 1]
    df['material_risk_score'] = df['material_risk_score'].clip(0, 1)

    # --- BINARY RESULT ---
    # Convert the continuous score to the binary flag you requested.
    # A layoff is "Material" if the calculated score exceeds the threshold.
    # threshold=0.0 means ANY layoff is considered material (if that's what you want)
    # threshold=0.1 filters out very minor "noise" layoffs in well-funded companies.
    df['material_layoff_event'] = (df['material_risk_score'] > materiality_threshold).astype(int)

    return df

# --- APPLY ---
# You can adjust 'materiality_threshold' to control sensitivity.
# 0.05 is a good baseline (keeps most layoffs, drops tiny ones in giants).
base_df = calculate_material_risk_and_binary(base_df, materiality_threshold=0.05)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [20]:
# --- CHECK RESULTS ---
cols = ['company', 'date', 'final_count', 'final_percentage', 'funds_raised', 'material_risk_score', 'material_layoff_event']
# Show examples of Material Layoffs (1)
print("Material Events:")
base_df[base_df['material_layoff_event'] == 1][cols].head()

Material Events:


Unnamed: 0,company,date,final_count,final_percentage,funds_raised,material_risk_score,material_layoff_event
34,#Paid,2023-01-01,19.0,0.17,3.044523,0.343531,1
102,&Open,2022-11-01,9.0,0.09,3.555348,0.336757,1
168,100 Thieves,2022-07-01,13.5,0.0,4.787492,0.184511,1
174,100 Thieves,2023-01-01,15.0,0.075,4.787492,0.189918,1
239,10X Genomics,2022-08-01,100.0,0.08,5.488938,0.312062,1


In [21]:
# Show examples of layoffs that were NOT material (0) - if any exist
print("\nMinor (Non-Material) Layoffs:")
base_df[(base_df['final_count'] > 0) & (base_df['material_layoff_event'] == 0)][cols].head()


Minor (Non-Material) Layoffs:


Unnamed: 0,company,date,final_count,final_percentage,funds_raised,material_risk_score,material_layoff_event
321,123Milhas,2023-08-01,42.0,0.0,-11.512925,,0
387,1K Kirana,2023-04-01,400.0,0.4,-11.512925,,0
1166,888,2024-01-01,100.0,0.0,-11.512925,,0
1806,AMD,2024-11-01,1000.0,0.04,-11.512925,,0
2014,Aakash,2024-09-01,90.0,0.0,-11.512925,,0


In [22]:
base_df.material_layoff_event.value_counts()

material_layoff_event
0    196572
1      2928
Name: count, dtype: int64

In [23]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def generate_advanced_features(df):
    """
    Generates Company-Level History, Industry/Geo Context, and Momentum features.
    Assumes df has: 'company', 'date', 'industry', 'country', 
                    'material_layoff_event' (0/1), 'material_risk_score' (0.0-1.0)
    """
    df = df.sort_values(['company', 'date']).reset_index(drop=True)
    
    # ==========================================
    # 1. COMPANY-LEVEL ROLLING HISTORY
    # ==========================================
    print("Generating Company History...")
    
    # A. Layoff Counts (Frequency)
    # We use a loop for different windows to keep code clean
    # closed='left' isn't strictly needed if we are careful about target shifting later,
    # but strictly speaking, features at Time T include events at Time T.
    for window in [3, 6, 12]:
        col_name = f'layoff_count_{window}m'
        # Group by company, rolling sum of the binary event
        df[col_name] = df.groupby('company')['material_layoff_event'] \
                         .transform(lambda x: x.rolling(window, min_periods=1).sum()) \
                         .fillna(0)

    # B. Severity History (Intensity)
    # Track the MAXIMUM severity (risk score) seen in recent windows
    for window in [6, 12]:
        col_name = f'max_severity_{window}m'
        df[col_name] = df.groupby('company')['material_risk_score'] \
                         .transform(lambda x: x.rolling(window, min_periods=1).max()) \
                         .fillna(0)

    # C. Months Since Last Event (Recency) - (Optimized from previous step)
    # If we already have it, skip. If not, recalculate.
    if 'months_since_last_layoff' not in df.columns:
        # Vectorized approach using cumsum groups
        df['event_group'] = df.groupby('company')['material_layoff_event'].cumsum()
        # Calculate months by grouping on the event_group (this is an approx heuristic)
        # For exactness, we iterate. Let's stick to the reliable transform method:
        def get_months_since(x):
            last_idx = -999
            res = []
            for i, val in enumerate(x):
                if val == 1: last_idx = i
                res.append(i - last_idx)
            return res
        df['months_since_last_layoff'] = df.groupby('company')['material_layoff_event'].transform(get_months_since)
        df['months_since_last_layoff'] = df['months_since_last_layoff'].clip(upper=24) # Cap at 24

    # ==========================================
    # 2. CONTEXTUAL PRESSURE (Industry & Geo)
    # ==========================================
    print("Generating Contextual Pressure...")
    
    # Helper to calculate group-level stats safely
    def calculate_group_pressure(df, group_col, time_col='date'):
        # 1. Aggregate to Group-Date level
        # We count TOTAL material events and TOTAL active companies
        group_stats = df.groupby([group_col, time_col]).agg(
            total_events=('material_layoff_event', 'sum'),
            active_companies=('company', 'count')
        ).reset_index()
        
        # 2. Sort by date
        group_stats = group_stats.sort_values([group_col, time_col])
        
        # 3. Calculate Rolling Sums on the GROUP level
        # This tells us: "How many layoffs in this industry in last 3 months?"
        for w in [3, 6]:
            # Raw Count
            group_stats[f'{group_col}_layoff_count_{w}m'] = group_stats.groupby(group_col)['total_events'] \
                .transform(lambda x: x.rolling(w, min_periods=1).sum())
            
            # Rate (Normalized by size of industry) - BETTER FEATURE
            # We take the mean active companies over the window to normalize
            avg_companies = group_stats.groupby(group_col)['active_companies'] \
                .transform(lambda x: x.rolling(w, min_periods=1).mean())
            
            group_stats[f'{group_col}_pressure_index_{w}m'] = \
                group_stats[f'{group_col}_layoff_count_{w}m'] / avg_companies.replace(0, 1)

        return group_stats

    # A. Industry Pressure
    industry_stats = calculate_group_pressure(df, 'industry')
    # Merge back to main DF
    df = df.merge(industry_stats[['industry', 'date', 
                                  'industry_layoff_count_3m', 'industry_pressure_index_3m',
                                  'industry_layoff_count_6m', 'industry_pressure_index_6m']], 
                  on=['industry', 'date'], how='left')

    # B. Geo Pressure
    geo_stats = calculate_group_pressure(df, 'country')
    df = df.merge(geo_stats[['country', 'date', 
                             'country_layoff_count_3m', 'country_pressure_index_3m',
                             'country_layoff_count_6m', 'country_pressure_index_6m']], 
                  on=['country', 'date'], how='left')

    # Fill NaNs from merge (if a company has no industry/country)
    new_context_cols = [c for c in df.columns if 'industry_' in c or 'country_' in c]
    df[new_context_cols] = df[new_context_cols].fillna(0)

    # ==========================================
    # 3. MOMENTUM / DELTA (Advanced)
    # ==========================================
    print("Generating Momentum...")
    
    # Calculate how much the "Macro" signal has changed in 3 months
    # Positive = Risk is increasing
    df['macro_signal_delta_3m'] = df.groupby('company')['signal_macroeconomic'].diff(3).fillna(0)
    
    # Industry Pressure Delta (Is the industry crashing NOW vs 3 months ago?)
    # We take the 3m count and subtract the count from 3 months prior
    df['industry_pressure_delta'] = df.groupby('company')['industry_layoff_count_3m'].diff(3).fillna(0)

    return df

# --- EXECUTION ---
# Apply to your processed_df
# Ensure you have 'material_layoff_event' and 'material_risk_score' computed before running this!
advanced_df = generate_advanced_features(base_df)

# --- CHECK ---
print("Features Generated.")
print(advanced_df[['company', 'date', 'layoff_count_6m', 'industry_pressure_index_3m', 'macro_signal_delta_3m']].tail())

Generating Company History...
Generating Contextual Pressure...
Generating Momentum...
Features Generated.
       company       date  layoff_count_6m  industry_pressure_index_3m  \
199495     xAI 2025-08-01              0.0                    0.086957   
199496     xAI 2025-09-01              1.0                    0.173913   
199497     xAI 2025-10-01              1.0                    0.130435   
199498     xAI 2025-11-01              1.0                    0.086957   
199499     xAI 2025-12-01              1.0                    0.000000   

        macro_signal_delta_3m  
199495                    0.0  
199496                    0.0  
199497                    0.0  
199498                    0.0  
199499                    0.0  


In [24]:
advanced_df.columns

Index(['company', 'date', 'industry', 'stage', 'country', 'is_vc_backed',
       'is_profitable', 'is_ai_pivot', 'business_model', 'total_workforce_est',
       'funds_raised', 'forward_looking_flag', 'backward_looking_flag',
       'timing_unclear_flag', 'imminent_layoff_flag', 'long_horizon_flag',
       'uncertainty_flag', 'confirmed_flag', 'acquisition_flag',
       'private_equity_flag', 'post_acquisition_layoff_flag',
       'restructuring_flag', 'cost_cutting_flag', 'strategy_shift_flag',
       'automation_ai_flag', 'product_exit_flag', 'market_exit_flag',
       'offshoring_flag', 'management_change_flag', 'employee_unrest_flag',
       'attrition_flag', 'repeat_layoff_flag', 'department_specific_flag',
       'senior_role_impact_flag', 'junior_role_impact_flag',
       'revenue_decline_flag', 'profitability_pressure_flag',
       'runway_issue_flag', 'macroeconomic_flag', 'industry_downturn_flag',
       'geopolitical_flag', 'regulatory_pressure_flag', 'final_count',
       '

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# 1. DEFINE YOUR FEATURE LIST
feature_cols = [
    # --- A. DECAYED SIGNALS (Narrative) ---
    'signal_forward_looking', 'signal_backward_looking', 'signal_timing_unclear',
    'signal_imminent_layoff', 'signal_long_horizon', 'signal_uncertainty',
    'signal_confirmed', 'signal_acquisition', 'signal_private_equity',
    'signal_post_acquisition_layoff', 'signal_restructuring', 'signal_cost_cutting',
    'signal_strategy_shift', 'signal_automation_ai', 'signal_product_exit',
    'signal_market_exit', 'signal_offshoring', 'signal_management_change',
    'signal_employee_unrest', 'signal_attrition', 'signal_repeat_layoff',
    'signal_department_specific', 'signal_senior_role_impact',
    'signal_junior_role_impact', 'signal_revenue_decline',
    'signal_profitability_pressure', 'signal_runway_issue',
    'signal_macroeconomic', 'signal_industry_downturn', 'signal_geopolitical',
    'signal_regulatory_pressure',

    # --- B. COMPANY STATE (Financials) ---
    'funds_raised', 'total_workforce_est', 
    'is_vc_backed', 'is_profitable', 'is_ai_pivot',

    # --- C. HISTORY (The "Memory" of past pain) ---
    'layoff_count_6m',          # Frequency
    'months_since_last_layoff', # Recency
    'max_severity_12m',         # Intensity

    # --- D. CONTEXT (The "Contagion") ---
    'industry_pressure_index_3m', # Sector stress
    'country_pressure_index_3m',  # Regional stress
    
    # --- E. MOMENTUM (The "Speed" of change) ---
    'macro_signal_delta_3m'
]

def finalize_dataset(df, features):
    df = df.copy()
    
    # 1. Target Creation (Shift -1 to predict Next Month)
    # We must do this BEFORE any dropping/sorting
    df = df.sort_values(['company', 'date'])
    df['target'] = df.groupby('company')['material_layoff_event'].shift(-1)
    
    # Drop rows where target is NaN (the last month of data for every company)
    df = df.dropna(subset=['target'])
    df['target'] = df['target'].astype(int)

    # 2. Handling Booleans
    # Convert bool columns to 0/1 integers
    bool_cols = ['is_vc_backed', 'is_profitable', 'is_ai_pivot']
    for c in bool_cols:
        if c in df.columns:
            df[c] = df[c].fillna(0).astype(int)

    # 3. Filling NaNs
    # Signals/Counts -> 0
    # Context -> 0 (assume average/no pressure if unknown)
    # Financials -> Median or 0? Let's use 0 for robust LSTM handling (it learns 0 = missing)
    df[features] = df[features].fillna(0)

    # 4. Scaling (CRITICAL FOR LSTM)
    # We fit the scaler on the features
    scaler = MinMaxScaler(feature_range=(0, 1))
    df[features] = scaler.fit_transform(df[features])

    return df, scaler

# --- EXECUTE ---
print(f"Selecting {len(feature_cols)} features...")
final_df, feature_scaler = finalize_dataset(advanced_df, feature_cols)

# --- VALIDATION ---
print(f"\nFinal Dataset Shape: {final_df.shape}")
print(f"Target Distribution (Next Month Layoffs):\n{final_df['target'].value_counts()}")
print("\nFirst 3 rows of features:")
final_df[feature_cols].head(3)

Selecting 42 features...

Final Dataset Shape: (196650, 99)
Target Distribution (Next Month Layoffs):
target
0    193812
1      2838
Name: count, dtype: int64

First 3 rows of features:


Unnamed: 0,signal_forward_looking,signal_backward_looking,signal_timing_unclear,signal_imminent_layoff,signal_long_horizon,signal_uncertainty,signal_confirmed,signal_acquisition,signal_private_equity,signal_post_acquisition_layoff,...,total_workforce_est,is_vc_backed,is_profitable,is_ai_pivot,layoff_count_6m,months_since_last_layoff,max_severity_12m,industry_pressure_index_3m,country_pressure_index_3m,macro_signal_delta_3m
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.118095,0.025424,0.19522
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.438639,0.127119,0.19522
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.522993,0.152542,0.19522


In [26]:
df.date.min(), df.date.max()

(Timestamp('2020-03-01 00:00:00'), Timestamp('2025-12-01 00:00:00'))

In [27]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# --- 1. DEFINE THE SEQUENCE CREATOR FUNCTION ---
def create_sequences(df, seq_length, features, target):
    """
    Converts a flat dataframe into (Samples, TimeSteps, Features) tensors.
    """
    X = []
    y = []
    companies = [] 
    dates = []     
    
    # Group by company to ensure we don't mix data between companies
    # Using tqdm to show progress
    for company_name, group in tqdm(df.groupby('company'), desc="Building Sequences"):
        # Convert group to values
        data_values = group[features].values
        target_values = group[target].values
        date_values = group['date'].values
        
        # We need at least seq_length rows to make 1 sequence
        if len(data_values) < seq_length:
            continue
            
        # Sliding window
        for i in range(len(data_values) - seq_length):
            # Input: T to T+seq_len
            X.append(data_values[i : i + seq_length])
            
            # Target: The target associated with the LAST step of the sequence
            y.append(target_values[i + seq_length - 1])
            
            companies.append(company_name)
            dates.append(date_values[i + seq_length - 1])

    return np.array(X), np.array(y), np.array(companies), np.array(dates)

# --- 2. DEFINE THE SPLIT FUNCTION (NUMPY VERSION) ---
def get_train_test_data_numpy(df, split_date, seq_len, features, target_col):
    split_ts = pd.Timestamp(split_date)
    
    # Filter Dataframes
    train_df = df[df['date'] < split_ts]
    test_df_pool = df[df['date'] <= split_ts]
    
    # Generate Train Sequences
    print(f"Generating TRAIN sequences (Target < {split_date})...")
    X_train, y_train, _, _ = create_sequences(train_df, seq_len, features, target_col)
    
    # Generate Test Sequences (One per company ending on Split Date)
    print(f"Generating TEST sequences (Predicting Jan 2025)...")
    X_test = []
    y_test = []
    test_companies = []
    
    for company, group in tqdm(test_df_pool.groupby('company'), desc="Building Test Set"):
        group = group.sort_values('date')
        
        # We need the last row to be exactly the split_date
        if group.empty or group.iloc[-1]['date'] != split_ts:
            continue
            
        # Check if we have enough history
        if len(group) < seq_len:
            continue
            
        # Grab the last SEQ_LEN rows
        seq = group.iloc[-seq_len:]
        
        X_test.append(seq[features].values)
        y_test.append(seq.iloc[-1][target_col])
        test_companies.append(company)

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), test_companies

# --- 3. EXECUTE ---
# Configuration
SPLIT_DATE = '2024-12-01'
SEQUENCE_LENGTH = 12

print(f"Preparing Walk-Forward Step 1: Predict Jan 2025")
X_train, y_train, X_test, y_test, test_companies = get_train_test_data_numpy(
    final_df, 
    SPLIT_DATE, 
    SEQUENCE_LENGTH, 
    feature_cols, 
    'target'
)

print("\n--- DATA SHAPES ---")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_test:  {X_test.shape}")
print(f"y_test:  {y_test.shape}")

Preparing Walk-Forward Step 1: Predict Jan 2025
Generating TRAIN sequences (Target < 2024-12-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2759.85it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2254.25it/s]



--- DATA SHAPES ---
X_train: (128250, 12, 42)
y_train: (128250,)
X_test:  (2850, 12, 42)
y_test:  (2850,)


In [28]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight

def build_lstm_model(input_shape, learning_rate=0.001):
    model = Sequential()
    
    # 1. Input Layer matching your (TimeSteps, Features)
    model.add(Input(shape=input_shape))
    
    # 2. Masking (Optional but good practice)
    # Ignores timesteps where all features are 0 (padding)
    model.add(Masking(mask_value=0.0))
    
    # 3. LSTM Layer
    # units=64: Can be tuned (32, 64, 128)
    model.add(LSTM(units=64, return_sequences=False))
    
    # 4. Dropout for Regularization
    # Prevents overfitting on the training data
    model.add(Dropout(0.3))
    
    # 5. Output Layer
    # Sigmoid activation outputs a probability between 0.0 and 1.0
    model.add(Dense(1, activation='sigmoid'))
    
    # 6. Compile
    # AUC is the best metric for your imbalanced data
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.Recall(name='recall')]
    )
    
    return model

# --- CALCULATE CLASS WEIGHTS ---
# Layoffs are rare (1.5%). If we don't weight them, the model learns to say "0" always.
# We punish the model ~60x more for missing a layoff than for false alarm.
weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))
print(f"Class Weights: {class_weights}")
# Expected output: {0: 0.51, 1: ~33.0} (Exact numbers will vary based on your data)



Class Weights: {0: 0.5087993525453853, 1: 28.911181244364293}


In [30]:
# 1. Setup Model
# X_train shape is (Samples, 12, N_Features)
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_lstm_model(input_shape)

# 2. Setup Callbacks
# Stop training if validation loss doesn't improve for 5 epochs
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=5, 
    restore_best_weights=True,
    verbose=1
)

# 3. Train
print("Starting Training for Jan 2025 Prediction...")
history = model.fit(
    X_train, y_train,
    epochs=50,                  # Max epochs (will stop early likely)
    batch_size=1024,            # Large batch size helps with noise
    validation_split=0.2,       # Use 20% of history to validate model quality
    class_weight=class_weights, # CRITICAL: Apply the imbalance fix
    callbacks=[early_stopping],
    verbose=1
)



Starting Training for Jan 2025 Prediction...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 11: early stopping


In [31]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# 1. Predict on the Test Set (Jan 2025)
# These are probabilities (0.0 to 1.0)
y_pred_probs = model.predict(X_test, verbose=0)

# 2. Evaluation
# ROC-AUC (Global Ranking Quality)
roc_score = roc_auc_score(y_test, y_pred_probs)

# PR-AUC (Precision-Recall Area Under Curve)
# This is the "Truth Teller" for rare events.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_probs)
pr_auc = auc(recall, precision)

print("\n--- RESULTS FOR JAN 2025 PREDICTION ---")
print(f"ROC-AUC Score: {roc_score:.4f} (Baseline: 0.5)")
print(f"PR-AUC Score:  {pr_auc:.4f} (Baseline: {y_test.mean():.4f})")

# 3. Inspect the Riskiest Companies
results_df = pd.DataFrame({
    'company': test_companies,
    'actual_layoff': y_test,
    'predicted_risk': y_pred_probs.flatten()
})

# Show top 5 highest risk companies
print("\nTop 5 Highest Predicted Risks:")
print(results_df.sort_values('predicted_risk', ascending=False).head(5))

# Show actual hits (Where we predicted high risk AND they fired)
print("\nSuccessful Hits (Risk > 0.5 and Actual = 1):")
print(results_df[(results_df['predicted_risk'] > 0.5) & (results_df['actual_layoff'] == 1)].head())


--- RESULTS FOR JAN 2025 PREDICTION ---
ROC-AUC Score: 0.7944 (Baseline: 0.5)
PR-AUC Score:  0.0314 (Baseline: 0.0067)

Top 5 Highest Predicted Risks:
        company  actual_layoff  predicted_risk
100      Amazon              0        0.903710
1044     Google              0        0.855753
2062     Rivian              0        0.842747
1556  Microsoft              1        0.827940
138       Apple              0        0.814330

Successful Hits (Risk > 0.5 and Actual = 1):
        company  actual_layoff  predicted_risk
1556  Microsoft              1        0.827940
2166  ShareChat              1        0.585790
2252  SolarEdge              1        0.757909
2329     Stripe              1        0.503449
2424     Textio              1        0.582951


In [32]:
from sklearn.metrics import (
    roc_auc_score, 
    precision_recall_curve, 
    auc, 
    classification_report, 
    confusion_matrix, 
    f1_score
)
import pandas as pd

# 1. Predict on the Test Set (Jan 2025)
# These are probabilities (0.0 to 1.0)
y_pred_probs = model.predict(X_test, verbose=0)

# ==========================================
# PART A: PROBABILITY METRICS (Ranking Quality)
# ==========================================
# ROC-AUC (Global Ranking Quality)
roc_score = roc_auc_score(y_test, y_pred_probs)

# PR-AUC (Precision-Recall Area Under Curve)
# This is the "Truth Teller" for rare events.
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_probs)
pr_auc = auc(recall_curve, precision_curve)

print("\n--- PROBABILITY METRICS (JAN 2025) ---")
print(f"ROC-AUC Score: {roc_score:.4f} (Baseline: 0.5)")
print(f"PR-AUC Score:  {pr_auc:.4f} (Baseline: {y_test.mean():.4f})")

# ==========================================
# PART B: CLASSIFICATION METRICS (Hard Decisions)
# ==========================================
# We use a threshold to convert probability -> 0 or 1.
# Standard is 0.5, but for rare events like layoffs, you might experiment with lower (e.g., 0.3)
THRESHOLD = 0.3
y_pred_classes = (y_pred_probs > THRESHOLD).astype(int)

print(f"\n--- CLASSIFICATION REPORT (Threshold = {THRESHOLD}) ---")
print(classification_report(y_test, y_pred_classes, target_names=['No Layoff', 'Layoff']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_classes)
print("Confusion Matrix:")
print(cm)
# [TN, FP]
# [FN, TP]

# Specific F1 Score for the Positive Class
f1 = f1_score(y_test, y_pred_classes)
print(f"\nF1-Score (Positive Class): {f1:.4f}")

# Interpretation
tp = cm[1, 1] # True Positives (Caught)
fn = cm[1, 0] # False Negatives (Missed)
fp = cm[0, 1] # False Positives (False Alarm)

print(f"\nSummary:")
print(f"- Correctly caught {tp} layoffs.")
print(f"- Missed {fn} layoffs.")
print(f"- Raised {fp} false alarms.")

# ==========================================
# PART C: INSPECTION (Qualitative Check)
# ==========================================
# Inspect the Riskiest Companies
results_df = pd.DataFrame({
    'company': test_companies,
    'actual_layoff': y_test,
    'predicted_risk': y_pred_probs.flatten()
})

# Show top 5 highest risk companies
print("\nTop 5 Highest Predicted Risks:")
print(results_df.sort_values('predicted_risk', ascending=False).head(5))

# Show actual hits (Where we predicted high risk AND they fired)
print(f"\nSuccessful Hits (Risk > {THRESHOLD} and Actual = 1):")
print(results_df[(results_df['predicted_risk'] > THRESHOLD) & (results_df['actual_layoff'] == 1)].head())


--- PROBABILITY METRICS (JAN 2025) ---
ROC-AUC Score: 0.7944 (Baseline: 0.5)
PR-AUC Score:  0.0314 (Baseline: 0.0067)

--- CLASSIFICATION REPORT (Threshold = 0.3) ---
              precision    recall  f1-score   support

   No Layoff       1.00      0.69      0.82      2831
      Layoff       0.02      0.79      0.03        19

    accuracy                           0.69      2850
   macro avg       0.51      0.74      0.43      2850
weighted avg       0.99      0.69      0.81      2850

Confusion Matrix:
[[1965  866]
 [   4   15]]

F1-Score (Positive Class): 0.0333

Summary:
- Correctly caught 15 layoffs.
- Missed 4 layoffs.
- Raised 866 false alarms.

Top 5 Highest Predicted Risks:
        company  actual_layoff  predicted_risk
100      Amazon              0        0.903710
1044     Google              0        0.855753
2062     Rivian              0        0.842747
1556  Microsoft              1        0.827940
138       Apple              0        0.814330

Successful Hits (Risk 

In [33]:
# --- CONFIGURATION ---
PREDICTION_MONTHS = [
    '2025-01-01', '2025-02-01', '2025-03-01', '2025-04-01',
    '2025-05-01', '2025-06-01', '2025-07-01', '2025-08-01',
    '2025-09-01', '2025-10-01', '2025-11-01', '2025-12-01'
]
SEQUENCE_LENGTH = 12
results_list = []

print(f"Starting Walk-Forward Prediction for {len(PREDICTION_MONTHS)} months...")

for i, pred_date_str in enumerate(PREDICTION_MONTHS):
    # 1. Define Dates
    # We train on everything UP TO the month before prediction
    pred_ts = pd.Timestamp(pred_date_str)
    split_ts = pred_ts - pd.DateOffset(months=1)
    split_date_str = split_ts.strftime('%Y-%m-%d')
    
    print(f"\n{'='*60}")
    print(f"ROUND {i+1}/{len(PREDICTION_MONTHS)}: Training up to {split_date_str} -> Predicting {pred_date_str}")
    print(f"{'='*60}")
    
    # 2. Get Data (Expanding Window)
    # This function (from previous steps) splits data based on the date provided
    X_train, y_train, X_test, y_test, test_companies = get_train_test_data_numpy(
        final_df, split_date_str, SEQUENCE_LENGTH, feature_cols, 'target'
    )
    
    # Safety Check
    if len(X_test) == 0:
        print(f"No test data for {pred_date_str}. Skipping.")
        continue

    # 3. Compute Class Weights DYNAMICALLY
    # As the training set grows, the balance might change slightly
    weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    class_weights = dict(enumerate(weights))
    print(f"   > Training Samples: {len(y_train)} | Class Weights: {class_weights}")
    
    # 4. Build & Train Model
    # We rebuild fresh to avoid leakage from previous loop iterations
    model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
    
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(
        X_train, y_train,
        epochs=20,               # Enough for convergence with early stopping
        batch_size=1024,
        validation_split=0.1,    # Use last 10% of training data for validation
        class_weight=class_weights, # <--- APPLYING YOUR WEIGHTS HERE
        callbacks=[early_stop],
        verbose=0                # Silent training
    )
    
    # 5. Predict
    probs = model.predict(X_test, verbose=0).flatten()
    
    # 6. Generate Monthly Report
    # We find the 'best' threshold for this month just to illustrate performance potential
    # (In production, you'd pick one fixed threshold, but for reporting we show the best F1)
    
    # Check if we have any layoffs to evaluate against
    if sum(y_test) > 0:
        precisions, recalls, thresholds = precision_recall_curve(y_test, probs)
        f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
        best_idx = np.argmax(f1_scores)
        current_threshold = thresholds[best_idx]
        
        # Apply Threshold
        preds = (probs >= current_threshold).astype(int)
        
        # Print Report
        print(f"\n--- REPORT FOR {pred_date_str} (Best Thresh: {current_threshold:.3f}) ---")
        print(classification_report(y_test, preds, target_names=['Safe', 'Layoff']))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, preds))
        
        # [TN, FP]
        # [FN, TP]
    else:
        print(f"\n--- REPORT FOR {pred_date_str} ---")
        print("No actual layoffs occurred in this test month.")
        print(f"Max predicted risk was: {probs.max():.4f}")

    # 7. Store Results
    month_df = pd.DataFrame({
        'prediction_date': pred_date_str,
        'company': test_companies,
        'predicted_risk': probs,
        'actual_layoff': y_test
    })
    results_list.append(month_df)

# --- END LOOP ---

Starting Walk-Forward Prediction for 12 months...

ROUND 1/12: Training up to 2024-12-01 -> Predicting 2025-01-01
Generating TRAIN sequences (Target < 2024-12-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2791.09it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2189.40it/s]


   > Training Samples: 128250 | Class Weights: {0: 0.5087993525453853, 1: 28.911181244364293}

--- REPORT FOR 2025-01-01 (Best Thresh: 0.722) ---
              precision    recall  f1-score   support

        Safe       0.99      0.99      0.99      2831
      Layoff       0.10      0.11      0.10        19

    accuracy                           0.99      2850
   macro avg       0.54      0.55      0.55      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2812   19]
 [  17    2]]

ROUND 2/12: Training up to 2025-01-01 -> Predicting 2025-02-01
Generating TRAIN sequences (Target < 2025-01-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2643.77it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2246.67it/s]


   > Training Samples: 131100 | Class Weights: {0: 0.508667918613133, 1: 29.34198746642793}

--- REPORT FOR 2025-02-01 (Best Thresh: 0.664) ---
              precision    recall  f1-score   support

        Safe       0.99      1.00      0.99      2809
      Layoff       0.50      0.07      0.13        41

    accuracy                           0.99      2850
   macro avg       0.74      0.54      0.56      2850
weighted avg       0.98      0.99      0.98      2850

Confusion Matrix:
[[2806    3]
 [  38    3]]

ROUND 3/12: Training up to 2025-02-01 -> Predicting 2025-03-01
Generating TRAIN sequences (Target < 2025-02-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2557.11it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2207.61it/s]


   > Training Samples: 133950 | Class Weights: {0: 0.508553725597394, 1: 29.727030625832224}

--- REPORT FOR 2025-03-01 (Best Thresh: 0.595) ---
              precision    recall  f1-score   support

        Safe       1.00      0.97      0.98      2836
      Layoff       0.06      0.36      0.11        14

    accuracy                           0.97      2850
   macro avg       0.53      0.67      0.55      2850
weighted avg       0.99      0.97      0.98      2850

Confusion Matrix:
[[2760   76]
 [   9    5]]

ROUND 4/12: Training up to 2025-03-01 -> Predicting 2025-04-01
Generating TRAIN sequences (Target < 2025-03-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2187.46it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2085.84it/s]


   > Training Samples: 136800 | Class Weights: {0: 0.5085275006319421, 1: 29.81691368788143}

--- REPORT FOR 2025-04-01 (Best Thresh: 0.735) ---
              precision    recall  f1-score   support

        Safe       1.00      0.99      1.00      2835
      Layoff       0.17      0.20      0.18        15

    accuracy                           0.99      2850
   macro avg       0.58      0.60      0.59      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2820   15]
 [  12    3]]

ROUND 5/12: Training up to 2025-04-01 -> Predicting 2025-05-01
Generating TRAIN sequences (Target < 2025-04-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2524.60it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2137.46it/s]


   > Training Samples: 139650 | Class Weights: {0: 0.5084023823739279, 1: 30.253466204506065}

--- REPORT FOR 2025-05-01 (Best Thresh: 0.735) ---
              precision    recall  f1-score   support

        Safe       0.99      0.99      0.99      2830
      Layoff       0.20      0.20      0.20        20

    accuracy                           0.99      2850
   macro avg       0.60      0.60      0.60      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2814   16]
 [  16    4]]

ROUND 6/12: Training up to 2025-05-01 -> Predicting 2025-06-01
Generating TRAIN sequences (Target < 2025-05-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2427.12it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2098.19it/s]


   > Training Samples: 142500 | Class Weights: {0: 0.5082859527597252, 1: 30.671545415411106}

--- REPORT FOR 2025-06-01 (Best Thresh: 0.801) ---
              precision    recall  f1-score   support

        Safe       1.00      1.00      1.00      2838
      Layoff       0.36      0.33      0.35        12

    accuracy                           0.99      2850
   macro avg       0.68      0.67      0.67      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2831    7]
 [   8    4]]

ROUND 7/12: Training up to 2025-06-01 -> Predicting 2025-07-01
Generating TRAIN sequences (Target < 2025-06-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2242.26it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 1990.18it/s]


   > Training Samples: 145350 | Class Weights: {0: 0.5081919066898823, 1: 31.017925736235597}

--- REPORT FOR 2025-07-01 (Best Thresh: 0.805) ---
              precision    recall  f1-score   support

        Safe       0.99      1.00      1.00      2833
      Layoff       0.40      0.12      0.18        17

    accuracy                           0.99      2850
   macro avg       0.70      0.56      0.59      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2830    3]
 [  15    2]]

ROUND 8/12: Training up to 2025-07-01 -> Predicting 2025-08-01
Generating TRAIN sequences (Target < 2025-07-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2421.02it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2114.94it/s]


   > Training Samples: 148200 | Class Weights: {0: 0.5080736398230998, 1: 31.46496815286624}

--- REPORT FOR 2025-08-01 (Best Thresh: 0.726) ---
              precision    recall  f1-score   support

        Safe       1.00      0.99      1.00      2837
      Layoff       0.06      0.08      0.07        13

    accuracy                           0.99      2850
   macro avg       0.53      0.54      0.53      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2822   15]
 [  12    1]]

ROUND 9/12: Training up to 2025-08-01 -> Predicting 2025-09-01
Generating TRAIN sequences (Target < 2025-08-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2341.83it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2091.79it/s]


   > Training Samples: 151050 | Class Weights: {0: 0.5079769703654878, 1: 31.840219224283306}

--- REPORT FOR 2025-09-01 (Best Thresh: 0.693) ---
              precision    recall  f1-score   support

        Safe       1.00      0.99      1.00      2833
      Layoff       0.24      0.29      0.26        17

    accuracy                           0.99      2850
   macro avg       0.62      0.64      0.63      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2817   16]
 [  12    5]]

ROUND 10/12: Training up to 2025-09-01 -> Predicting 2025-10-01
Generating TRAIN sequences (Target < 2025-09-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2386.10it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 2401.51it/s]


   > Training Samples: 153900 | Class Weights: {0: 0.5078705078705079, 1: 32.264150943396224}

--- REPORT FOR 2025-10-01 (Best Thresh: 0.793) ---
              precision    recall  f1-score   support

        Safe       0.99      1.00      1.00      2831
      Layoff       0.60      0.16      0.25        19

    accuracy                           0.99      2850
   macro avg       0.80      0.58      0.62      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2829    2]
 [  16    3]]

ROUND 11/12: Training up to 2025-10-01 -> Predicting 2025-11-01
Generating TRAIN sequences (Target < 2025-10-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2771.46it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:01<00:00, 1898.09it/s]


   > Training Samples: 156750 | Class Weights: {0: 0.5077811179931064, 1: 32.629059117402164}

--- REPORT FOR 2025-11-01 (Best Thresh: 0.786) ---
              precision    recall  f1-score   support

        Safe       0.99      1.00      1.00      2834
      Layoff       0.08      0.06      0.07        16

    accuracy                           0.99      2850
   macro avg       0.54      0.53      0.53      2850
weighted avg       0.99      0.99      0.99      2850

Confusion Matrix:
[[2823   11]
 [  15    1]]

ROUND 12/12: Training up to 2025-11-01 -> Predicting 2025-12-01
Generating TRAIN sequences (Target < 2025-11-01)...


Building Sequences: 100%|██████████| 2850/2850 [00:01<00:00, 2268.97it/s]


Generating TEST sequences (Predicting Jan 2025)...


Building Test Set: 100%|██████████| 2850/2850 [00:00<00:00, 3831.13it/s]


   > Training Samples: 159600 | Class Weights: {0: 0.5077014104937683, 1: 32.96158612143742}

--- REPORT FOR 2025-12-01 (Best Thresh: 0.906) ---
              precision    recall  f1-score   support

        Safe       1.00      1.00      1.00      2836
      Layoff       1.00      0.07      0.13        14

    accuracy                           1.00      2850
   macro avg       1.00      0.54      0.57      2850
weighted avg       1.00      1.00      0.99      2850

Confusion Matrix:
[[2836    0]
 [  13    1]]


In [34]:
# Concatenate all months
all_predictions = pd.concat(results_list, ignore_index=True)

print(f"\n{'='*60}")
print("GLOBAL SUMMARY (JAN 2025 - DEC 2025)")
print(f"{'='*60}")

# Find Optimal Global Threshold
y_true_all = all_predictions['actual_layoff']
y_prob_all = all_predictions['predicted_risk']

precision, recall, thresholds = precision_recall_curve(y_true_all, y_prob_all)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
best_global_idx = np.argmax(f1_scores)
best_global_threshold = thresholds[best_global_idx]

print(f"Total Companies Evaluated: {len(all_predictions)}")
print(f"Total Actual Layoffs: {y_true_all.sum()}")
print(f"Optimal Global Threshold: {best_global_threshold:.4f}")
print(f"Max F1 Score: {f1_scores[best_global_idx]:.4f}")

# Apply Threshold
all_predictions['predicted_label'] = (all_predictions['predicted_risk'] >= best_global_threshold).astype(int)

# Final Confusion Matrix
cm = confusion_matrix(y_true_all, all_predictions['predicted_label'])
print("\nGlobal Confusion Matrix:")
print(cm)

print("\nGlobal Classification Report:")
print(classification_report(y_true_all, all_predictions['predicted_label'], target_names=['Safe', 'Layoff']))


GLOBAL SUMMARY (JAN 2025 - DEC 2025)
Total Companies Evaluated: 34200
Total Actual Layoffs: 217
Optimal Global Threshold: 0.7649
Max F1 Score: 0.1212

Global Confusion Matrix:
[[33890    93]
 [  197    20]]

Global Classification Report:
              precision    recall  f1-score   support

        Safe       0.99      1.00      1.00     33983
      Layoff       0.18      0.09      0.12       217

    accuracy                           0.99     34200
   macro avg       0.59      0.54      0.56     34200
weighted avg       0.99      0.99      0.99     34200

