In [None]:
# Setup: mount Google Drive at /mnt/drive using rclone
# rclone needs the mount point to exist (as an empty directory) before mounting.
!mkdir -p /mnt/drive

# Skip the mount when /mnt/drive is already a mount point, so re-running this cell is harmless.
!mountpoint -q /mnt/drive || rclone mount drive: /mnt/drive --vfs-cache-mode writes --daemon

# Verify mount: `mountpoint` reports whether /mnt/drive is really a mount,
# whereas `df` alone also succeeds on a plain local directory.
!mountpoint /mnt/drive
!df -h /mnt/drive

# Create outputs directory if it doesn't exist, but only on a live mount;
# otherwise a local /mnt/drive/outputs would silently shadow Drive.
!mountpoint -q /mnt/drive && mkdir -p /mnt/drive/outputs

In [None]:
# Load Phase 1.5 results and select the two best configs for Phase 2.
# Assuming Phase 1.5 results are stored in /mnt/drive/outputs/phase1.5_results.json or similar
# Adjust path as needed based on actual Phase 1.5 output
import json
import pandas as pd

# Load results (example path; update if different)
phase1_5_results_path = '/mnt/drive/outputs/phase1.5_results.json'
with open(phase1_5_results_path, 'r') as f:
    results = json.load(f)

# Convert to DataFrame for analysis
# NOTE(review): assumes the JSON holds one record per config — confirm against the Phase 1.5 writer.
df_results = pd.DataFrame(results)

# Validate the schema up front so a wrong file fails with a readable message
# instead of an opaque KeyError/IndexError further down.
required_columns = ['config_id', 'output_dir', 'f1_score']
missing_columns = [col for col in required_columns if col not in df_results.columns]
if missing_columns:
    raise KeyError(
        f'{phase1_5_results_path} is missing expected columns: {missing_columns}'
    )

# Two scored configs are needed for the top-1 / top-2 split below.
n_scored = int(df_results['f1_score'].notna().sum())
if n_scored < 2:
    raise ValueError(
        f'Need at least 2 configs with an f1_score to pick the top 2, found {n_scored}'
    )

# Select top 2 configs based on key metric (F1 score), best first
top_configs = df_results.nlargest(2, 'f1_score')[required_columns]
top1_config = top_configs.iloc[0]
top2_config = top_configs.iloc[1]

print('Top 1 Config:', top1_config)
print('Top 2 Config:', top2_config)

# For demonstration, assume top1 uses --output_dir '/mnt/drive/outputs/phase2_top1'
# top2 uses --output_dir '/mnt/drive/outputs/phase2_top2'
# In practice, derive full args from loaded config

In [None]:
# Sequential Training: Top 1 Config - 3 epochs, 30k samples
# Do not run this cell; structure only
# Modify args based on top1_config; example using train_deberta_local.py
# Add other args from top1_config as needed: insert them above the last
# argument line and end every line except the final one with a backslash.
# The final argument has no trailing backslash, so nothing after it is
# folded into the shell command.
!python notebooks/scripts/train_deberta_local.py \
    --data_path data/goemotions/train.jsonl \
    --val_path data/goemotions/val.jsonl \
    --output_dir '/mnt/drive/outputs/phase2_top1' \
    --num_train_epochs 3 \
    --max_train_samples 30000 \
    --model_name_or_path microsoft/deberta-v3-base \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --learning_rate 2e-5

In [None]:
# Sequential Training: Top 2 Config - 3 epochs, 30k samples
# Do not run this cell; structure only
# Modify args based on top2_config; example using train_deberta_local.py
# Add other args from top2_config as needed: insert them above the last
# argument line and end every line except the final one with a backslash.
# The final argument has no trailing backslash, so nothing after it is
# folded into the shell command.
!python notebooks/scripts/train_deberta_local.py \
    --data_path data/goemotions/train.jsonl \
    --val_path data/goemotions/val.jsonl \
    --output_dir '/mnt/drive/outputs/phase2_top2' \
    --num_train_epochs 3 \
    --max_train_samples 30000 \
    --model_name_or_path microsoft/deberta-v3-base \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --learning_rate 2e-5

In [None]:
# Analysis: Compare Phase 2 results
# Load results from phase2_top1 and phase2_top2
import json
import pandas as pd
import matplotlib.pyplot as plt


def _load_trainer_state(path):
    """Read a trainer_state.json file and return its parsed contents.

    Uses a context manager so the file handle is closed even if parsing fails.
    """
    with open(path, 'r') as f:
        return json.load(f)


def _eval_rows(log_df):
    """Return only the log rows that carry an ``eval_loss`` value.

    NOTE(review): a Hugging Face Trainer ``log_history`` normally interleaves
    training-step rows and evaluation rows (plus a final training summary), so
    eval columns are NaN on most rows — confirm against the actual files.
    Raises KeyError when the log has no ``eval_loss`` column at all.
    """
    return log_df.dropna(subset=['eval_loss']).reset_index(drop=True)


# Load Phase 2 results (assuming trainer_state.json or similar)
top1_results = _load_trainer_state('/mnt/drive/outputs/phase2_top1/trainer_state.json')
top2_results = _load_trainer_state('/mnt/drive/outputs/phase2_top2/trainer_state.json')

# Full log history (all rows), kept under the original names
df_top1 = pd.DataFrame(top1_results.get('log_history', []))
df_top2 = pd.DataFrame(top2_results.get('log_history', []))

# Evaluation-only views: plotting or indexing the unfiltered frames hits NaN rows
df_top1_eval = _eval_rows(df_top1)
df_top2_eval = _eval_rows(df_top2)

# Plot comparison (eval_loss over epochs); markers keep a single eval point visible
plt.figure(figsize=(10, 5))
plt.plot(df_top1_eval['epoch'], df_top1_eval['eval_loss'], marker='o', label='Top1')
plt.plot(df_top2_eval['epoch'], df_top2_eval['eval_loss'], marker='o', label='Top2')
plt.xlabel('Epoch')
plt.ylabel('Eval Loss')
plt.legend()
plt.title('Phase 2 Training Comparison')
plt.show()

# Summary statistics: last recorded eval F1 (not the last log row, which may lack it)
print('Top1 Final F1:', df_top1_eval['eval_f1'].dropna().iloc[-1])
print('Top2 Final F1:', df_top2_eval['eval_f1'].dropna().iloc[-1])

In [None]:
# Post-training: Copy local backups to Drive if needed
# Reconnect if needed
# NOTE(review): `rclone config reconnect` may prompt for input — confirm it works non-interactively here.
!rclone config reconnect drive:

# Track which copies failed; IPython stores the exit status of the last `!` command in `_exit_code`.
backup_failures = []

# Copy local outputs to Drive backup path
!rclone --drive-pacer-min-sleep=1s copy outputs/ "drive:00_Projects/TechLabs-2025/Final_Project/TRAINING/GoEmotions-DeBERTa-Backup/outputs"
if _exit_code != 0:
    backup_failures.append('outputs/')

# Specifically copy phase2 outputs
!rclone --drive-pacer-min-sleep=1s copy /mnt/drive/outputs/phase2_top1 "drive:00_Projects/TechLabs-2025/Final_Project/TRAINING/GoEmotions-DeBERTa-Backup/phase2_top1"
if _exit_code != 0:
    backup_failures.append('/mnt/drive/outputs/phase2_top1')

!rclone --drive-pacer-min-sleep=1s copy /mnt/drive/outputs/phase2_top2 "drive:00_Projects/TechLabs-2025/Final_Project/TRAINING/GoEmotions-DeBERTa-Backup/phase2_top2"
if _exit_code != 0:
    backup_failures.append('/mnt/drive/outputs/phase2_top2')

# Only report success when every copy exited cleanly
if backup_failures:
    print('Backup copy FAILED for:', ', '.join(backup_failures))
else:
    print('Backup copy completed')