<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [7]</a>'.</span>

In [2]:
# declare a list tasks whose products you want to use as inputs
upstream = None


In [3]:
# Parameters
product = {"nb": "/home/luuka/thesis/workspace/output/01_preprocess.ipynb", "train": "/home/luuka/thesis/workspace/data/processed/train.parquet", "val": "/home/luuka/thesis/workspace/data/processed/val.parquet", "test": "/home/luuka/thesis/workspace/data/processed/test.parquet", "config": "/home/luuka/thesis/workspace/data/processed/preprocessing_config.json"}


In [4]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
from tqdm import tqdm

# Add project root to path
sys.path.append(str(Path.cwd().parent))

from src.data_tools.preprocessor import load_and_clean_data
from src.data_tools.dataset import create_data_splits

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
# Configuration
# Define paths
DATA_DIR = Path('../data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

# Create directories if they don't exist
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Model configuration
config = {
    'min_text_length': 5,
    'train_size': 0.7,
    'val_size': 0.15,
    'test_size': 0.15,
    'random_state': 42
}

# Save config
with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("Configuration saved!")

Configuration saved!


In [6]:
# Cell 3: Load and examine raw data
# Load Czech comment data
df_raw = load_and_clean_data(str(DATA_DIR))

print("Raw dataset info:")
print(f"Total samples: {len(df_raw)}")
print(f"Columns: {df_raw.columns.tolist()}")
print("\nMissing values:")
print(df_raw.isnull().sum())

# Show class distribution
print("\nClass distribution:")
print(df_raw['troll'].value_counts(normalize=True))

INFO:src.data_tools.preprocessor:Loading Russian troll tweets...
INFO:src.data_tools.preprocessor:Loading Sentiment140 tweets...
INFO:src.data_tools.preprocessor:Loading celebrity tweets...
INFO:src.data_tools.preprocessor:Loading Twitter JSON data from non_troll_politics folder...
INFO:src.data_tools.preprocessor:Combining datasets...
INFO:src.data_tools.preprocessor:Filtering accounts with few tweets...


Raw dataset info:
Total samples: 2701940
Columns: ['account', 'tweet', 'troll']

Missing values:
account    0
tweet      0
troll      0
dtype: int64

Class distribution:
troll
1    0.77938
0    0.22062
Name: proportion, dtype: float64


In [6]:
print("Available columns:", df_raw.columns.tolist())

Available columns: ['account', 'tweet', 'troll']


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [7]:
# Create Train/Val/Test Splits
# Create splits ensuring no author overlap
train_df, val_df, test_df = create_data_splits(
    df_raw,
    train_size=config['train_size'],
    val_size=config['val_size'],
    test_size=config['test_size'],
    random_state=config['random_state']
)

print("Dataset splits:")
print(f"Train: {len(train_df)} samples, {train_df['account'].nunique()} authors")
print(f"Val:   {len(val_df)} samples, {val_df['account'].nunique()} authors")
print(f"Test:  {len(test_df)} samples, {test_df['account'].nunique()} authors")

KeyError: 'author'

In [None]:
# Save Processed Data
# Save splits to parquet
for split_name, split_df in [
    ('train', train_df),
    ('val', val_df),
    ('test', test_df)
]:
    output_path = PROCESSED_DATA_DIR / f'{split_name}.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved {split_name} split to {output_path}")

# Save preprocessing config
import json
config_path = PROCESSED_DATA_DIR / 'preprocessing_config.json'
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"\nSaved preprocessing config to {config_path}")