In [1]:
# declare a list of tasks whose products you want to use as inputs
# NOTE(review): looks like a pipeline "parameters" cell where None means this
# notebook consumes no upstream task products — confirm against the pipeline spec
upstream = None


In [1]:
# Imports and Setup
# Standard library
import json
import logging
import sys
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd

# Add project root to path so the `src` package can be imported.
# The root is taken to be the parent of the current working directory, so the
# kernel must be started from a direct subdirectory of the project.
# The membership check keeps sys.path free of duplicate entries when this cell
# is executed more than once in the same kernel.
_project_root = str(Path.cwd().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Project-local helpers (resolvable only after the sys.path adjustment above)
from src.data_tools.preprocessor import load_and_clean_data, save_data_to_json
from src.data_tools.dataset import create_data_splits

# Set up logging: INFO-level records are emitted into the cell output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Configuration
# Input location and the folder that receives the processed artifacts
DATA_DIR = Path('../data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')

# Make sure the output folder is there (no error if it already exists)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Split fractions and the random seed read by the later cells
config = dict(
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42,
)

# Persist the configuration as indented JSON next to the processed data
with open(PROCESSED_DATA_DIR / 'preprocessing_config.json', 'w') as f:
    f.write(json.dumps(config, indent=2))

print("Configuration saved!")

Configuration saved!


In [None]:
# Load the raw dataset assembled by the project loader.
# NOTE(review): this cell was previously labelled "Czech comment data", but the
# loader's log output lists Russian troll tweets, Sentiment140, Civil Comments
# and other sources — confirm the intended description.
# The loader's 'account' and 'tweet' columns are exposed as 'author' and 'text'.
df_raw = load_and_clean_data(DATA_DIR).rename(
    columns={'account': 'author', 'tweet': 'text'}
)

# Basic overview: size, schema and per-column null counts
missing_per_column = df_raw.isna().sum()
print("Raw dataset info:")
print(f"Total samples: {len(df_raw)}")
print(f"Columns: {df_raw.columns.tolist()}")
print("\nMissing values:")
print(missing_per_column)

# Relative frequency of each value in the 'troll' label column
label_shares = df_raw['troll'].value_counts(normalize=True)
print("\nClass distribution:")
print(label_shares)

INFO:src.data_tools.preprocessor:Loading Russian troll tweets...


INFO:src.data_tools.preprocessor:Loading Sentiment140 tweets...
INFO:src.data_tools.preprocessor:Loading celebrity tweets...
INFO:src.data_tools.preprocessor:Loading manualy scraped tweets...
INFO:src.data_tools.preprocessor:Loading Twitter JSON data from non_troll_politics folder...
INFO:src.data_tools.preprocessor:Loading information operations tweets...
INFO:src.data_tools.preprocessor:Loading data collected by Machova...
INFO:src.data_tools.preprocessor:Loading Civil Comments dataset...


README.md:   0%|          | 0.00/7.73k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1804874 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/97320 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/97320 [00:00<?, ? examples/s]

In [13]:
# Count tweets by troll label
tweet_counts = df_raw['troll'].value_counts()
total_tweets = len(df_raw)

# A label that never occurs is missing from value_counts(); .get() reports it
# as 0 instead of raising KeyError.
troll_count = int(tweet_counts.get(1, 0))
non_troll_count = int(tweet_counts.get(0, 0))

# Percentages are relative to all rows; the max() guard avoids a
# ZeroDivisionError when df_raw is empty (both shares then print as 0.0%).
share_denominator = max(total_tweets, 1)

print("Tweet distribution:")
print(f"Troll tweets: {troll_count:,} ({troll_count/share_denominator:.1%})")
print(f"Non-troll tweets: {non_troll_count:,} ({non_troll_count/share_denominator:.1%})")
print(f"Total tweets: {total_tweets:,}")

Tweet distribution:
Troll tweets: 3,731,059 (97.4%)
Non-troll tweets: 99,754 (2.6%)
Total tweets: 3,830,813


In [14]:
# Balance the dataset in two steps:
#   1. keep the same number of troll and non-troll authors,
#   2. cap how many tweets each kept author contributes.

# One label per author: the 'troll' value of that author's first row
author_labels = df_raw.groupby('author')['troll'].first()
troll_authors = author_labels[author_labels == 1].index
non_troll_authors = author_labels[author_labels == 0].index

# Determine target size for authors: both classes shrink to the smaller one
target_author_size = min(len(troll_authors), len(non_troll_authors))

# Sample authors (seeded so the selection is reproducible)
np.random.seed(config['random_state'])
if len(troll_authors) > target_author_size:
    troll_authors = np.random.choice(troll_authors, size=target_author_size, replace=False)
if len(non_troll_authors) > target_author_size:
    non_troll_authors = np.random.choice(non_troll_authors, size=target_author_size, replace=False)

# Now balance tweets per author
max_tweets_per_author = 100  # upper bound on the rows kept for one author

# Row positions of every author, computed in a single pass over df_raw.
# Looking each author up here replaces a full boolean scan of df_raw per
# author, which scaled with (number of authors x number of rows).
rows_by_author = df_raw.groupby('author').indices

balanced_parts = []
# Troll authors first, then non-troll authors, each in their selected order
for author_group in (troll_authors, non_troll_authors):
    for author in author_group:
        # Sorted positions reproduce the row order a boolean mask would give,
        # so the seeded sample below picks the same rows as before.
        author_tweets = df_raw.iloc[np.sort(rows_by_author[author])]
        if len(author_tweets) > max_tweets_per_author:
            author_tweets = author_tweets.sample(n=max_tweets_per_author, random_state=config['random_state'])
        balanced_parts.append(author_tweets)

# pd.concat raises ValueError when no author was selected (empty list)
df_balanced = pd.concat(balanced_parts, ignore_index=True)

KeyboardInterrupt: 

In [5]:
# Create Train/Val/Test Splits
# The split fractions and seed come from `config`; the project helper is
# expected to keep every author inside a single split.
# NOTE(review): the splits are built from df_raw, not from the df_balanced
# frame produced by the balancing cell — confirm which input is intended.
split_kwargs = {
    key: config[key]
    for key in ('train_size', 'val_size', 'test_size', 'random_state')
}
train_df, val_df, test_df = create_data_splits(df_raw, **split_kwargs)

# Report row and distinct-author counts per split (labels padded to align)
print("Dataset splits:")
for label, part in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(f"{label} {len(part)} samples, {part['author'].nunique()} authors")

Dataset splits:
Train: 2977763 samples, 18167 authors
Val:   574837 samples, 3893 authors
Test:  857416 samples, 3893 authors


In [6]:
# Save Processed Data
# Write each split to its own parquet file; the DataFrame index is not stored
for split_name, split_df in [
    ('train', train_df),
    ('val', val_df),
    ('test', test_df)
]:
    output_path = PROCESSED_DATA_DIR / f'{split_name}.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved {split_name} split to {output_path}")

# Save preprocessing config next to the splits.
# `json` comes from the imports cell at the top of the notebook, so it is not
# re-imported here. This rewrites the file the configuration cell already
# created, so the stored values match the `config` in use when the splits
# were saved.
config_path = PROCESSED_DATA_DIR / 'preprocessing_config.json'
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"\nSaved preprocessing config to {config_path}")

Saved train split to ../data/processed/train.parquet
Saved val split to ../data/processed/val.parquet
Saved test split to ../data/processed/test.parquet

Saved preprocessing config to ../data/processed/preprocessing_config.json
