In [1]:
# Declare a list of tasks whose products you want to use as inputs.
# None is assigned here, i.e. no upstream tasks are declared for this notebook.
upstream = None


In [1]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging


# Add project root to path so the project-local `src` imports resolve.
# Assumes the notebook is run from a direct subdirectory of the project root.
# The membership check keeps re-runs of this cell from appending duplicates.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_tools.preprocessor import load_and_clean_data
from src.data_tools.dataset import create_data_splits

# Set up logging: a logger named after this module, with the root
# configuration set to emit INFO-level records and above
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
# Configuration
# Where the input data lives and where processed outputs are written
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')

# Make sure the output directory exists before anything is written to it
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Split proportions and the random seed used when creating the splits
config = dict(
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42,
)


In [3]:
# Load and clean the tweet data, passing caps on tweets per source and per author
df_raw = load_and_clean_data(
    DATA_DIR,
    max_tweets_per_source=10000,
    max_tweets_per_author=50,
)

# Normalise column names: 'account' -> 'author', 'tweet' -> 'text'
df_raw = df_raw.rename(columns={'account': 'author', 'tweet': 'text'})

# Size and schema overview
print(
    "Raw dataset info:",
    f"Total samples: {len(df_raw)}",
    f"Columns: {df_raw.columns.tolist()}",
    sep="\n",
)

# Null count per column
print("\nMissing values:", df_raw.isna().sum(), sep="\n")

# Show class distribution (relative frequency of each 'troll' label)
print("\nClass distribution:", df_raw['troll'].value_counts(normalize=True), sep="\n")

INFO:src.data_tools.preprocessor:Loading Russian troll tweets...
INFO:src.data_tools.preprocessor:Loading Sentiment140 tweets...
INFO:src.data_tools.preprocessor:Loading celebrity tweets...
INFO:src.data_tools.preprocessor:Loading manualy scraped tweets...
INFO:src.data_tools.preprocessor:Loading Twitter JSON data from non_troll_politics folder...
INFO:src.data_tools.preprocessor:Loading information operations tweets...
INFO:src.data_tools.preprocessor:Found 151 parquet files in information_operations folder and its subdirectories
INFO:src.data_tools.preprocessor:Information operations data distribution - Trolls: 585379, Non-trolls: 914607
INFO:src.data_tools.preprocessor:Loading data collected by Machova...
INFO:src.data_tools.preprocessor:Combining datasets...
INFO:src.data_tools.preprocessor:Filtering accounts with few tweets...


Raw dataset info:
Total samples: 831959
Columns: ['author', 'text', 'troll', 'language']

Missing values:
author         0
text           0
troll          0
language    1748
dtype: int64

Class distribution:
troll
0    0.826238
1    0.173762
Name: proportion, dtype: float64


In [7]:
# Tweet totals: overall and per 'troll' label
total_tweets = len(df_raw)
tweet_counts = df_raw['troll'].value_counts()

# One row per (author, label) pair; used to count distinct authors per label
author_stats = df_raw.groupby(['author', 'troll']).size().reset_index()
troll_authors = author_stats.loc[author_stats['troll'] == 1, 'author'].nunique()
nontroll_authors = author_stats.loc[author_stats['troll'] == 0, 'author'].nunique()

# Tweets per author within each label, and the mean of those counts
troll_tweets_per_author = df_raw.loc[df_raw['troll'] == 1].groupby('author').size()
nontroll_tweets_per_author = df_raw.loc[df_raw['troll'] == 0].groupby('author').size()
avg_troll_tweets = troll_tweets_per_author.mean()
avg_nontroll_tweets = nontroll_tweets_per_author.mean()

# Assemble the report line by line, then emit it in one go
summary_lines = [
    "Tweet distribution:",
    f"Troll tweets: {tweet_counts[1]:,} ({tweet_counts[1]/total_tweets:.1%})",
    f"Non-troll tweets: {tweet_counts[0]:,} ({tweet_counts[0]/total_tweets:.1%})",
    f"Total tweets: {total_tweets:,}",
    f"\nAuthor statistics:",
    f"Troll authors: {troll_authors:,}",
    f"Non-troll authors: {nontroll_authors:,}",
    f"Average tweets per troll author: {avg_troll_tweets:.1f}",
    f"Average tweets per non-troll author: {avg_nontroll_tweets:.1f}",
]
print("\n".join(summary_lines))

Tweet distribution:
Troll tweets: 144,563 (17.4%)
Non-troll tweets: 687,396 (82.6%)
Total tweets: 831,959

Author statistics:
Troll authors: 4,555
Non-troll authors: 41,182
Average tweets per troll author: 31.7
Average tweets per non-troll author: 16.7


In [8]:
# Resample non-troll authors to only keep 20%
NON_TROLL_AUTHOR_FRACTION = 0.2

# Seeded generator so the same authors are drawn on every Restart & Run All.
# Reuses the seed from the configuration cell so that one value controls
# both this subsampling and the later train/val/test split.
rng = np.random.default_rng(config['random_state'])

non_troll_authors = df_raw[df_raw['troll'] == 0]['author'].unique()
sampled_non_troll_authors = rng.choice(
    non_troll_authors,
    size=int(len(non_troll_authors) * NON_TROLL_AUTHOR_FRACTION),
    replace=False
)

# Keep all troll authors and only sampled non-troll authors
troll_tweets = df_raw[df_raw['troll'] == 1]
sampled_non_troll_tweets = df_raw[
    (df_raw['troll'] == 0) &
    (df_raw['author'].isin(sampled_non_troll_authors))
]
# NOTE: df_raw is overwritten here, so re-running only this cell subsamples the
# already-subsampled frame again. Re-run from the data-loading cell instead.
df_raw = pd.concat([troll_tweets, sampled_non_troll_tweets])

print("\nAfter resampling non-troll authors to 20%:")
print(f"Total samples: {len(df_raw)}")
print(f"Total authors: {df_raw['author'].nunique()}")
print("\nClass distribution:")
print(df_raw['troll'].value_counts(normalize=True))



After resampling non-troll authors to 20%:
Total samples: 282903
Total authors: 12791

Class distribution:
troll
1    0.510998
0    0.489002
Name: proportion, dtype: float64


In [12]:
# Tweet totals: overall and per 'troll' label
total_tweets = len(df_raw)
tweet_counts = df_raw['troll'].value_counts()

# One row per (author, label) pair; used to count distinct authors per label
author_stats = df_raw.groupby(['author', 'troll']).size().reset_index()
troll_authors = author_stats.loc[author_stats['troll'] == 1, 'author'].nunique()
nontroll_authors = author_stats.loc[author_stats['troll'] == 0, 'author'].nunique()

# Tweets per author within each label, and the mean of those counts
troll_tweets_per_author = df_raw.loc[df_raw['troll'] == 1].groupby('author').size()
nontroll_tweets_per_author = df_raw.loc[df_raw['troll'] == 0].groupby('author').size()
avg_troll_tweets = troll_tweets_per_author.mean()
avg_nontroll_tweets = nontroll_tweets_per_author.mean()

# Report counts with their share of the total, one line per statistic
print(
    "Tweet distribution:",
    f"Troll tweets: {tweet_counts[1]:,} ({tweet_counts[1]/total_tweets:.1%})",
    f"Non-troll tweets: {tweet_counts[0]:,} ({tweet_counts[0]/total_tweets:.1%})",
    f"Total tweets: {total_tweets:,}",
    f"\nAuthor statistics:",
    f"Troll authors: {troll_authors:,} ({troll_authors/(troll_authors+nontroll_authors):.1%})",
    f"Non-troll authors: {nontroll_authors:,} ({nontroll_authors/(troll_authors+nontroll_authors):.1%})",
    f"Average tweets per troll author: {avg_troll_tweets:.1f}",
    f"Average tweets per non-troll author: {avg_nontroll_tweets:.1f}",
    sep="\n",
)

Tweet distribution:
Troll tweets: 144,563 (51.1%)
Non-troll tweets: 138,340 (48.9%)
Total tweets: 282,903

Author statistics:
Troll authors: 4,555 (35.6%)
Non-troll authors: 8,236 (64.4%)
Average tweets per troll author: 31.7
Average tweets per non-troll author: 16.8


In [10]:
# Create Train/Val/Test Splits
# The splitting helper is intended to keep each author in a single split;
# proportions and seed are taken from the configuration dictionary.
split_kwargs = {
    key: config[key]
    for key in ('train_size', 'val_size', 'test_size', 'random_state')
}
train_df, val_df, test_df = create_data_splits(df_raw, **split_kwargs)

# Report the row count and distinct-author count of every split
print("Dataset splits:")
for label, frame in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(f"{label} {len(frame)} samples, {frame['author'].nunique()} authors")

Dataset splits:
Train: 198385 samples, 8953 authors
Val:   42099 samples, 1919 authors
Test:  42419 samples, 1919 authors


In [11]:
# Save Processed Data
# Write each split to its own parquet file inside the processed-data directory
split_names = ('train_max_50', 'val_max_50', 'test_max_50')
split_frames = (train_df, val_df, test_df)
for split_name, split_df in zip(split_names, split_frames):
    output_path = PROCESSED_DATA_DIR / f'{split_name}.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved {split_name} split to {output_path}")



Saved train_max_50 split to data/processed/train_max_50.parquet
Saved val_max_50 split to data/processed/val_max_50.parquet
Saved test_max_50 split to data/processed/test_max_50.parquet
