In [1]:
# declare a list tasks whose products you want to use as inputs
upstream = None


In [2]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json


# Add project root to path
sys.path.append(str(Path.cwd().parent))

from src.data_tools.preprocessor import load_and_clean_data
from src.data_tools.dataset import create_data_splits
from src.data_tools.preprocessor import save_data_to_json

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Configuration
# Define paths
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

# Create directories if they don't exist
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Model configuration
config = {
    'train_size': 0.7,
    'val_size': 0.15,
    'test_size': 0.15,
    'random_state': 42
}


Configuration saved!


In [4]:
# Load Czech comment data
df_raw = load_and_clean_data(DATA_DIR, max_tweets_per_source=10000)

df_raw = df_raw.rename(columns={'account': 'author'})
df_raw = df_raw.rename(columns={'tweet': 'text'})

print("Raw dataset info:")
print(f"Total samples: {len(df_raw)}")
print(f"Columns: {df_raw.columns.tolist()}")
print("\nMissing values:")
print(df_raw.isnull().sum())

# Show class distribution
print("\nClass distribution:")
print(df_raw['troll'].value_counts(normalize=True))

INFO:src.data_tools.preprocessor:Loading Russian troll tweets...
INFO:src.data_tools.preprocessor:Loading Sentiment140 tweets...
INFO:src.data_tools.preprocessor:Loading celebrity tweets...
INFO:src.data_tools.preprocessor:Loading manualy scraped tweets...
INFO:src.data_tools.preprocessor:Loading Twitter JSON data from non_troll_politics folder...
INFO:src.data_tools.preprocessor:Loading information operations tweets...
INFO:src.data_tools.preprocessor:Found 151 parquet files in information_operations folder and its subdirectories
INFO:src.data_tools.preprocessor:Information operations data distribution - Trolls: 585379, Non-trolls: 914607
INFO:src.data_tools.preprocessor:Loading data collected by Machova...
INFO:src.data_tools.preprocessor:Loading Civil Comments dataset...
INFO:src.data_tools.preprocessor:Combining datasets...
INFO:src.data_tools.preprocessor:Filtering accounts with few tweets...


Raw dataset info:
Total samples: 2999248
Columns: ['author', 'text', 'troll', 'language']

Missing values:
author          0
text            0
troll           0
language    15764
dtype: int64

Class distribution:
troll
0    0.785914
1    0.214086
Name: proportion, dtype: float64


In [5]:
# Count tweets by troll label
tweet_counts = df_raw['troll'].value_counts()
total_tweets = len(df_raw)

print("Tweet distribution:")
print(f"Troll tweets: {tweet_counts[1]:,} ({tweet_counts[1]/total_tweets:.1%})")
print(f"Non-troll tweets: {tweet_counts[0]:,} ({tweet_counts[0]/total_tweets:.1%})")
print(f"Total tweets: {total_tweets:,}")

Tweet distribution:
Troll tweets: 642,096 (21.4%)
Non-troll tweets: 2,357,152 (78.6%)
Total tweets: 2,999,248


In [6]:
# # First balance authors
# author_labels = df_raw.groupby('author')['troll'].first()
# troll_authors = author_labels[author_labels == 1].index
# non_troll_authors = author_labels[author_labels == 0].index

# # Determine target size for authors
# target_author_size = min(len(troll_authors), len(non_troll_authors))

# # Sample authors
# np.random.seed(config['random_state'])
# if len(troll_authors) > target_author_size:
#     troll_authors = np.random.choice(troll_authors, size=target_author_size, replace=False)
# if len(non_troll_authors) > target_author_size:
#     non_troll_authors = np.random.choice(non_troll_authors, size=target_author_size, replace=False)

# # Now balance tweets per author
# max_tweets_per_author = 100  # Or whatever maximum you want to allow

# df_balanced = []
# for author in troll_authors:
#     author_tweets = df_raw[df_raw['author'] == author]
#     if len(author_tweets) > max_tweets_per_author:
#         author_tweets = author_tweets.sample(n=max_tweets_per_author, random_state=config['random_state'])
#     df_balanced.append(author_tweets)

# for author in non_troll_authors:
#     author_tweets = df_raw[df_raw['author'] == author]
#     if len(author_tweets) > max_tweets_per_author:
#         author_tweets = author_tweets.sample(n=max_tweets_per_author, random_state=config['random_state'])
#     df_balanced.append(author_tweets)

# df_balanced = pd.concat(df_balanced, ignore_index=True)

In [7]:
# # Count tweets by troll label
# tweet_counts = df_balanced['troll'].value_counts()
# total_tweets = len(df_balanced)

# print("Tweet distribution:")
# print(f"Troll tweets: {tweet_counts[1]:,} ({tweet_counts[1]/total_tweets:.1%})")
# print(f"Non-troll tweets: {tweet_counts[0]:,} ({tweet_counts[0]/total_tweets:.1%})")
# print(f"Total tweets: {total_tweets:,}")

Tweet distribution:
Troll tweets: 220,731 (89.8%)
Non-troll tweets: 25,169 (10.2%)
Total tweets: 245,900


In [16]:
# Create Train/Val/Test Splits
# Create splits ensuring no author overlap
train_df, val_df, test_df = create_data_splits(
    df_raw,
    train_size=config['train_size'],
    val_size=config['val_size'],
    test_size=config['test_size'],
    random_state=config['random_state']
)

print("Dataset splits:")
print(f"Train: {len(train_df)} samples, {train_df['author'].nunique()} authors")
print(f"Val:   {len(val_df)} samples, {val_df['author'].nunique()} authors")
print(f"Test:  {len(test_df)} samples, {test_df['author'].nunique()} authors")

Dataset splits:
Train: 2002207 samples, 164903 authors
Val:   457922 samples, 35336 authors
Test:  539119 samples, 35337 authors


In [8]:
# Save Processed Data
# Save splits to parquet
for split_name, split_df in [
    ('train', train_df),
    ('val', val_df),
    ('test', test_df)
]:
    output_path = PROCESSED_DATA_DIR / f'{split_name}.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved {split_name} split to {output_path}")



Saved train split to data/processed/train.parquet
Saved val split to data/processed/val.parquet
Saved test split to data/processed/test.parquet


In [18]:
# Resample datasets by selecting 10% of authors and keeping all their posts
train_authors = train_df['author'].unique()
val_authors = val_df['author'].unique() 
test_authors = test_df['author'].unique()

train_sampled_authors = np.random.choice(train_authors, size=int(len(train_authors)*0.1), replace=False)
val_sampled_authors = np.random.choice(val_authors, size=int(len(val_authors)*0.1), replace=False)
test_sampled_authors = np.random.choice(test_authors, size=int(len(test_authors)*0.1), replace=False)

train_df = train_df[train_df['author'].isin(train_sampled_authors)]
val_df = val_df[val_df['author'].isin(val_sampled_authors)]
test_df = test_df[test_df['author'].isin(test_sampled_authors)]

print("\nAfter resampling to 10% of authors:")
print(f"Train: {len(train_df)} samples, {train_df['author'].nunique()} authors")
print(f"Val:   {len(val_df)} samples, {val_df['author'].nunique()} authors") 
print(f"Test:  {len(test_df)} samples, {test_df['author'].nunique()} authors")

# Save resampled splits
for split_name, split_df in [
    ('train', train_df),
    ('val', val_df), 
    ('test', test_df)
]:
    output_path = PROCESSED_DATA_DIR / f'{split_name}_small.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved resampled {split_name} split to {output_path}")



After resampling to 10% of authors:
Train: 17838 samples, 1649 authors
Val:   5044 samples, 353 authors
Test:  3798 samples, 353 authors
Saved resampled train split to data/processed/train_small.parquet
Saved resampled val split to data/processed/val_small.parquet
Saved resampled test split to data/processed/test_small.parquet
