In [20]:
# Declare the list of tasks whose products this notebook uses as inputs.
# None means no upstream tasks are declared.
upstream = None


In [1]:
# Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import logging
import json
from tqdm import tqdm
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root to path
# The parent of the current working directory is appended to sys.path; this
# has to happen before the `src.*` imports below (presumably so that the
# `src` package resolves when the notebook runs from a subfolder — confirm).
sys.path.append(str(Path.cwd().parent))

from src.data_tools.preprocessor import load_and_clean_data
from src.data_tools.dataset import create_data_splits

# Set up logging
# INFO level on the root logger; `logger` is the logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Configuration
# Directory layout: inputs are read from DATA_DIR, derived artifacts are
# written below PROCESSED_DATA_DIR.
# NOTE(review): the recorded output of the save cell shows
# '../data/processed/...'; confirm that 'data' is the intended root for the
# directory this notebook is executed from.
DATA_DIR = Path('data')
PROCESSED_DATA_DIR = DATA_DIR.joinpath('processed')

# The output directory must exist before anything is written into it
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Preprocessing and split settings shared by the cells below
config = dict(
    min_text_length=5,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42,
)

# Persist the settings as JSON alongside the processed data
(PROCESSED_DATA_DIR / 'preprocessing_config.json').write_text(
    json.dumps(config, indent=2)
)

print("Configuration saved!")

Configuration saved!


In [3]:
# Load the labelled text data and normalise the column names in one step
# (the loader's log output lists several troll and non-troll sources being
# combined).
df_raw = load_and_clean_data(str(DATA_DIR)).rename(
    columns={'account': 'author', 'tweet': 'text'}
)

# Basic overview: size, schema, and missing values per column
total_samples = len(df_raw)
column_names = df_raw.columns.tolist()
missing_per_column = df_raw.isnull().sum()

print("Raw dataset info:")
print(f"Total samples: {total_samples}")
print(f"Columns: {column_names}")
print("\nMissing values:")
print(missing_per_column)

# Relative frequency of each value of the 'troll' label
class_share = df_raw['troll'].value_counts(normalize=True)
print("\nClass distribution:")
print(class_share)

INFO:src.data_tools.preprocessor:Loading Russian troll tweets...
INFO:src.data_tools.preprocessor:Loading Sentiment140 tweets...
INFO:src.data_tools.preprocessor:Loading celebrity tweets...
INFO:src.data_tools.preprocessor:Loading manualy scraped tweets...
INFO:src.data_tools.preprocessor:Loading Twitter JSON data from non_troll_politics folder...
INFO:src.data_tools.preprocessor:Loading information operations tweets...
INFO:src.data_tools.preprocessor:Loading data collected by Machova...
INFO:src.data_tools.preprocessor:Non-troll data columns: ['is_troll', 'body']
INFO:src.data_tools.preprocessor:Troll data columns: ['is_troll', 'body']
INFO:src.data_tools.preprocessor:Combining datasets...
INFO:src.data_tools.preprocessor:Filtering accounts with few tweets...


Raw dataset info:
Total samples: 3510123
Columns: ['author', 'text', 'troll']

Missing values:
author    0
text      0
troll     0
dtype: int64

Class distribution:
troll
1    0.83017
0    0.16983
Name: proportion, dtype: float64


In [4]:
# Show which columns the raw frame exposes
available_columns = df_raw.columns.tolist()
print("Available columns:", available_columns)


Available columns: ['author', 'text', 'troll']


In [7]:
# Create Train/Val/Test Splits
# Fractions and seed are taken from the shared config; the splitter is
# expected to keep each author inside a single split (no author overlap).
split_kwargs = {
    key: config[key]
    for key in ('train_size', 'val_size', 'test_size', 'random_state')
}
train_df, val_df, test_df = create_data_splits(df_raw, **split_kwargs)

# Report row and distinct-author counts for every split
print("Dataset splits:")
for prefix, frame in (
    ("Train: ", train_df),
    ("Val:   ", val_df),
    ("Test:  ", test_df),
):
    print(f"{prefix}{len(frame)} samples, {frame['author'].nunique()} authors")

Dataset splits:
Train: 2407195 samples, 17033 authors
Val:   541073 samples, 3650 authors
Test:  561855 samples, 3651 authors


In [13]:
# Save Processed Data
# Each split is written as <split_name>.parquet under PROCESSED_DATA_DIR;
# index=False keeps the DataFrame index out of the files.
for split_name, split_df in [
    ('train', train_df),
    ('val', val_df),
    ('test', test_df)
]:
    output_path = PROCESSED_DATA_DIR / f'{split_name}.parquet'
    split_df.to_parquet(output_path, index=False)
    print(f"Saved {split_name} split to {output_path}")

# Save preprocessing config next to the splits.
# `json` comes from the imports cell at the top of the notebook, so it is not
# re-imported here; the encoding is explicit so the file does not depend on
# the platform's default locale.
config_path = PROCESSED_DATA_DIR / 'preprocessing_config.json'
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)
print(f"\nSaved preprocessing config to {config_path}")

Saved train split to ../data/processed/train.parquet
Saved val split to ../data/processed/val.parquet
Saved test split to ../data/processed/test.parquet

Saved preprocessing config to ../data/processed/preprocessing_config.json


In [None]:


# Pin langdetect's seed before any detect() call so repeated runs give the
# same language for the same text (per the langdetect README, detection is
# otherwise non-deterministic — confirm for the installed version).
DetectorFactory.seed = 42  # Ensure consistent detection

def detect_language_safe(text):
    """Detect the language of ``text`` without ever raising.

    Args:
        text: The text to classify. Expected to be a string; any other value
            (e.g. ``None`` or a NaN from a missing cell) is treated as
            undetectable.

    Returns:
        The language code returned by ``langdetect.detect`` (e.g. ``'sk'``),
        or the string ``"unknown"`` when the input is not a string, is blank,
        or langdetect raises ``LangDetectException``.
    """
    # Non-string and blank inputs carry nothing to detect; returning early
    # also keeps non-strings away from detect(), which only guards against
    # LangDetectException below.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

# Minimum number of comments an author needs before a language is detected
# for them. Used for the filter, the messages, and the plot title so they
# cannot drift apart.
MIN_COMMENTS_PER_AUTHOR = 5

# Count comments per author and filter
author_comment_counts = df_raw['author'].value_counts()
authors_with_min_comments = author_comment_counts[
    author_comment_counts >= MIN_COMMENTS_PER_AUTHOR
].index

# Aggregate comments and detect language only for filtered authors:
# all of an author's texts are joined into one string so detection runs once
# per author instead of once per comment.
print(f"Detecting language for authors with at least {MIN_COMMENTS_PER_AUTHOR} comments...")
author_lang_df = (
    df_raw[df_raw['author'].isin(authors_with_min_comments)]
    .groupby('author')['text']
    .agg(' '.join)
    .reset_index()
)
author_lang_df['lang'] = author_lang_df['text'].apply(detect_language_safe)

# Merge back to original dataframe; with how='left' every original row is
# kept and authors below the threshold get a missing `lang`.
df = df_raw.merge(author_lang_df[['author', 'lang']], on='author', how='left')

# Count and plot languages among filtered authors
author_lang_counts = author_lang_df['lang'].value_counts()
print(f"\nLanguage Distribution (Authors with ≥ {MIN_COMMENTS_PER_AUTHOR} comments):")
print(author_lang_counts)

# Plot (skipped when no author passed the filter, as there is nothing to draw)
if author_lang_counts.empty:
    print("No authors meet the comment threshold; skipping plot.")
else:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=author_lang_counts.values, y=author_lang_counts.index, ax=ax)
    ax.set_xlabel('Number of Authors')
    ax.set_ylabel('Detected Language')
    ax.set_title(f'Detected Language of Authors (≥ {MIN_COMMENTS_PER_AUTHOR} Comments)')
    fig.tight_layout()
    plt.show()

# Optional: Print Slovak author count
num_slovak_authors = (author_lang_df['lang'] == 'sk').sum()
total_authors = len(author_lang_df)
# Guard the ratio: with zero qualifying authors the division would yield nan
# and a RuntimeWarning instead of a meaningful percentage.
slovak_share = num_slovak_authors / total_authors if total_authors else 0.0
print(f"\nSlovak authors: {num_slovak_authors} / {total_authors} ({slovak_share:.2%})")

In [None]:
# Filter Slovak authors (based on aggregate comment language).
# `Path` and `json` come from the imports cell at the top of the notebook,
# so they are not re-imported here.
slovak_authors = author_lang_df.loc[author_lang_df['lang'] == 'sk', 'author'].tolist()

# Define output path; parents=True matches how the processed-data directory
# is created and keeps this working if the path is ever nested deeper.
output_path = Path("./output/slovak_authors.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save as JSON; ensure_ascii=False writes non-ASCII author names as-is,
# which is why the file is opened explicitly as UTF-8.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(slovak_authors, f, ensure_ascii=False, indent=2)

print(f"Saved {len(slovak_authors)} Slovak authors to {output_path}")