In [None]:
import glob
import re
from pathlib import Path

import pandas as pd
from nltk.tokenize import word_tokenize

In [None]:
# --- Stopwords & Preprocessing ---
# Function words dropped by preprocess_text before stemming. Kept as a list
# (not a set) so code that concatenates or indexes it keeps working; every
# entry appears exactly once.
georgian_stopwords = [
    "და", "ან", "არის", "რომ", "ეს", "ამ", "იმ", "თუ", "არ", "ვერ",
    "მე", "შენ", "ის", "ჩვენ", "თქვენ", "მათ", "ისინი", "რა", "როგორც",
    "მისი", "ჩემი", "შენი", "ჩვენი", "თქვენი", "მათი",
    "იყო", "იქნება", "თვის", "არიან", "მქონე", "აქვს", "იგი",
    "ასევე", "უნდა", "კიდევ", "ყველა", "ერთი", "ორი", "სხვა",
    "რომელიც", "რომელსაც", "როგორ", "სადაც", "როდესაც", "რაც", "მიერ"
]

def georgian_stemmer(word):
    """Strip one common Georgian suffix from ``word`` to normalize it.

    Candidate suffixes are tried longest-first so that a longer suffix is not
    shadowed by a shorter one it ends with (for example 'ობით' has to be
    tested before 'ით'). A suffix is removed only when a non-empty stem
    remains, so a token that consists solely of a suffix is returned
    unchanged instead of being reduced to an empty string.

    Args:
        word: A single token (str).

    Returns:
        The token with at most one suffix removed, or the token itself when
        no suffix applies.
    """
    suffixes = ['ება', 'ების', 'მა', 'ში', 'ზე', 'თან', 'ით', 'დან', 'ზეა', 'ობით', 'შია']
    # sorted() is stable, so suffixes of equal length keep their listed order.
    for suffix in sorted(suffixes, key=len, reverse=True):
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

def preprocess_text(text):
    """Normalize and clean Georgian text for downstream analysis.

    Steps: lowercase, replace HTML tags and punctuation with spaces, tokenize
    with NLTK's ``word_tokenize``, drop stopwords and tokens of length <= 2,
    then stem each remaining token with ``georgian_stemmer``.

    Args:
        text: Raw text. Any non-string value yields an empty string.

    Returns:
        The surviving stems joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Substitute a space rather than the empty string: deleting a tag or a
    # punctuation mark outright would glue the neighbouring words together
    # ("a</p><p>b" -> "ab", "a,b" -> "ab").
    text = re.sub(r'</?[^>]+>', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    processed_words = []
    for word in word_tokenize(text):
        if word in georgian_stopwords or len(word) <= 2:
            continue
        stemmed = georgian_stemmer(word)
        # Skip tokens the stemmer reduced to nothing; appending "" would put
        # stray extra spaces into the joined result.
        if stemmed:
            processed_words.append(stemmed)
    return ' '.join(processed_words)

def extract_sentences_around_keyword(text, keyword, window_size=3):
    """Return the sentences surrounding each occurrence of ``keyword``.

    The text is split into sentences on runs of '.', '!' or '?'. For every
    sentence that contains the keyword (case-insensitive substring match),
    that sentence plus up to ``window_size`` sentences on each side are
    selected. Each selected sentence appears once in the result, in document
    order, even when the windows of nearby hits overlap.

    Args:
        text: Source text. Any non-string value yields an empty string.
        keyword: Phrase to look for.
        window_size: Number of neighbouring sentences kept on each side.

    Returns:
        The selected sentences joined by single spaces (the sentence
        terminators are not kept), or "" when the keyword does not occur.
    """
    if not isinstance(text, str):
        return ""
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    needle = keyword.lower()  # hoisted: does not change between sentences
    selected = set()
    for idx, sentence in enumerate(sentences):
        if needle in sentence.lower():
            start = max(0, idx - window_size)
            end = min(len(sentences), idx + window_size + 1)
            # A set merges overlapping windows so no sentence is repeated.
            selected.update(range(start, end))
    return ' '.join(sentences[i] for i in sorted(selected))

def remove_keyword_from_text(text, keyword):
    """Delete every case-insensitive occurrence of ``keyword`` from ``text``.

    The keyword is matched literally (regex metacharacters are escaped).
    Afterwards each run of whitespace is collapsed to a single space and the
    result is trimmed. Any non-string ``text`` yields an empty string.
    """
    if isinstance(text, str):
        literal_pattern = re.escape(keyword)
        without_keyword = re.sub(literal_pattern, '', text, flags=re.IGNORECASE)
        return re.sub(r'\s+', ' ', without_keyword).strip()
    return ""

# --- Data Loading ---
def load_and_combine_parquet_files(file_pattern, keyword, window_size=3, remove_keyword=True):
    """Load parquet files and build keyword-centred context snippets.

    Every file matching ``file_pattern`` is read. Rows whose ``full_text``
    contains ``keyword`` are kept, and a ``context_text`` column is added
    holding the sentences around each occurrence (see
    ``extract_sentences_around_keyword``). When a file has no ``full_text``
    column it is filled from ``doc_content`` if present, otherwise with "".

    Args:
        file_pattern: Glob pattern selecting the parquet files.
        keyword: Phrase to search for; matched literally and
            case-insensitively.
        window_size: Sentences kept on each side of a matching sentence.
        remove_keyword: If true, the keyword itself is stripped from
            ``context_text``.

    Returns:
        A DataFrame of all matching rows from all files, restricted to rows
        with a non-empty ``context_text``.

    Raises:
        ValueError: If no file matches the pattern, or no row in any file
            contains the keyword.
    """
    all_files = sorted(glob.glob(file_pattern))
    if not all_files:
        raise ValueError(f"No files match pattern: {file_pattern}")
    dfs = []
    for file_path in all_files:
        df = pd.read_parquet(file_path)
        if 'full_text' not in df.columns:
            df['full_text'] = df['doc_content'] if 'doc_content' in df.columns else ""
        # regex=False: the keyword is plain text, so characters such as '+',
        # '(' or '.' must not be interpreted as a pattern. case=False keeps
        # this row filter consistent with the case-insensitive matching done
        # when contexts are extracted and when the keyword is removed.
        has_keyword = df['full_text'].astype(str).str.contains(
            keyword, case=False, regex=False, na=False
        )
        filtered_df = df[has_keyword].copy()
        if not filtered_df.empty:
            filtered_df['context_text'] = filtered_df['full_text'].apply(
                lambda t: extract_sentences_around_keyword(t, keyword, window_size)
            )
            if remove_keyword:
                filtered_df['context_text'] = filtered_df['context_text'].apply(
                    lambda t: remove_keyword_from_text(t, keyword)
                )
            dfs.append(filtered_df)
    if not dfs:
        raise ValueError(f"No data found containing keyword: {keyword}")
    combined_df = pd.concat(dfs, ignore_index=True)
    # Drop rows that ended up without usable context (missing or empty).
    combined_df = combined_df.dropna(subset=['context_text'])
    combined_df['context_text'] = combined_df['context_text'].astype(str)
    combined_df = combined_df[combined_df['context_text'] != '']
    return combined_df


def check_random_samples_from_list(text_list, keyword, n_samples=1000):
    """Sample texts at random and report how many contain ``keyword``.

    Up to ``n_samples`` texts are drawn with a fixed seed, so repeated calls
    on the same list give the same sample. When the list has no more than
    ``n_samples`` items, all of them are used. Matching is a case-insensitive
    substring test. A short summary is printed.

    Args:
        text_list: List of strings to sample from.
        keyword: Phrase to look for.
        n_samples: Maximum number of texts to inspect.

    Returns:
        A tuple ``(percent_with_keyword, samples_with_keyword,
        samples_without_keyword)``. The percentage is 0.0 for an empty list.
    """
    import random

    # Adjust sample size if list is smaller than requested samples
    sample_size = min(n_samples, len(text_list))

    if sample_size < n_samples:
        print(f"Warning: List has only {len(text_list)} items. Using all items instead of {n_samples}.")

    if sample_size < len(text_list):
        # A private generator seeded with 42 draws the same sample as seeding
        # the module-level generator would, without resetting the global
        # random state for the rest of the notebook.
        sample_texts = random.Random(42).sample(text_list, sample_size)
    else:
        sample_texts = text_list.copy()

    needle = keyword.lower()
    samples_with_keyword = []
    samples_without_keyword = []
    for text in sample_texts:
        if needle in text.lower():
            samples_with_keyword.append(text)
        else:
            samples_without_keyword.append(text)

    count_with_keyword = len(samples_with_keyword)
    # Guard the division: an empty input list gives a sample size of zero.
    percent_with_keyword = (count_with_keyword / sample_size) * 100 if sample_size else 0.0

    print("Random sample results:")
    print(f"- Sample size: {sample_size}")
    print(f"- Texts containing keyword '{keyword}': {count_with_keyword} ({percent_with_keyword:.2f}%)")

    return percent_with_keyword, samples_with_keyword, samples_without_keyword


In [None]:
# Corpus shards to scan: every file named "split_*.parquet" under corpora/corpus/.
file_pattern = "corpora/corpus/split_*.parquet"
# Phrase whose surrounding sentences are collected.
keyword = "ხელოვნური ინტელექტი"
# Sentences kept on each side of a sentence containing the keyword; adjust as needed.
window_size = 1

# Build one DataFrame of keyword contexts from all matching shards.
df = load_and_combine_parquet_files(
    file_pattern=file_pattern,
    keyword=keyword,
    window_size=window_size,
)

# The loader already strips the keyword (remove_keyword defaults to True);
# the removal is repeated here on the returned contexts, then each context is
# cleaned, tokenized and stemmed.
df['context_text'] = df['context_text'].apply(remove_keyword_from_text, keyword=keyword)
df['processed_text'] = df['context_text'].apply(preprocess_text)
texts = df['processed_text'].to_list()

In [None]:
# Save to disk for later reuse. The target directory is created first so the
# write does not fail when data/ does not exist yet (e.g. on a fresh checkout).
output_path = Path('data/processed_corpus.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)

In [None]:
# Sanity check on the processed texts: the keyword was stripped from the
# contexts before preprocessing (and the texts were then stemmed), so few or
# no sampled texts are expected to still contain it.
percent_with_keyword, with_keyword, without_keyword = check_random_samples_from_list(texts, keyword)

Random sample results:
- Sample size: 1000
- Texts containing keyword 'ხელოვნური ინტელექტი': 0 (0.00%)
