In [9]:
from typing import List, Optional

import pandas as pd
from fastparquet import ParquetFile

# Utils

In [4]:
def filter_contains_any_word(data_list: List[str], words: List[str]) -> Optional[List[str]]:
    if (data_list is None) or (words is None):
        return None
    filtered_data_list = []
    for word in words:
        for data in data_list:
            if word in data:
                filtered_data_list.append(data)
    filtered_data_list = filtered_data_list if len(filtered_data_list) > 0 else None 
    return filtered_data_list

# Load data

In [12]:
all_users_df = ParquetFile("all_users_edits.parquet").to_pandas()

In [26]:
wiki_pages_df = pd.read_csv('wiki_pages.csv')

In [30]:
print(wiki_pages_df.iloc[5:10])

                   page_name  \
5          "O"-Jung.Ban.Hap.   
6  "The Spaghetti Incident?"   
7               "אבא גוריון"   
8                 "אינטגרנד"   
9          "אני רופא, לא..."   

                                            page_url  \
5  https://he.wikipedia.org/wiki/%22O%22-Jung.Ban...   
6  https://he.wikipedia.org/wiki/%22The_Spaghetti...   
7  https://he.wikipedia.org/wiki/%22%D7%90%D7%91%...   
8  https://he.wikipedia.org/wiki/%22%D7%90%D7%99%...   
9  https://he.wikipedia.org/wiki/%22%D7%90%D7%A0%...   

      last_fetch_page_name_ts  
5  2023-09-02 21:23:19.138335  
6  2023-09-02 21:23:19.139723  
7  2023-09-02 21:23:19.141204  
8  2023-09-02 21:23:19.142446  
9  2023-09-02 21:23:19.143879  


# Constants

In [92]:
kohelet_names = ["פורום קוהלת", "פורום קהלת"]

kohelet_users = [
    "meirneria", "sb236", "meir neria", "אגלי טל", "אגלי טל קהלת", "פורום קהלת", "פ. קהלת", "אדם ס.", "חדש תחת השמש", "סבורני", 
    "שחמ-טאי1", "בן קיש", "amoshav", 'ראב"ד1785', "פורום קוהלת", "סבורני", "enoshAn"
                 ]
kohelet_suspected_users = ["armorredknight"] # page name: {"ArmorredKnight": "ליברליזם"}

# Analysis

In [93]:
kohelet_mask_1 = all_users_df['user_name'].apply(lambda user: user.lower() if user else user).isin(kohelet_users)
print(f"Found {kohelet_mask_1.sum()} Kohelet users")

kohelet_df = all_users_df[kohelet_mask_1]

print(f"Number of unique page that kohelet edited: {len(kohelet_df['page_name'].unique())}")

Found 906 Kohelet users
Number of unique page that kohelet edited: 817


In [7]:
all_users_df['kohelet_talk'] = all_users_df['user_talk'].apply(filter_contains_any_word, words = kohelet_names)

In [8]:
kohelet_talk_df = all_users_df.dropna(subset=['kohelet_talk'])

In [36]:
not_in_kohelet_users_mask = kohelet_talk_df['user_name'].apply(lambda x: x not in kohelet_users)
duplicated_kohelet_talk  = kohelet_talk_df['kohelet_talk'].apply(str).duplicated()
suspected_kohelet_df = kohelet_talk_df[not_in_kohelet_users_mask & ~duplicated_kohelet_talk]

---