In [None]:
import requests
import pandas as pd
from time import sleep
from IPython.display import clear_output

In [None]:
def getData(continueVariable, startDate, endDate):
    """Fetch one follow-up page of block log events from the MediaWiki API.

    The request is sent through the module-level session ``S`` to the
    module-level endpoint ``URL``.

    Parameters
    ----------
    continueVariable : str
        The ``lecontinue`` token returned with the previous page. An empty
        string means there is nothing left to fetch.
    startDate, endDate : str
        Values sent as the ``lestart`` / ``leend`` request parameters.

    Returns
    -------
    tuple
        ``(LOGS, continueVariable)``: the list found under
        ``DATA["query"]["logevents"]`` and the next ``lecontinue`` token
        (``''`` when the response carries no continuation). When called
        with an empty token, the sentinel ``(-1, '')`` is returned and no
        request is made.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    if continueVariable == '':
        # End of records: keep the historical sentinel return value.
        return -1, ''

    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "logevents",
        "lelimit": "500",
        "letype": "block",
        "lestart": startDate,
        "leend": endDate,
        "ledir": "newer",
        "lecontinue": continueVariable,
    }

    R = S.get(url=URL, params=PARAMS)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    R.raise_for_status()
    DATA = R.json()

    LOGS = DATA["query"]["logevents"]
    # A response without a continuation token marks the last page.
    continueVariable = DATA.get('continue', {}).get('lecontinue', '')
    return LOGS, continueVariable

In [None]:
def updateCSV(year, data):
    """Append a page of log records to ``scrapped_data/<year>.csv``.

    Parameters
    ----------
    year : int or str
        Used to build the file name ``scrapped_data/<year>.csv``.
    data : list of dict
        Records to append; converted with ``pd.DataFrame.from_records``.

    Notes
    -----
    * The header row is written only when the file is new or empty, so
      repeated appends no longer scatter header lines through the data.
    * When the file already has a header, the new rows are aligned to its
      columns before being written: columns missing from ``data`` become
      empty cells, and columns not present in the header are dropped.
      This keeps every row under the right column even if pages carry
      different key sets.
    * The output directory is created if it does not exist.
    * An empty ``data`` is a no-op.
    """
    import os

    df = pd.DataFrame.from_records(data)
    if df.empty:
        # Nothing to append; avoid writing a blank line into the file.
        return

    file_path = 'scrapped_data'
    file_name = f'{file_path}/{year}.csv'
    os.makedirs(file_path, exist_ok=True)

    file_has_content = os.path.isfile(file_name) and os.path.getsize(file_name) > 0
    if file_has_content:
        # Reuse the column layout already on disk so appended rows line up.
        existing_columns = pd.read_csv(file_name, nrows=0).columns
        df = df.reindex(columns=existing_columns)

    df.to_csv(file_name, mode='a', index=False, header=not file_has_content)


In [None]:
def firstCall(startDate, endDate, year):
    """Fetch the first page of block log events for a period and store it.

    The request is sent through the module-level session ``S`` to the
    module-level endpoint ``URL``; the returned records are appended to the
    year's CSV file via ``updateCSV``.

    Parameters
    ----------
    startDate, endDate : str
        Values sent as the ``lestart`` / ``leend`` request parameters.
    year : int or str
        Passed to ``updateCSV`` to select the output file.

    Returns
    -------
    tuple
        ``(LOGS, continueVariable)``: the list found under
        ``DATA["query"]["logevents"]`` and the ``lecontinue`` token for the
        next page, or ``''`` when the response has no continuation (i.e.
        everything fit in this single page).

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "logevents",
        "lelimit": "500",
        "letype": "block",
        "lestart": startDate,
        "leend": endDate,
        "ledir": "newer",
    }

    R = S.get(url=URL, params=PARAMS)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    R.raise_for_status()
    DATA = R.json()

    LOGS = DATA["query"]["logevents"]
    # The 'continue' entry is absent when there is no further page; the
    # original indexing raised KeyError in that case.
    continueVariable = DATA.get('continue', {}).get('lecontinue', '')

    # Store the first page in the local CSV for this year.
    updateCSV(year, LOGS)
    return LOGS, continueVariable

In [None]:
def getAllYearsData(years):
    """Download the block log for each year in ``years`` into per-year CSVs.

    For every year, the period from January 1st of that year to January 1st
    of the next is requested: ``firstCall`` fetches and stores the first
    page, then ``getData`` is called with the continuation token until it
    comes back empty, and each page is appended with ``updateCSV``.

    Parameters
    ----------
    years : iterable of int
        Years to fetch. (The argument is now actually used; previously the
        global ``date_range`` was iterated regardless of what was passed.)

    Returns
    -------
    None
    """
    for year in years:
        startDate = str(year) + '-01-01T00:00:00Z'
        endDate = str(year + 1) + '-01-01T00:00:00Z'

        LOGS, continueVariable = firstCall(startDate, endDate, year)
        count = len(LOGS)

        while continueVariable != '':
            newLogs, continueVariable = getData(continueVariable, startDate, endDate)
            count += len(newLogs)
            # Persist every page, including the last one (the page that
            # comes back with an empty continuation token).
            updateCSV(year, newLogs)

        print(f'{year} DONE! Total Records fetched {count}')


In [None]:
# Scraper configuration: API endpoint, shared HTTP session, and the years to fetch.
URL = "https://en.wikipedia.org/w/api.php"
S = requests.Session()
date_range = list(range(2004, 2018))

In [None]:
# Scraping the data takes a long time and needs a stable internet connection.
getAllYearsData(date_range)

In [None]:
# If you would rather use our already-scraped data, download it from the Google
# folder linked in the README, and change `path` below from 'scrapped_data' to
# 'scrapped_data_0'.

path = 'scrapped_data'
frames = []
for year in range(2004, 2018):
    csvData = pd.read_csv(f'{path}/{year}.csv', on_bad_lines='skip')
    print(year)
    if year in [2007, 2008, 2009]:
        # These years' files carry two extra columns; drop them (presumably
        # so the yearly frames share one layout). errors='ignore' keeps the
        # cell from crashing if a file happens not to have them.
        csvData = csvData.drop(columns=['actionhidden', 'suppressed'], errors='ignore')
    frames.append(csvData)

# Concatenate once instead of growing the frame inside the loop (which copies
# all previous rows on every iteration); ignore_index gives unique row labels.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Print a concise summary of the combined scraped data.
df.info()

In [None]:
# Show the first rows of the combined scraped data.
df.head()

In [None]:
# Preprocess the scraped data (simple, regex-based, and limited).

# Turn single quotes into double quotes so the patterns below only need to
# match one quoting style.
df['params'] = df['params'].replace("'", '"', regex=True)
# Extract 'duration'
df['duration'] = df['params'].str.extract(r'"duration":\s*"([^"]+)"', expand=False)
# Extract 'flags' (the text between the brackets, split on commas)
df['flags'] = df['params'].str.extract(r'"flags":\s*\[([^\]]+)\]', expand=False)
df['flags'] = df['flags'].str.split(',')
# Extract 'expiry'; values that cannot be parsed as dates become NaT
df['expiry'] = df['params'].str.extract(r'"expiry":\s*"([^"]+)"', expand=False)
df['expiry'] = pd.to_datetime(df['expiry'], errors='coerce')

# Strip the 'User:' text from the 'title' column. regex=False makes this a
# literal replacement regardless of the pandas version's default.
df['title'] = df['title'].str.replace('User:', '', regex=False)

# Convert 'timestamp' to datetime; unparseable values become NaT
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Drop rows whose 'timestamp' is not a valid date-time. The explicit copy
# makes the assignment below operate on an independent frame and avoids
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['timestamp']).copy()

# Format 'timestamp' as a YYYY-MM-DD string
df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d')

# Display the modified DataFrame
print("Modified DataFrame:")
print(df)


In [None]:
# Show the first rows of the modified DataFrame.
df.head()

In [None]:
# Restrict the frame to the columns used from here on.
df_cleaned = df.loc[
    :,
    [
        "title",
        "action",
        "user",
        "timestamp",
        "comment",
        "duration",
        "flags",
        "expiry",
    ],
]

In [None]:
# Display the cleaned DataFrame.
df_cleaned

In [None]:
# Distribution of 'duration' among rows whose action is 'block'.
is_block = df_cleaned["action"] == 'block'
result = df_cleaned.loc[is_block, "duration"].value_counts()

# Lift the row limit so every distinct duration is printed.
pd.set_option('display.max_rows', None)
print("Value Counts for 'duration' where 'action' is 'block':")
print(result)

# Put the display option back to its default.
pd.reset_option('display.max_rows')

In [None]:
# Write the cleaned block-log data to CSV without the index column.
# NOTE(review): the 'data_original' directory must already exist — confirm.
output_path = 'data_original/block_log_data_new.csv'
df_cleaned.to_csv(output_path, index=False)