# How to Use

## Step 1
Run the first cell to install the package that parses the captions. The install is sometimes a bit flaky on Rivanna, but it should work after a retry or two.

## Step 2
Run the second cell and check the mini_test.csv just to make sure everything is running properly.

## Step 3
Add your first name to the csv title:
`'firstname_1k_set_n.csv'`
where n is the set number you are running.

## Step 4
Change the `'firstname'` value to your first name in all the cells. Then run all of the cells at once: they execute one after another as each completes, so you can leave the notebook running in the background.

## Step 5
Upload the resulting CSV files to the final dataset folder on GitHub.

## Notes on Data Decisions
1. Some images don't have a wikicommons description (parsed caption). If this is the case, we return a `None` type, which can turn into an NA in your DF.
2. Some WikiCommons image captions are not written in English. Most of these, I've found, also lack a tag for the language they are written in. I'll keep looking into this, but this may just hurt our model.
3. I'm working on fixing link functionality, but right now links are output as `[anchor text](link)`. I want it to output just the anchor text.

In [None]:
import pandas as pd
import requests
import html2text

def get_random_commons_ids(num_files=1):
    """Fetch random file pages from the Wikimedia Commons API.

    Args:
        num_files: How many random files to request (sent as ``rnlimit``).

    Returns:
        A list of ``[page_id, title]`` pairs, one per file in the API
        response. When ``num_files == 1`` the single pair itself is returned
        instead of a one-element list.
    """
    response = requests.get(
        url='https://commons.wikimedia.org/w/api.php',
        params={
            'action': 'query',
            'list': 'random',
            'rnnamespace': 6,  # namespace 6 holds files
            'rnlimit': num_files,
            'format': 'json',
        },
    )
    random_files = response.json()['query']['random']

    id_title_pairs = []
    for entry in random_files:
        id_title_pairs.append([entry['id'], entry['title']])

    # A single requested file is unwrapped rather than returned inside a list.
    return id_title_pairs[0] if num_files == 1 else id_title_pairs

#
# Functions that look up a Wikimedia Commons upload's metadata and map it to its corresponding Wikidata entities
#

def get_commons_description(wc_title):
    """Fetch the extended metadata of a Wikimedia Commons file.

    Args:
        wc_title: Page title of the file, e.g. the ``title`` value returned
            by the random-files query.

    Returns:
        The ``extmetadata`` dict of the file's first ``imageinfo`` entry, or
        an empty dict when the response contains no page, no ``imageinfo``
        entry, or no ``extmetadata``.
    """
    # Pass the title through `params` so requests URL-encodes it. Pasting it
    # into the URL by hand breaks the query for titles that contain
    # characters such as '&', '#', '+' or '?'.
    response = requests.get(
        url='https://commons.wikimedia.org/w/api.php',
        params={
            'action': 'query',
            'titles': wc_title,
            'prop': 'imageinfo',
            'iiprop': 'extmetadata',
            'format': 'json',
        },
    )
    data = response.json()

    pages = data.get('query', {}).get('pages', {})
    if not pages:
        # No page in the response (e.g. an API error payload).
        return {}

    # A single title was requested, so only the first page is relevant.
    page = next(iter(pages.values()))

    # `imageinfo` may be absent or an empty list; treat both as "no metadata".
    image_info = page.get('imageinfo') or [{}]
    return image_info[0].get('extmetadata', {})

def get_q_number(wc_page_id, verbose=False):
    """Look up the Wikidata items a Commons file is tagged as depicting.

    Args:
        wc_page_id: Commons page id of the file; the entity id queried is
            ``M<wc_page_id>``.
        verbose: When true, print ``Collected`` once all items were read.

    Returns:
        A list with the ``id`` of every ``P180`` (depicts) statement value,
        or ``None`` when the entity or its statements are missing, there is
        no non-empty ``P180`` entry, or any depicts statement lacks a
        ``datavalue``/``value``.
    """
    media_id = f'M{wc_page_id}'

    response = requests.get(
        url='https://commons.wikimedia.org/w/api.php',
        params={
            'action': 'wbgetentities',
            'ids': media_id,
            'format': 'json',
            'props': 'claims',
        },
    )
    payload = response.json()

    # Walk down to the entity's statements, bailing out at the first gap.
    if 'entities' not in payload:
        return None
    entities = payload['entities']
    if media_id not in entities:
        return None
    entity = entities[media_id]
    if 'statements' not in entity:
        return None
    statements = entity['statements']

    # 'P180' is the "depicts" property; it must be present and non-empty.
    if 'P180' not in statements or not statements['P180']:
        return None

    q_numbers = []
    for statement in statements['P180']:
        snak = statement['mainsnak']
        has_value = bool(snak) and 'datavalue' in snak and 'value' in snak['datavalue']
        if not has_value:
            # One value-less depicts statement discards the whole file.
            return None
        q_numbers.append(snak['datavalue']['value']['id'])

    if verbose:
        print('Collected')

    return q_numbers

def get_wikidata_label(wd_item_id):
    """Return the English label of a Wikidata item.

    Args:
        wd_item_id: Wikidata entity id to look up (passed as ``ids``).

    Returns:
        The English label text, or ``None`` when the response has no entry
        for the item or the item has no ``en`` label.
    """
    query = {
        'action': 'wbgetentities',
        'ids': wd_item_id,
        'format': 'json',
        'props': 'labels',
        'languages': 'en',
    }
    payload = requests.get('https://www.wikidata.org/w/api.php', params=query).json()

    if 'entities' not in payload:
        return None
    entities = payload['entities']
    if wd_item_id not in entities:
        return None
    entity = entities[wd_item_id]
    if 'labels' not in entity or 'en' not in entity['labels']:
        return None

    return entity['labels']['en']['value']

def get_wikidata_description(wd_item_id):
    """Return the English description of a Wikidata item.

    Args:
        wd_item_id: Wikidata entity id to look up (passed as ``ids``).

    Returns:
        The English description text, or ``None`` when the response has no
        entry for the item or the item has no ``en`` description.
    """
    query = {
        'action': 'wbgetentities',
        'ids': wd_item_id,
        'format': 'json',
        'props': 'descriptions',
        'languages': 'en',
    }
    payload = requests.get('https://www.wikidata.org/w/api.php', params=query).json()

    if 'entities' not in payload:
        return None
    entities = payload['entities']
    if wd_item_id not in entities:
        return None
    entity = entities[wd_item_id]
    if 'descriptions' not in entity or 'en' not in entity['descriptions']:
        return None

    return entity['descriptions']['en']['value']

#
# Define function that will construct a convenient dataframe by repeatedly making calls to the wiki commons/data APIs
# until a sufficiently large data set has been collected.
# • Note that the number of calls ≥ number of rows in returned dataframe
# • To create a CSV containing the returned dataframe:
#       df = siki_wiki(25);
#       df.to_csv('df_wiki.csv');
#

# Converts HTML to text, rendering links as just their anchor text.
class MyHTML2Text(html2text.HTML2Text):
    """HTML-to-text converter that drops link targets and keeps anchor text."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # html2text never calls a `handle_a` hook, so overriding it has no
        # effect on the output. `ignore_links` is the supported option for
        # emitting only the anchor text instead of `[anchor text](link)`.
        self.ignore_links = True

    def handle_a(self, link, title, text):
        """Return only the text of the link.

        Kept so existing callers of this method keep working; the link
        formatting itself is controlled by ``ignore_links``.
        """
        return text

# Create an instance of your custom HTML2Text class
h = MyHTML2Text()

def siki_wiki(row_count_wanted, row_print=25):
    """Build a DataFrame of random Commons files that have depicts statements.

    Random files are requested in batches of 100. Files for which
    ``get_q_number`` returns nothing are skipped; for every other file the
    Commons metadata and the Wikidata labels/descriptions of its depicted
    items are fetched and one row is added.

    Args:
        row_count_wanted: Number of rows to collect. Collection stops as soon
            as this many rows exist; a value <= 0 returns an empty DataFrame
            without making any request.
        row_print: Print the running row count whenever it is a multiple of
            this value. A falsy value (0 or None) disables progress output.

    Returns:
        A DataFrame with the columns ``file_name``, ``wiki_commons_id``,
        ``wiki_data_id``, ``depicts``, ``description``, ``parsed caption``
        and ``commons_description``.
    """
    columns = [
        'file_name', 'wiki_commons_id', 'wiki_data_id', 'depicts',
        'description', 'parsed caption', 'commons_description',
    ]
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per file copies the whole table on
    # every addition.
    rows = []

    while len(rows) < row_count_wanted:
        for wc_id, wc_title in get_random_commons_ids(100):
            # Skip files without depicts statements before making any further
            # request for them; they never produce a row.
            q_numbers = get_q_number(wc_id)
            if not q_numbers:
                continue

            wc_description = get_commons_description(wc_title)

            # The caption is only available when the metadata carries an
            # 'ImageDescription' entry with a 'value'.
            if 'ImageDescription' not in wc_description or 'value' not in wc_description['ImageDescription']:
                parsed_caption = None
            else:
                parsed_caption = h.handle(wc_description['ImageDescription']['value'])

            # A fresh dict per file, so no value can leak between rows.
            rows.append({
                'file_name': wc_title,
                'wiki_commons_id': wc_id,
                'wiki_data_id': q_numbers,
                'depicts': [get_wikidata_label(q) for q in q_numbers],
                'description': [get_wikidata_description(q) for q in q_numbers],
                'parsed caption': parsed_caption,
                'commons_description': wc_description if wc_description else 'NA',
            })

            collected = len(rows)
            if row_print and collected % row_print == 0:
                print(collected)

            if collected >= row_count_wanted:
                break

    return pd.DataFrame(rows, columns=columns)

# Example usage / smoke test: collect a small 5-row sample and save it so the
# output can be inspected before starting the 1,000-row runs.
df = siki_wiki(5,1) # row_print=1 prints the running row count after every added row; the default is every 25 rows
df.to_csv('mini_test.csv')

1
2


In [None]:
# Collect 1,000 rows and write them to the CSV file for set 1.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_1.csv')

In [None]:
# Collect 1,000 rows and write them to the CSV file for set 2.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_2.csv')

In [None]:
# Collect 1,000 rows and write them to the CSV file for set 3.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_3.csv')

In [None]:
# Collect 1,000 rows and write them to the CSV file for set 4.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_4.csv')

In [None]:
# Collect 1,000 rows and write them to the CSV file for set 5.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_5.csv')

In [None]:
# Collect 1,000 rows and write them to the CSV file for set 6.
df = siki_wiki(1000)
df.to_csv('Will_1k_set_6.csv')