In [None]:
import spacy
import os
import numpy as np
import pandas as pd
from google.colab import drive
import requests

In [None]:
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/CAPSTONE')
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/CAPSTONE'

In [None]:
os.listdir()

['Will_1k_set_1.csv', 'data']

In [None]:
# API only allows 1,000 observations to be fetched in one sitting
# combine different csv files into one large data frame

dfs = []
for i in range(1, 26):
    filename = f"Will_1k_set_{i}.csv"
    df = pd.read_csv(filename)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv("Will_set_merged.csv", index=False)

In [None]:
# import csv file, drop irrelevant column
#data = pd.read_csv('Will_1k_set_1.csv')
data = pd.read_csv('Will_set_merged.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)
data.head()

Unnamed: 0,file_name,wiki_commons_id,wiki_data_id,depicts,description,parsed caption,commons_description
0,File:Plac Wielkopolski w Poznaniu - czerwiec 2...,60332930,['Q11819181'],['Wielkopolski Square in Poznań'],"['city square in Poznań, Poland']",Plac Wielkopolski w Poznaniu.\n\n,"{'DateTime': {'value': '2017-06-24 03:53:38', ..."
1,File:2018-10-10 Gymnastics at 2018 Summer Yout...,77382844,['Q93888038'],['Yin Dehang'],['Chinese artistic gymnast'],Horizontal bar qualification of the boys' arti...,"{'DateTime': {'value': '2019-03-16 21:07:55', ..."
2,File:CP 00118destructions rue de Tournai 1914.jpg,84142136,['Q39074508'],['Siege of Lille'],['1914 siege'],Lille (France - Nord department) - Destruction...,"{'DateTime': {'value': '2019-11-19 16:24:13', ..."
3,"File:Waycross, Georgia Historic District (40).jpg",86478066,['Q30622481'],['Waycross Historic District'],"['historic district in Waycross, Georgia']","Waycross Historic District, Waycross, Georgia\...","{'DateTime': {'value': '2020-02-01 01:11:58', ..."
4,File:J31 867 Kraftwerkskaverne.jpg,138279755,['Q963375'],['diesel generator'],['combination of a diesel engine with an elect...,Zwei Dieselgeneratoren für die Gesamtversorgun...,"{'DateTime': {'value': '2023-07-16 20:54:40', ..."


In [None]:
# verify that all csv files were indeed added together
data.shape[0]

## Original Spacy Model

In [None]:
# Load the English language model for SpaCy
nlp = spacy.load("en_core_web_sm")

def extract_tags(description):
  """
  This function takes an image description as input and extracts potential tags.

  Args:
      description (str): The text description of the image.

  Returns:
      list: A list of potential tags extracted from the description.
  """
  # Preprocess the text
  description = description.lower()
  description = description.strip() # Remove leading/trailing whitespaces

  # Perform NLP with spaCy
  doc = nlp(description)

  # Extract potential tags
  tags = []
  prev_token = None
  for token in doc:
    if token.pos_ in ["NOUN", "PROPN"]: # Consider nouns and proper nouns
      if prev_token and prev_token.pos_ in ["NOUN", "PROPN"]:
        # Combine the current and previous token for multi-word tags
        tags.append(prev_token.text + " " + token.text)
      else:
        tags.append(token.text)
    prev_token = token

  return tags

In [None]:

# Extracting a single string, test usage
image_description = "A photo of a Golden Gate Bridge at sunset, with sailboats in the foreground."
tags = extract_tags(image_description)

print("Extracted tags:", tags)

Extracted tags: ['photo', 'golden', 'golden gate', 'gate bridge', 'sunset', 'sailboats', 'foreground']


In [None]:
#######
#######
#######
#######

In [None]:
# Download larger model (medium), has to be repeated every time Colab session is opened
!python -m spacy download en_core_web_md

['Collecting en-core-web-md==3.7.1',
 '  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/42.8 MB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.2/42.8 MB\x1b[0m \x1b[31m7.2 MB/s\x1b[0m eta \x1b[36m0:00:06\x1b[0m',
 '\x1b[2K     \x1b[91m╸\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.8/42.8 MB\x1b[0m \x1b[31m11.3 MB/s\x1b[0m eta \x1b[36m0:00:04\x1b[0m',
 '\x1b[2K     \x1b[91m━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m1.4/42.8 MB\x1b[0m \x1b[31m13.5 MB/s\x1b[0m eta \x1b[36m0:00:04\x1b[0m',
 '\x1b[2K     \x1b[91m━━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m2.2/42.8 MB\x1b[0m \x1b[31m15.8 MB/s\x1b[0m eta \x1b[36m0:00:03\x1

## SpaCy update to remove articles from tags

In [None]:
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_md")

# Example text from an image description
text = "The photo shows the Eiffel Tower at sunset, surrounded by tourists taking photographs."

# Process the text
doc = nlp(text)

def remove_articles(phrase):
    # List of articles to remove
    articles = ['the', 'a', 'an']
    words = phrase.split()
    # Remove the first word if it is an article
    if words[0].lower() in articles:
        return ' '.join(words[1:])
    return phrase

# Extract noun chunks as potential tags
noun_chunks = [remove_articles(chunk.text) for chunk in doc.noun_chunks]

# Extract named entities as specific tags
named_entities = [remove_articles(entity.text) for entity in doc.ents]

In [None]:
tags = list(set(noun_chunks + named_entities))

print("Extracted Tags:", tags)

Extracted Tags: ['Eiffel Tower', 'photographs', 'photo', 'sunset', 'tourists']


## Test - Stemming:

In [None]:
# Process the text
doc2 = nlp(text2)

# Extract noun chunks as potential tags
noun_chunks2 = [remove_articles(chunk.text) for chunk in doc2.noun_chunks]

# Extract named entities as specific tags
named_entities2 = [remove_articles(entity.text) for entity in doc.ents]
tags2 = list(set(noun_chunks2 + named_entities2))

print("Extracted Tags:", tags2)

Extracted Tags: ['sailboats', 'Eiffel Tower', 'Golden Gate Bridge', 'photo', 'sunset', 'foreground']


In [None]:
#####
#####
#####

In [None]:
# download large SpaCy Model for testing purposes
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Incorporate Lemmatization

In [None]:

# Load the SpaCy model
nlp = spacy.load("en_core_web_lg")

# Example text from an image description
#text = "['Waycross Historic District']"
text = "['Horizontal bar qualification of the boys' artistic gymnastics at the 2018 Summer Youth Olympics in Buenos Aires on 10 October 2018. Depicted: Yin Dehang.']"
# A photo of a Golden Gate Bridge at sunset, with sailboats in the foreground.

# Process the text with SpaCy
doc = nlp(text)

# Function to remove leading articles and lemmatize
def clean_and_lemmatize(phrase, doc):
    articles = ['the', 'a', 'an']
    words = phrase.split()
    # Remove the first word if it is an article
    if words[0].lower() in articles:
        words = words[1:]
    # Lemmatize each word in the phrase
    lemmatized_words = []
    for word in words:
        # Find the token for this word (use the original text for matching)
        tokens = [token for token in doc if token.text.lower() == word.lower()]
        if tokens:
            # Use the lemma of the first matching token
            lemma = tokens[0].lemma_
        else:
            lemma = word  # Fallback to the original word if not found
        lemmatized_words.append(lemma)
    return ' '.join(lemmatized_words)

# Extract noun chunks and clean them
noun_chunks = [clean_and_lemmatize(chunk.text, doc) for chunk in doc.noun_chunks]

# Extract named entities, clean and lemmatize
named_entities = [clean_and_lemmatize(entity.text, doc) for entity in doc.ents]

# Combine both sets of tags, removing duplicates
tags = list(set(noun_chunks + named_entities))

print("Extracted Tags:", tags)

Extracted Tags: ['2018 Summer Youth Olympics', '2018', 'Summer Youth Olympics', '10 October', "boys' artistic gymnastic", 'Buenos Aires', "['Horizontal bar qualification", 'Yin Dehang', '10 October 2018']


In [None]:

'''
def clean_and_lemmatize(phrase, doc):
    articles = ['the', 'a', 'an']
    words = phrase.split()
    if words and words[0].lower() in articles:
        words = words[1:]
    lemmatized_words = []
    for word in words:
        tokens = [token for token in doc if token.text.lower() == word.lower()]
        if tokens:
            lemmatized_words.append(tokens[0].lemma_)  # Append lemma to the list
        else:
            lemmatized_words.append(word)  # Append original word if no token found
    return ' '.join(lemmatized_words)
  '''

def clean_and_lemmatize(phrase, doc):
    articles = ['the', 'a', 'an']
    words = phrase.split()
    # Remove the first word if it is an article
    if words[0].lower() in articles:
        words = words[1:]
    # Lemmatize each word in the phrase
    lemmatized_words = []
    for word in words:
        # Find the token for this word (use the original text for matching)
        tokens = [token for token in doc if token.text.lower() == word.lower()]
        if tokens:
            # Use the lemma of the first matching token
            lemma = tokens[0].lemma_
        else:
            lemma = word  # Fallback to the original word if not found
        lemmatized_words.append(lemma)
    return ' '.join(lemmatized_words)

def process_text(text):
    stripped_text = text.strip("['").strip("']")
    doc = nlp(stripped_text)
    #noun_chunks = [clean_and_lemmatize(chunk.text, doc) for chunk in doc.noun_chunks]
    noun_chunks = [clean_and_lemmatize(chunk.text, doc) for chunk in doc.noun_chunks]
    named_entities = [clean_and_lemmatize(entity.text, doc) for entity in doc.ents]
    return list(set(noun_chunks + named_entities))

# Apply the function to the DataFrame
results = data['parsed caption'].apply(process_text)
print(results)

0                        [Poznań, Wielkopolski Square]
1                                         [Yin Dehang]
2                                       [siege, Lille]
3      [Historic District, Waycross Historic District]
4                                   [diesel generator]
                            ...                       
995                                       [Sam Arnold]
996                                          [Končiny]
997                                              [Sun]
998                       [Kilbarchan railway station]
999                                   [theale, mining]
Name: depicts, Length: 1000, dtype: object


In [None]:
# extract single observation
results[115]

['Grote Kerk']

In [None]:
# TO-DO-NEXT:
# ground truth = depicts column
#   --> if tag appears anywhere in depicts, it's a hit
#       tags don't have to match depicts statement because of inconsistencies

In [2]:
test = ['2018 Summer Youth Olympics', '2018', 'Summer Youth Olympics', '10 October', "boys' artistic gymnastic", 'Buenos Aires', "['Horizontal bar qualification", 'Yin Dehang', '10 October 2018']

In [3]:
test

['2018 Summer Youth Olympics',
 '2018',
 'Summer Youth Olympics',
 '10 October',
 "boys' artistic gymnastic",
 'Buenos Aires',
 "['Horizontal bar qualification",
 'Yin Dehang',
 '10 October 2018']

In [20]:
def get_qnums(tag_list):
    qnum_dict = {}
    for s in tag_list:
        try:
            url = f'https://www.wikidata.org/w/api.php?action=wbgetentities&sites=enwiki&titles={s}&languages=en&props=labels&format=json'
            response = requests.get(url).json()
            qnum = next(iter(response['entities']))
            if qnum.startswith('Q'):
                qnum_dict[s] = qnum
            else:
                #qnum_dict[s] = 'None'
                pass
        except:
            qnum_dict[s] = None
    # qnum_list = list(qnum_dict.values())
    # return qnum_list
    return qnum_dict

In [12]:
test_qnums = get_qnums(test)

In [13]:
test_qnums

{'2018 Summer Youth Olympics': 'Q870879',
 '2018': 'Q25291',
 'Buenos Aires': 'Q1486',
 'Yin Dehang': 'Q93888038'}

In [19]:
q_num_list = list(test_qnums.values())
q_num_list

['Q870879', 'Q25291', 'Q1486', 'Q93888038']

In [18]:
tag_suggestions = list(test_qnums.keys())
tag_suggestions

['2018 Summer Youth Olympics', '2018', 'Buenos Aires', 'Yin Dehang']