In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

In [None]:
import string

# **Data Loading & Formatting**

In [None]:
# Extract the raw data from the txt file.
def load_text(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The full contents of the file.

    Raises:
        OSError: If the file cannot be opened or read (for example
            FileNotFoundError when the path does not exist).
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Build a dictionary that pairs each image id with its list of captions.
def load_description(doc):
    """Group the captions found in ``doc`` by image id.

    Each line is split on whitespace. The first token, cut at its first '.',
    is the image id (so a token such as 'name.jpg#0' becomes 'name'); the
    remaining tokens, re-joined with single spaces, form the caption. Lines
    with fewer than two tokens are ignored.

    Args:
        doc: The whole caption file as a single string.

    Returns:
        A dict mapping image id to the list of its captions, in the order
        they appear in ``doc``.
    """
    captions_by_image = {}
    for raw_line in doc.split('\n'):
        fields = raw_line.split()
        # A usable line needs an id token plus at least one caption word.
        if len(fields) >= 2:
            image_id = fields[0].split('.')[0]
            caption = ' '.join(fields[1:])
            captions_by_image.setdefault(image_id, []).append(caption)
    return captions_by_image

In [None]:
# Define a path to the Flickr8k.token.txt file of the dataset.
tokenFile = '/content/drive/My Drive/College/IRS Innovative/flicker8k-dataset/Flickr8k_text/Flickr8k.token.txt'
# Read the whole token file into one string, then group its captions by image id.
doc = load_text(tokenFile)
descriptions = load_description(doc)
# len(descriptions) is the number of distinct image ids, not the number of captions.
print('Number of samples loaded: %d ' % len(descriptions))


Loaded: 8092 


# **Data Cleaning**

In [None]:
# Pre-process the captions as follows.
# 1. Take a caption and split it into words.
# 2. Convert each word to lowercase.
# 3. Remove punctuation, if there is any.
# 4. Drop words that are a single character long.
# 5. Keep only words that consist entirely of alphabetic characters.
def clean_description(description):
    """Normalise every caption in ``description`` in place.

    Each caption is split on whitespace; every word is lower-cased and
    stripped of the characters in ``string.punctuation``; words that are
    then shorter than two characters or not purely alphabetic are dropped;
    the surviving words are re-joined with single spaces.

    Args:
        description: Dict mapping a key to a list of caption strings. The
            dict is modified in place: each value is replaced by a new list
            of cleaned captions.

    Returns:
        The same ``description`` dict that was passed in.
    """
    # Translation table that deletes every punctuation character.
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in description.items():
        cleaned = []
        for desc in desc_list:
            words = (word.lower().translate(table) for word in desc.split())
            # Length is checked after punctuation removal, so "a," is dropped too.
            kept = [word for word in words if len(word) > 1 and word.isalpha()]
            cleaned.append(' '.join(kept))
        # Re-assigning an existing key does not resize the dict, so it is
        # safe to do while iterating over items().
        description[key] = cleaned

    return description
# clean_description rewrites the dict in place and returns that same dict,
# so after this line the raw captions are no longer available.
descriptions = clean_description(descriptions)

# **Data Saving**

In [None]:
# Build one "<image id> <caption>" entry per caption, join the entries
# with '\n', and write the resulting string to a text file.
def save_description(description,filename):
    """Write all captions to ``filename``, one caption per line.

    Args:
        description: Dict mapping an image id (str) to a list of caption
            strings.
        filename: Path of the output file. It is opened in write mode, so
            an existing file is overwritten.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in description.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the file even if write() fails.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

In [None]:
# Destination for the captions file, stored next to the original token file on Drive.
descrOut = '/content/drive/My Drive/College/IRS Innovative/flicker8k-dataset/Flickr8k_text/description_new.txt'
save_description(descriptions, descrOut)


# **Data Analysis**

In [None]:
# Collect the distinct words that occur across all captions.
def to_vocabulary(description):
    """Return the set of distinct whitespace-separated words in all captions.

    Args:
        description: Dict mapping a key to a list of caption strings.

    Returns:
        A set containing every word that appears in at least one caption.
        The set is empty when there are no captions.
    """
    all_desc = set()
    # Plain loops instead of a list comprehension used only for its side
    # effects, which would build and discard a list of None values.
    for desc_list in description.values():
        for desc in desc_list:
            all_desc.update(desc.split())
    return all_desc

In [None]:
# Build the set of distinct words over all captions and report its size.
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 8763
