# Main notebook for data processing in robo_romeo project

## Imports - this should do us for the whole project. I have commented out the ones for model building later on

In [None]:
import numpy as np
from PIL import Image
import os
import string
from pickle import dump
from pickle import load
import tensorflow as tf
# from tensorflow.keras.applications.xception import Xception #to get pre-trained model Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer #for text tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.layers.merge import add
# from tensorflow.keras.models import Model, load_model
# from tensorflow.keras.layers import Input, Dense#Keras to build our CNN and LSTM
# from tensorflow.keras.layers import LSTM, Embedding, Dropout
from tqdm import tqdm_notebook as tqdm #to check loop progress
tqdm().pandas()

## Data cleaning

 - load_doc( filename ) – loads the document file and reads its contents into a single string.

In [None]:
# Load the document file into memory
def load_doc(filename):
    """Read a text file and return its whole content as one string.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        str: The complete content of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

 - img_capt( filename ) – To create a description dictionary that will map images with all 5 captions.

In [None]:
# get all images with their captions
def img_capt(filename):
    """Map every image name to the list of captions found for it.

    Each non-blank line of the file is expected to hold an image token and a
    caption separated by a tab, where the image token is the image name
    followed by '#' and a caption index (e.g. 'photo.jpg#0').

    Args:
        filename: Path of the caption file; it is read with load_doc().

    Returns:
        dict: image name (without the '#<index>' suffix) -> list of caption
        strings, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    descriptions = {}
    # Split on a real newline ('\n'); splitting on the letter 'n' cut the
    # text in the middle of words.
    for line in load_doc(filename).split('\n'):
        # Skipping blank lines (instead of blindly dropping the last element)
        # keeps the final caption when the file has no trailing newline.
        if not line.strip():
            continue
        # Split on the first tab only, so a caption containing a tab survives.
        img, sep, caption = line.partition('\t')
        if not sep:
            raise ValueError("caption line has no tab separator: %r" % line)
        # Drop the '#<index>' suffix; equivalent to img[:-2] for single-digit
        # indices but also correct for longer ones.
        image_name = img.rsplit('#', 1)[0]
        descriptions.setdefault(image_name, []).append(caption)
    return descriptions

 - txt_clean( descriptions ) – cleans the data, taking all descriptions as input. It performs several types of cleaning: uppercase-to-lowercase conversion, punctuation removal, and removal of words that contain numbers.



In [None]:
#Data cleaning function to convert all upper case alphabets to lowercase, removing punctuations and words containing numbers
def txt_clean(captions):
    """Normalise every caption in place and return the same dictionary.

    For each caption: hyphens become spaces, words are lower-cased,
    punctuation is stripped, and words that are a single character long or
    contain non-alphabetic characters are dropped.

    Args:
        captions: dict mapping an image name to a list of caption strings.
            The lists are modified in place.

    Returns:
        dict: The same ``captions`` object, with cleaned caption strings.
    """
    # Translation table that deletes every punctuation character.
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            # Strings are immutable: the result of replace() must be kept.
            # Doing this before punctuation removal splits hyphenated words
            # ("black-and-white") into separate words instead of fusing them.
            img_caption = img_caption.replace("-", " ")
            descp = []
            for wrd in img_caption.split():
                # uppercase to lowercase, then remove punctuation from the token
                wrd = wrd.lower().translate(table)
                # remove hanging 's and a, and words containing numbers
                if len(wrd) > 1 and wrd.isalpha():
                    descp.append(wrd)
            # converting back to string
            caps[i] = ' '.join(descp)
    return captions

 - txt_vocab( descriptions ) – to create a vocabulary from all the unique words extracted out from descriptions.



In [None]:
def txt_vocab(descriptions):
    """Return the set of unique words used across all captions.

    Args:
        descriptions: dict mapping an image name to a list of caption strings.

    Returns:
        set: Every distinct whitespace-separated word found in the captions.
    """
    return {
        word
        for caption_list in descriptions.values()
        for caption in caption_list
        for word in caption.split()
    }

 - save_descriptions( descriptions, filename ) – This function is used to store all the preprocessed descriptions into a file.



In [None]:
# Show the kernel's current working directory; the relative paths used below
# ("descriptions.txt", "features.p") are resolved against it.
!pwd

In [None]:
#To save all descriptions in one file
def save_descriptions(descriptions, filename):
    """Write all captions to one text file, one caption per line.

    Each line holds the image name, a tab character, and the caption.
    Lines are joined with newlines; no trailing newline is written.

    Args:
        descriptions: dict mapping an image name to a list of caption strings.
        filename: Path of the output file; it is overwritten if it exists.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Real tab / newline separators: the literal letters 't' and 'n' made the
    # saved file impossible to split back into image name and caption.
    lines = [
        key + '\t' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # Write once, after all lines are collected (not once per caption).
    with open(filename, "w") as file:
        file.write("\n".join(lines))


# The statements below run at notebook (module) level. They were previously
# indented inside save_descriptions, which made the function call itself
# without end and left dataset_text undefined for the cells further down.

# Saving to the path of the project folder
# NOTE(review): these are machine-specific absolute paths - adjust them to the
# location of your own checkout before running.
dataset_text = "/Users/ChrisKarg/code/CMaxK/robo_romeo/raw_data/Flickr8k_text"
dataset_images = "/Users/ChrisKarg/code/CMaxK/robo_romeo/raw_data/Flicker8k_Dataset"
#to prepare text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
#loading the file that contains all data
#map them into descriptions dictionary 
descriptions = img_capt(filename)
print("Length of descriptions =" ,len(descriptions))
#cleaning the descriptions (txt_clean returns the dictionary it was given)
clean_descriptions = txt_clean(descriptions)
#to build vocabulary
vocabulary = txt_vocab(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))
#saving all descriptions in one file
save_descriptions(clean_descriptions, "descriptions.txt")

# loading dataset for model training

Flickr_8k.trainImages.txt contains a list of 6000 image names that are used for training

Functions required to load the training datasets:

 - load_photos( filename ) – takes a file name as a parameter and returns the list of image names, obtained by loading the text file into a string and splitting it into lines.

 - load_clean_descriptions( filename, photos ) – stores the captions for every image from the list of photos in a dictionary. To make it easy for the LSTM model to identify the beginning and end of a caption, we wrap each caption in the `<start>` and `<end>` identifiers (tags placed at the beginning and end of each caption).

 - load_features( photos ) – returns a dictionary holding the feature vectors extracted with the Xception model, restricted to the given photos.

In [None]:
#load the data
def load_photos(filename):
    """Return the image names listed in a text file, one name per line.

    Args:
        filename: Path of the list file; it is read with load_doc().

    Returns:
        list: The non-blank lines of the file, in file order.
    """
    # Split on a real newline ('\n', not the letter 'n') and skip blank lines,
    # so the last name is kept whether or not the file ends with a newline.
    return [name for name in load_doc(filename).split("\n") if name.strip()]

In [None]:
def load_clean_descriptions(filename, photos):
    """Load the cleaned captions of the requested photos, tagged for training.

    Every line of the file is split on whitespace: the first token is the
    image name, the remaining tokens form the caption. Each kept caption is
    wrapped as '<start> caption <end>'.

    Args:
        filename: Path of the cleaned descriptions file; read with load_doc().
        photos: Iterable of image names whose captions should be kept.

    Returns:
        dict: image name -> list of tagged caption strings, in file order.
    """
    # A set gives constant-time membership tests; `photos` is usually a list.
    wanted = set(photos)
    #loading clean_descriptions
    descriptions = {}
    # Split on a real newline ('\n'), not on the letter 'n'.
    for line in load_doc(filename).split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        image, image_caption = words[0], words[1:]
        if image in wanted:
            # Restore the <start>/<end> markers described in the notebook
            # text; the literal had been reduced to bare spaces.
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            # Append every caption of the image, not only the first one.
            descriptions.setdefault(image, []).append(desc)
    return descriptions

In [None]:
def load_features(photos):
    """Load the pickled features and keep only those of the given photos.

    Args:
        photos: Iterable of image names to select.

    Returns:
        dict: image name -> the feature object stored for it in "features.p".

    Raises:
        OSError: If "features.p" cannot be opened in the working directory.
        KeyError: If a requested photo has no entry in the feature file.
    """
    #loading all features
    # NOTE(review): pickle can execute arbitrary code while loading; only use
    # a features.p file that you produced yourself.
    # The context manager closes the file, which load(open(...)) never did.
    with open("features.p", "rb") as feature_file:
        all_features = load(feature_file)
    #selecting only needed features
    features = {k: all_features[k] for k in photos}
    return features

In [None]:
# Assemble the training split: the image names, their captions from
# "descriptions.txt", and their entries from the feature file.
filename = "/".join((dataset_text, "Flickr_8k.trainImages.txt"))
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)