In [85]:
# import libraries
import os
import csv
import numpy as np
import pandas as pd
from collections import defaultdict

In [86]:
# Input file: tab-separated output of the "Extract genome" step, opened by load_data().
PATH = "/nas/home/slnagark/Genome/output.tsv"
# Output directory: generate_tsv() writes generative.tsv inside this folder.
PATH_TSV = "/nas/home/slnagark/Genome/"

def load_data(path=None):
    """
    Load the extract-genome output and keep only the columns used downstream.

    input: path - optional location of the tab-separated file;
                  when omitted, the module-level PATH is read
    output: dataframe with image id, region id, subject name, object name,
            subject synset, object synset, and sentence

    raises: KeyError if any of the expected columns is missing from the file
    """
    source = PATH if path is None else path
    columns = ['image_id', 'region_id', 'subject', 'object',
               'subject_synset', 'object_synset', 'sentence']

    # The context manager closes the handle even when parsing fails;
    # previously the file object was opened and never closed.
    with open(source) as tsv:
        data = pd.read_csv(tsv, sep="\t")

    # copy() so that callers can modify the result without touching `data`
    return data[columns].copy()

def process_sentences(s1, s2, flag):
    """
    Concatenate two sentences with a connector chosen by flag.

    input:  s1 - first sentence
            s2 - second sentence
            flag - 1 joins with a single space;
                   any other value joins with " and "

    output: s1 followed by the connector and s2
    """
    connector = " " if flag == 1 else " and "
    return s1 + connector + s2

def get_phraseInfo(my_data):
    """
    Iterates through the dataframe to:
      1. store a dictionary mapping of subject synsets to their corresponding sentences.
      2. store a dictionary mapping of sentences to their corresponding information.
         Information is in the format: (object_synset, subject_synset, object_name, subject_name)

    Both mappings keep first-seen order and contain no duplicate entries.

    input: dataframe obtained from load_data()
    output: sub2phrase: subject synset to sentence mappings
            phrase2info: sentence to information mappings

    """
    sub2phrase = defaultdict(list)
    phrase2info = defaultdict(list)
    # (subject_synset, sentence) pairs already stored in sub2phrase; gives a
    # constant-time duplicate check instead of scanning a growing list per row
    seen_pairs = set()

    for _, row in my_data.iterrows():
        subject_synset = row['subject_synset']
        phrase = row['sentence']
        info = (row['object_synset'], subject_synset, row['object'], row['subject'])

        # add information only if it is not present for this sentence yet
        known_info = phrase2info[phrase]
        if info not in known_info:
            known_info.append(info)

        # add the sentence under its subject only the first time the pair is seen
        pair = (subject_synset, phrase)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            sub2phrase[subject_synset].append(phrase)
    return sub2phrase, phrase2info


def _write_combination(tsv_writer, combi, row, s2, phrases, flag):
    """
    Write one combined-sentence row unless the identical pairing was already written.

    input:  tsv_writer - csv writer for generative.tsv
            combi - dictionary of combined sentence -> list of info tuples already written
            row - dataframe row providing sentence 1 and its subject/object
            s2 - sentence 2
            phrases - information of s2: (object synset, subject synset, object name, subject name)
            flag - join condition forwarded to process_sentences
    """
    s1 = row['sentence']
    combined = process_sentences(s1, s2, flag)

    # (s1 object synset, s1 subject synset, s2 object synset, s2 subject synset,
    #  s1 object name, s1 subject name, s2 object name, s2 subject name)
    info = (row['object_synset'], row['subject_synset'], phrases[0], phrases[1],
            row['object'], row['subject'], phrases[2], phrases[3])

    # to handle repetitive sentences
    if info in combi[combined]:
        return
    combi[combined].append(info)

    c_string = [info[5], info[4], info[6]]
    c_syns = [info[1], info[0], info[2]]
    tsv_writer.writerow([str(s1), str(s2), str(combined), str(c_string), str(c_syns)])


def generate_tsv(sub2phrase, phrase2info, my_data=None):
    """
    Writes to file the unique sentences and their combinations along with concept
    strings in the format (s1 subject, s1 object, s2 object)

    input: sub2phrase - subject synset to sentences mappings
           phrase2info - sentence to information mappings
           my_data - dataframe obtained from load_data(); when omitted, a
                     module-level variable named my_data is used if one exists
    output: generative.tsv (inside PATH_TSV) with columns:
            Sentence 1, Sentence 2, Combined, Concept Strings, Concept Synsets

    raises: NameError if my_data is not passed and no module-level my_data exists
    """
    if my_data is None:
        # The function used to read an undeclared global; keep that working
        # for existing two-argument callers, but fail with a clear message.
        try:
            my_data = globals()['my_data']
        except KeyError:
            raise NameError("name 'my_data' is not defined; pass the dataframe "
                            "from load_data() as the third argument") from None

    # combi dictionary stores, per combined sentence, the info tuples already written
    combi = defaultdict(list)

    # newline='' lets the csv writer control line endings (lineterminator='\n')
    with open(os.path.join(PATH_TSV, 'generative.tsv'), 'wt', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t', lineterminator='\n')
        tsv_writer.writerow(['Sentence 1', 'Sentence 2', 'Combined', 'Concept Strings', 'Concept Synsets'])

        for _, row in my_data.iterrows():
            obj_synset = row['object_synset']
            sub_synset = row['subject_synset']

            # format of phrases in phrase2info: (object synset, subject synset, object name, subject name)

            # chain: sentences whose subject is the object of s1
            if obj_synset in sub2phrase:
                for s2 in sub2phrase[obj_synset]:
                    for phrases in phrase2info[s2]:
                        # object of s2 != subject of s1 and object of s1 == subject of s2
                        if sub_synset != phrases[0] and phrases[1] == obj_synset:
                            _write_combination(tsv_writer, combi, row, s2, phrases, 1)

            # conjunction: sentences sharing the subject of s1
            if sub_synset in sub2phrase:
                for s2 in sub2phrase[sub_synset]:
                    for phrases in phrase2info[s2]:
                        # object of s1 != object of s2, subject of s1 != object of s2
                        # and subject of s1 == subject of s2
                        if obj_synset != phrases[0] and sub_synset != phrases[0] and sub_synset == phrases[1]:
                            _write_combination(tsv_writer, combi, row, s2, phrases, 2)

    # the with-block has closed (and flushed) the file at this point
    print("File generated successfully!")

def main():
    """
    loads data from extract genome and generates generative.tsv file
    """
    my_data = load_data()
    sub2phrase, phrase2info = get_phraseInfo(my_data)
    # pass the dataframe explicitly; generate_tsv iterates over its rows
    generate_tsv(sub2phrase, phrase2info, my_data)


In [87]:
# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

File generated successfully!
