In [1]:
import pandas as pd
import os
from os.path import isfile, join
import csv
import numpy as np
from skimage.io import imread, imshow
from skimage import transform
import cv2
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import re

# Image Feature Extraction

###  Reading Annotation Files and Generating Majority Votes

In [2]:
#All annotation files should be placed under Annotations Folder
mypath=os.getcwd()+'/'+'Annotations'
Labelfiles = [f for f in os.listdir(mypath) if isfile(join(mypath, f))]

In [3]:
df_list=[]
all_labels= pd.DataFrame([], columns=['FileName','Lable_Tree','Lable_Mythical','Lable_Animal'])
for file in Labelfiles:
    #print(file)
    try:
        df=pd.read_excel(mypath+'/'+file, names=['FileName','Lable_Tree','Lable_Mythical','Lable_Animal'])
    except:
        pass
    all_labels=all_labels.append(df)

In [4]:
all_labels['FileName']=all_labels.apply(lambda x: x['FileName']+'.jpg' if '.jpg' not in x['FileName'] else x['FileName'],axis=1)

In [5]:
all_labels

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen_part1.jpg,0,0,1
1,01_binladen_part2.jpg,1,1,0
2,01_binladen_part3.jpg,0,0,0
3,01_binladen_part4.jpg,0,0,0
4,01_binladen_part5.jpg,1,0,0
...,...,...,...,...
198,36_thegirlchild_part4.jpg,0,0,0
199,36_thegirlchild_part5.jpg,0,0,0
200,36_thegirlchild_part6.jpg,0,0,0
201,36_thegirlchild_part7.jpg,0,0,0


In [6]:
all_labels_majVotes=all_labels.groupby(['FileName']).agg({'Lable_Tree':sum,'Lable_Mythical':sum,'Lable_Animal':sum})

In [7]:
all_labels_majVotes

Unnamed: 0_level_0,Lable_Tree,Lable_Mythical,Lable_Animal
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01_binladen.jpg,8.0,8.0,8.0
01_binladen_part1.jpg,10.0,5.0,6.0
01_binladen_part2.jpg,12.0,12.0,2.0
01_binladen_part3.jpg,0.0,2.0,1.0
01_binladen_part4.jpg,0.0,1.0,1.0
...,...,...,...
36_thegirlchild_part4.jpg,1.0,1.0,0.0
36_thegirlchild_part5.jpg,0.0,1.0,0.0
36_thegirlchild_part6.jpg,0.0,3.0,0.0
36_thegirlchild_part7.jpg,0.0,1.0,0.0


In [8]:
# Calculating assigning Majority Votes to labels from all annotators
all_labels_majVotes['Lable_Tree']=np.where(all_labels_majVotes['Lable_Tree']>5,1,0)
all_labels_majVotes['Lable_Mythical']=np.where(all_labels_majVotes['Lable_Mythical']>5,1,0)
all_labels_majVotes['Lable_Animal']=np.where(all_labels_majVotes['Lable_Animal']>5,1,0)
all_labels_majVotes.reset_index(inplace=True)

In [9]:
all_labels_majVotes_parts=all_labels_majVotes[all_labels_majVotes.FileName.str.contains('_part')]

In [10]:
all_labels_majVotes_parts

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
1,01_binladen_part1.jpg,1,0,1
2,01_binladen_part2.jpg,1,1,0
3,01_binladen_part3.jpg,0,0,0
4,01_binladen_part4.jpg,0,0,0
5,01_binladen_part5.jpg,1,0,1
...,...,...,...,...
199,36_thegirlchild_part4.jpg,0,0,0
200,36_thegirlchild_part5.jpg,0,0,0
201,36_thegirlchild_part6.jpg,0,0,0
202,36_thegirlchild_part7.jpg,0,0,0


In [11]:
all_labels_majVotes_cleaned=all_labels_majVotes_parts.copy()
all_labels_majVotes_cleaned['FileName'].replace(to_replace="_part\d+", value=r"", regex=True,inplace=True)

In [12]:
all_labels_majVotes_cleaned

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
1,01_binladen.jpg,1,0,1
2,01_binladen.jpg,1,1,0
3,01_binladen.jpg,0,0,0
4,01_binladen.jpg,0,0,0
5,01_binladen.jpg,1,0,1
...,...,...,...,...
199,36_thegirlchild.jpg,0,0,0
200,36_thegirlchild.jpg,0,0,0
201,36_thegirlchild.jpg,0,0,0
202,36_thegirlchild.jpg,0,0,0


In [13]:
# Calculating and Assigning Majority Votes to Main.jpg files from all part files
all_labels_majVotes_main=all_labels_majVotes_cleaned.groupby(['FileName']).agg({'Lable_Tree':sum,'Lable_Mythical':sum,'Lable_Animal':sum})
all_labels_majVotes_main['Lable_Tree']=np.where(all_labels_majVotes_main['Lable_Tree']>=1,1,0)
all_labels_majVotes_main['Lable_Mythical']=np.where(all_labels_majVotes_main['Lable_Mythical']>=1,1,0)
all_labels_majVotes_main['Lable_Animal']=np.where(all_labels_majVotes_main['Lable_Animal']>=1,1,0)
all_labels_majVotes_main.reset_index(inplace=True)

In [14]:
all_labels_majVotes_main

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen.jpg,1,1,1
1,03_manasa.jpg,0,1,1
2,04_binladen.jpg,1,1,1
3,05_binladen.jpg,1,1,1
4,06_chandi.jpg,1,1,1
5,07_gujarat.jpg,1,1,1
6,08_tsunami.jpg,0,1,1
7,09_shotopir.jpg,1,1,1
8,11_weddingofthefish.jpg,0,0,1
9,12_victimizationofwomen.jpg,1,0,0


In [15]:
all_labels_majVotes_final=all_labels_majVotes_main.append(all_labels_majVotes_parts)

In [16]:
all_labels_majVotes_final

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen.jpg,1,1,1
1,03_manasa.jpg,0,1,1
2,04_binladen.jpg,1,1,1
3,05_binladen.jpg,1,1,1
4,06_chandi.jpg,1,1,1
...,...,...,...,...
199,36_thegirlchild_part4.jpg,0,0,0
200,36_thegirlchild_part5.jpg,0,0,0
201,36_thegirlchild_part6.jpg,0,0,0
202,36_thegirlchild_part7.jpg,0,0,0


### Reading all Images and Extracting Features

In [17]:
#All images should be placed under Images folder in the source directory
mypath=os.getcwd()+'/'+'Images'
imgfiles = [f for f in os.listdir(mypath) if isfile(join(mypath, f)) and '.jpg' in join(mypath, f)]

In [18]:
image_features=[]
#features_dict={}
for raw_img in imgfiles:
    image = imread(mypath+'/'+raw_img)
    resized_img = transform.resize(image, (100, 50))
    feature_matrix = np.zeros((resized_img.shape[0],resized_img.shape[1])) 
    for i in range(0,resized_img.shape[0]):
        for j in range(0,resized_img.shape[1]):
            feature_matrix[i][j] = round((resized_img[i,j,0] + resized_img[i,j,1] + resized_img[i,j,2])/3,2)
    features = np.reshape(feature_matrix, (100*50))
    features_dict={}
    features_dict['file']=raw_img
    features_dict['feature']=features   
    image_features.append(features_dict)

In [19]:
imagedf = pd.DataFrame(image_features, columns =['file', 'feature'])

In [20]:
imagedf2=pd.DataFrame(imagedf["feature"].to_list())

In [21]:
imagedf_features=imagedf[['file']].join(imagedf2)

In [22]:
# Appending Labels to Image Features
imagedf_allLables=imagedf_features.join(all_labels_majVotes_final)

In [23]:
imagedf_allLables.head()

Unnamed: 0,file,0,1,2,3,4,5,6,7,8,...,4994,4995,4996,4997,4998,4999,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen.jpg,0.72,0.7,0.71,0.72,0.72,0.72,0.73,0.74,0.76,...,0.71,0.71,0.71,0.71,0.71,0.71,01_binladen.jpg,1.0,1.0,1.0
1,01_binladen_part1.jpg,0.09,0.14,0.16,0.19,0.16,0.15,0.17,0.19,0.21,...,0.44,0.15,0.14,0.43,0.08,0.36,03_manasa.jpg,0.0,1.0,1.0
1,01_binladen_part1.jpg,0.09,0.14,0.16,0.19,0.16,0.15,0.17,0.19,0.21,...,0.44,0.15,0.14,0.43,0.08,0.36,01_binladen_part1.jpg,1.0,0.0,1.0
2,01_binladen_part2.jpg,0.09,0.11,0.13,0.64,0.31,0.08,0.04,0.06,0.25,...,0.71,0.43,0.34,0.59,0.67,0.68,04_binladen.jpg,1.0,1.0,1.0
2,01_binladen_part2.jpg,0.09,0.11,0.13,0.64,0.31,0.08,0.04,0.06,0.25,...,0.71,0.43,0.34,0.59,0.67,0.68,01_binladen_part2.jpg,1.0,1.0,0.0


### Writing the final Image Features and Labels to CSV File

In [24]:
imagedf_allLables=imagedf_allLables.drop(['FileName'], axis=1)
#imagedf_allLables.T.reset_index().to_csv('PycharmProjects/pythonProject/all_feat_lab_transpose.csv')

In [25]:
# Image Features for all the main JPG files
imagedf_allLables_main=imagedf_allLables[~imagedf_allLables.file.str.contains('_part')]
imagedf_allLables_main.to_csv('all_feat_lab_main.csv')
#imagedf_allLables_main.T.reset_index().to_csv('PycharmProjects/pythonProject/all_feat_lab_main_transpose.csv')

In [26]:
imagedf_allLables_main

Unnamed: 0,file,0,1,2,3,4,5,6,7,8,...,4993,4994,4995,4996,4997,4998,4999,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen.jpg,0.72,0.7,0.71,0.72,0.72,0.72,0.73,0.74,0.76,...,0.67,0.71,0.71,0.71,0.71,0.71,0.71,1.0,1.0,1.0
6,03_manasa.jpg,0.76,0.76,0.76,0.71,0.7,0.72,0.73,0.69,0.67,...,0.64,0.65,0.65,0.69,0.75,0.76,0.77,0.0,1.0,1.0
13,04_binladen.jpg,0.73,0.76,0.76,0.74,0.73,0.69,0.66,0.66,0.69,...,0.72,0.74,0.74,0.75,0.76,0.76,0.75,1.0,1.0,1.0
20,05_binladen-179x1024.jpg,0.78,0.79,0.78,0.75,0.7,0.67,0.67,0.65,0.65,...,0.67,0.69,0.7,0.7,0.71,0.71,0.72,0.0,0.0,1.0
28,06_chandi.jpg,0.78,0.78,0.79,0.78,0.79,0.83,0.83,0.83,0.82,...,0.65,0.65,0.62,0.61,0.68,0.72,0.71,1.0,1.0,1.0
28,06_chandi.jpg,0.78,0.78,0.79,0.78,0.79,0.83,0.83,0.83,0.82,...,0.65,0.65,0.62,0.61,0.68,0.72,0.71,0.0,0.0,1.0
37,07_gujarat.jpg,0.4,0.4,0.3,0.28,0.27,0.28,0.27,0.28,0.33,...,0.62,0.64,0.53,0.63,0.63,0.61,0.21,0.0,1.0,1.0
45,08_tsunami.jpg,0.2,0.29,0.29,0.31,0.47,0.34,0.42,0.6,0.4,...,0.72,0.51,0.55,0.48,0.52,0.7,0.4,0.0,0.0,0.0
50,09_shotopir.jpg,0.73,0.72,0.73,0.71,0.72,0.72,0.72,0.73,0.72,...,0.61,0.61,0.61,0.64,0.69,0.69,0.7,0.0,1.0,0.0
57,11_weddingofthefish.jpg,0.72,0.73,0.72,0.7,0.7,0.72,0.71,0.74,0.76,...,0.64,0.66,0.64,0.66,0.69,0.68,0.66,0.0,1.0,1.0


# Text Feature Extraction

### Reading all the Text Files from the directory

In [27]:
#All text files should be placed under text folder
mypath=os.getcwd()+'/'+'Text'
txtfiles = [f for f in os.listdir(mypath) if isfile(join(mypath, f))]

#Reading the Text and Image Mapping file
image_text_mapping=pd.read_excel('LinkingImageAndText.xlsx',skiprows=1)

### Extracting the Text file features and Tokenising the words

In [28]:
fileList=[]

for textfile in txtfiles:
    f=open(mypath+'/'+textfile, 'r', encoding = 'cp850')
    lines=[]
    for line in f:
        if ('Name:' not in line) and ('Translation Source:' not in line) and ('Artist:' not in line) and ('Text:' not in line):
            #print(line)
            lines.append(line.split())
    word_list=[word.lower() for line in lines for word in line]
    tokens_without_sw = [re.sub('[ôçö,.();!?£]', '', word) for word in word_list if not word in stopwords.words()+['the','there','this','&']]
    
    tokendict={}
    tokendict['file']=textfile
    #tokendict['text_filemap_col']=text_filemap_col
    tokendict['tokens']=tokens_without_sw
    fileList.append(tokendict)

In [29]:
text_feature_df=pd.DataFrame(data=fileList)

In [30]:
all_labels_majVotes_main

Unnamed: 0,FileName,Lable_Tree,Lable_Mythical,Lable_Animal
0,01_binladen.jpg,1,1,1
1,03_manasa.jpg,0,1,1
2,04_binladen.jpg,1,1,1
3,05_binladen.jpg,1,1,1
4,06_chandi.jpg,1,1,1
5,07_gujarat.jpg,1,1,1
6,08_tsunami.jpg,0,1,1
7,09_shotopir.jpg,1,1,1
8,11_weddingofthefish.jpg,0,0,1
9,12_victimizationofwomen.jpg,1,0,0


### Tagging the majority Vote Labels to Text File 

In [31]:
image_text_mapping.drop(image_text_mapping.columns[[0,1]], axis=1, inplace=True)
image_text_mapping.columns=['ScrollNumber','ImageFileName','TextFileName']
image_file_lables=all_labels_majVotes_main[['FileName','Lable_Tree','Lable_Mythical','Lable_Animal']]
image_file_lables.columns=['file','Label_Tree','Label_Mythical','Label_Animal']

In [32]:
image_txt_lables=image_file_lables.merge(image_text_mapping,how='left' ,left_on='file', right_on='ImageFileName',
          suffixes=('_image', '_text'))

In [33]:
text_labels=image_txt_lables[~image_txt_lables.file.str.contains('_part')]
text_labels=text_labels[['TextFileName','ImageFileName','Label_Tree','Label_Mythical','Label_Animal']]
text_labels.dropna(subset=['TextFileName'],inplace=True)
text_labels

Unnamed: 0,TextFileName,ImageFileName,Label_Tree,Label_Mythical,Label_Animal
0,LadenStory.txt,01_binladen.jpg,1,1,1
1,ManashaMangal.txt,03_manasa.jpg,0,1,1
2,BinLaden.txt,04_binladen.jpg,1,1,1
4,ChandiMangal_v2.txt,06_chandi.jpg,1,1,1
5,GujratRiots.txt,07_gujarat.jpg,1,1,1
6,Tsunami.txt,08_tsunami.jpg,0,1,1
7,SatyaPir.txt,09_shotopir.jpg,1,1,1
8,WeddingOfFish_v2.txt,11_weddingofthefish.jpg,0,0,1
9,VictimizationWomen_v2.txt,12_victimizationofwomen.jpg,1,0,0
11,VictimizationWomen.txt,15_victimizationofgirls.jpg,1,0,1


In [34]:
text_feature_lab_df=text_feature_df.merge(text_labels,how='left', left_on='file', right_on='TextFileName')

In [35]:
text_feature_lab_df.dropna(subset=['Label_Tree','Label_Mythical','Label_Animal'],inplace=True)
text_feature_lab_df.drop(['TextFileName'], axis=1,inplace=True)
text_feature_lab_df.drop_duplicates(subset=['file','ImageFileName','Label_Tree','Label_Mythical','Label_Animal'], keep='last',inplace=True)
#text_feature_lab_df.reset_index(drop=True)

### One Hot ENcoding of Text Features

In [36]:
text_feature_lab_df_OHE = text_feature_lab_df['tokens'].str.join('|').str.get_dummies()
result = pd.concat([text_feature_lab_df, text_feature_lab_df_OHE], axis=1)
result.drop(['tokens'],axis=1,inplace=True)

### Writing the Final Text Features File to CSV

In [37]:
result.to_csv('text_feat_lab.csv',index=False)