In [1]:
import numpy as np
import pandas as pd
import os
import string
from pprint import pprint
import pickle

# Root of the dataset tree; the loading cell joins 'text_data' and 'img_data'
# onto this path to locate the posts and their images.
BASE_DIR = '../data'

## Read Text Files into Pandas

In [22]:
# Notebook-wide containers: one empty frame per dataset split (all sharing the
# same column layout) plus the set of distinct tokens seen in any post.
DATASET_COLUMNS = ['file_name', 'text', 'image_loc', 'class']
train_df, test_df, val_df = (pd.DataFrame(columns=DATASET_COLUMNS) for _ in range(3))
vocabulary = set()

# normalize the raw text of a post
def translate(text):
    """Return `text` lower-cased, flattened to one line and without punctuation.

    Args:
        text: raw post text (str).

    Returns:
        The text converted to lower case, with every newline replaced by a
        single space and every character of ``string.punctuation`` removed.
    """
    flattened = text.lower().replace('\n', ' ')
    # the third argument of str.maketrans lists the characters to delete
    return flattened.translate(str.maketrans('', '', string.punctuation))

Next, we iterate through every split and class directory, read the text of each post, and store it in the DataFrame for its split together with its class label and the path to its image.

In [23]:
# Walk <BASE_DIR>/text_data/<split>/<class>/<post file> and collect every post
# into the train / val / test frames, updating `vocabulary` along the way.
text_data_fn = os.path.join(BASE_DIR, 'text_data')
img_data_fn = os.path.join(BASE_DIR, 'img_data')

columns = ['file_name', 'text', 'image_loc', 'class']
# Rows are gathered in plain lists and converted to DataFrames once at the end:
# DataFrame.append copied the whole frame for every row and no longer exists in
# pandas 2.0. Building the frames from scratch also means that re-running this
# cell does not duplicate rows.
rows_by_split = {'train': [], 'val': [], 'test': []}

for split in os.listdir(text_data_fn):
    split_dir = os.path.join(text_data_fn, split)
    if not os.path.isdir(split_dir):
        continue
    # any split directory other than 'train' / 'val' is treated as test data
    split_rows = rows_by_split[split if split in ('train', 'val') else 'test']

    for class_name in os.listdir(split_dir):
        class_dir = os.path.join(split_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        for text_name in os.listdir(class_dir):
            # the image mirrors the text file's location and base name, with a
            # .jpg extension (splitext copes with extensions of any length)
            image_file = os.path.join(img_data_fn, split, class_name,
                                      os.path.splitext(text_name)[0] + '.jpg')

            # read and normalize the post; `with` closes the handle promptly
            text_file = os.path.join(class_dir, text_name)
            with open(text_file, 'r') as f:
                text = translate(f.read())
            vocabulary.update(text.split())

            split_rows.append({'file_name': text_name,
                               'text': text,
                               'image_loc': image_file,
                               'class': class_name})

# passing `columns` keeps the column order fixed and also yields a correctly
# shaped empty frame when a split has no posts
train_df = pd.DataFrame(rows_by_split['train'], columns=columns)
val_df = pd.DataFrame(rows_by_split['val'], columns=columns)
test_df = pd.DataFrame(rows_by_split['test'], columns=columns)

In [30]:
# Put every split in ascending image-path order and renumber its rows from 0,
# discarding the previous index.
train_df = train_df.sort_values('image_loc').reset_index(drop=True)
test_df = test_df.sort_values('image_loc').reset_index(drop=True)
val_df = val_df.sort_values('image_loc').reset_index(drop=True)

In [34]:
# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0,file_name,text,image_loc,damaged_infrastructure,damaged_nature,fires,flood,human_damage,non_damage
0,accrafloods_2015-06-05_08-03-39.txt,030615 neveragain accrafloods prayforghana,../data/img_data/test/damaged_infrastructure/a...,1,0,0,0,0,0
1,accrafloods_2015-06-08_17-11-08.txt,the water broke through three homes to get to ...,../data/img_data/test/damaged_infrastructure/a...,1,0,0,0,0,0
2,buildingcollapse_2012-10-30_01-14-19.txt,facade of the building around the corner gone,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
3,buildingcollapse_2012-10-30_04-49-23.txt,partial building collapse right near our nyc h...,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
4,buildingcollapse_2012-10-31_18-39-52.txt,hurricane hurricanesandy damage disaster build...,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
5,buildingcollapse_2013-06-06_00-49-10.txt,thats my priusc that totally saved me and the ...,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
6,buildingcollapse_2013-06-06_01-56-29.txt,cleanup still going on at 22nd and market for ...,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
7,buildingcollapse_2013-06-06_19-51-23.txt,igersphilly buildingcollapse view from down 22...,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
8,buildingcollapse_2013-06-06_19-55-12.txt,igersphilly buildingcollapse from up 22nd street,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0
9,buildingcollapse_2013-06-06_23-42-22.txt,philly buildingcollapse,../data/img_data/test/damaged_infrastructure/b...,1,0,0,0,0,0


Now perform one-hot encoding of the class labels.

In [31]:
def _one_hot_class(frame):
    """Return `frame` with 'class' replaced by one indicator column per class.

    The indicator columns come from ``pd.get_dummies`` on the 'class' column
    and are joined on the frame's index before 'class' itself is dropped.
    """
    indicators = pd.get_dummies(frame['class'])
    return frame.join(indicators).drop(columns='class')


train_df = _one_hot_class(train_df)
val_df = _one_hot_class(val_df)
test_df = _one_hot_class(test_df)

In [32]:
# Write each split as a CSV file under ../intermediates.
# Create the output directory first: to_csv does not create missing parent
# directories, so the export would otherwise fail when the folder is absent.
os.makedirs('../intermediates', exist_ok=True)

path_to_textf_train = os.path.join('../intermediates', 'cleaned_text_train.csv')
path_to_textf_test = os.path.join('../intermediates', 'cleaned_text_test.csv')
path_to_textf_val = os.path.join('../intermediates', 'cleaned_text_val.csv')

# index=False leaves the row index out of the files
train_df.to_csv(path_to_textf_train, index=False)
test_df.to_csv(path_to_textf_test, index=False)
val_df.to_csv(path_to_textf_val, index=False)

Save the cleaned text DataFrames and the sorted vocabulary list as pickle files.

In [33]:
# Write each split, and the vocabulary, as pickle files under ../intermediates.
# Create the output directory first so the writes do not fail when it is absent.
os.makedirs('../intermediates', exist_ok=True)

path_to_textf_train = os.path.join('../intermediates', 'cleaned_text_train.pkl')
path_to_textf_test = os.path.join('../intermediates', 'cleaned_text_test.pkl')
path_to_textf_val = os.path.join('../intermediates', 'cleaned_text_val.pkl')

train_df.to_pickle(path_to_textf_train)
test_df.to_pickle(path_to_textf_test)
val_df.to_pickle(path_to_textf_val)

# the vocabulary set is stored as a sorted list, giving a stable token order
path_to_vocabf = os.path.join('../intermediates', 'text_vocabulary.pkl')
with open(path_to_vocabf, 'wb') as pickle_file:
    pickle.dump(sorted(vocabulary), pickle_file)

In [47]:
# Spot check: 'damaged_infrastructure' indicator of the first test row.
print(test_df['damaged_infrastructure'].iloc[0])

0
