In [27]:
import numpy as np
import pandas as pd
import os
import string
from pprint import pprint
import pickle

BASE_DIR = '../data'

## Read Text Files into Pandas

In [28]:
# Module-level containers filled in by the dataset walk further down:
# one empty frame per split (all sharing the same four columns) and the
# set of distinct tokens seen across every post.
POST_COLUMNS = ['file_name', 'text', 'image_loc', 'class']
train_df, test_df, val_df = (pd.DataFrame(columns=POST_COLUMNS) for _ in range(3))
vocabulary = set()

# translate text from post 
def translate(text):
    """Normalise the raw text of a post.

    The text is converted to lower case, every newline is replaced by a
    single space, and all characters in ``string.punctuation`` are removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    flattened = text.lower().replace('\n', ' ')
    # a translation table with only a "delete" set strips those characters
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return flattened.translate(strip_punctuation)

Next, we iterate through every split and class directory, read the text of each post, and store the posts in a data frame together with their class labels and the paths of their corresponding images.

In [29]:
# Walk BASE_DIR/text_data/<split>/<class>/<post file> and collect one record
# per post: its file name, cleaned text, matching image path and class label.
text_data_fn = os.path.join(BASE_DIR, 'text_data')
img_data_fn = os.path.join(BASE_DIR, 'img_data')

# Rows are accumulated in plain lists and converted to DataFrames once at the
# end. Growing a DataFrame row by row re-copies it on every iteration, and
# DataFrame.append was removed in pandas 2.0. Building the frames from scratch
# also makes this cell safe to re-run without duplicating rows.
record_columns = ['file_name', 'text', 'image_loc', 'class']
records = {'train': [], 'val': [], 'test': []}

for split in os.listdir(text_data_fn):
    split_dir = os.path.join(text_data_fn, split)
    if not os.path.isdir(split_dir):
        continue

    # any split directory other than 'train' / 'val' is treated as test data
    split_records = records[split] if split in ('train', 'val') else records['test']

    for class_name in os.listdir(split_dir):
        class_dir = os.path.join(split_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        for text_name in os.listdir(class_dir):
            # the image path reuses the post's base name with a .jpg extension
            # (splitext instead of a hard-coded 4-character slice)
            image_file = os.path.join(img_data_fn, split, class_name,
                                      os.path.splitext(text_name)[0] + '.jpg')

            # read and clean the post; the context manager closes the handle
            text_file = os.path.join(class_dir, text_name)
            with open(text_file, 'r') as f:
                text = translate(f.read())
            vocabulary.update(text.split())

            split_records.append({'file_name': text_name,
                                  'text': text,
                                  'image_loc': image_file,
                                  'class': class_name})

train_df = pd.DataFrame(records['train'], columns=record_columns)
val_df = pd.DataFrame(records['val'], columns=record_columns)
test_df = pd.DataFrame(records['test'], columns=record_columns)

In [14]:
# NOTE(review): this cell carries an older execution count than the cells
# around it. It used to overwrite train_df / test_df / val_df and drop their
# 'class' column, which made the one-hot cell further down
# (pd.get_dummies(train_df['class'])) fail with KeyError on Restart & Run All.
# The prefixed encoding is therefore stored under separate names and the
# source frames are left untouched.
def _with_prefixed_class_dummies(df):
    """Return a new frame in which 'class' is replaced by one-hot columns.

    The indicator columns are named 'class_<label>'; because of
    ``dummy_na=True`` an extra 'class_nan' column is always included.
    The input frame is not modified.
    """
    dummies = pd.get_dummies(df['class'], prefix='class', dummy_na=True)
    return pd.concat([df, dummies], axis=1).drop(['class'], axis=1)


train_prefixed_df = _with_prefixed_class_dummies(train_df)
test_prefixed_df = _with_prefixed_class_dummies(test_df)
val_prefixed_df = _with_prefixed_class_dummies(val_df)

                                      file_name  \
0           yemencrisis_2015-11-27_01-40-28.txt   
1            isiscrimes_2015-08-03_11-18-40.txt   
2          terrorattack_2017-10-02_13-13-23.txt   
3          victimsofwar_2016-12-17_02-12-27.txt   
4            isiscrimes_2015-08-01_10-57-48.txt   
5            isiscrimes_2015-08-13_05-42-23.txt   
6            isiscrimes_2015-08-05_19-58-42.txt   
7            isiscrimes_2015-12-10_17-27-35.txt   
8            isiscrimes_2015-08-02_17-07-03.txt   
9            isiscrimes_2015-08-23_06-53-41.txt   
10           isiscrimes_2015-08-27_00-11-43.txt   
11          yemencrisis_2017-07-31_06-24-52.txt   
12         terrorattack_2017-10-02_18-29-18.txt   
13           isiscrimes_2015-11-20_14-34-12.txt   
14           isiscrimes_2015-08-22_02-31-53.txt   
15           isiscrimes_2015-12-12_22-49-40.txt   
16           isiscrimes_2015-07-25_13-04-32.txt   
17           isiscrimes_2015-12-12_18-52-25.txt   
18           earthquake_2017-11

In [45]:
def _one_hot_class(df, class_names):
    """Return ``df`` with 'class' replaced by one indicator column per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'class' column.
    class_names : list of str
        Every class label that must get a column, in output order. Labels
        that do not occur in ``df`` still get an all-zero column, so that all
        splits end up with the same columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # dtype is pinned to uint8: it is the default in older pandas, while
    # newer releases would return booleans instead of 0/1
    one_hots = (pd.get_dummies(df['class'], dtype='uint8')
                .reindex(columns=class_names, fill_value=0)
                .astype('uint8'))
    return df.join(one_hots).drop('class', axis=1)


# Use the union of labels across the three splits; encoding each split on its
# own would silently produce different columns if a class is missing from one.
class_names = sorted(
    pd.concat([train_df['class'], val_df['class'], test_df['class']])
    .dropna()
    .unique()
)

train_df = _one_hot_class(train_df, class_names)
val_df = _one_hot_class(val_df, class_names)
test_df = _one_hot_class(test_df, class_names)

Save the cleaned text data for each split, and the sorted vocabulary list, as pickle files.

In [46]:
# Persist the three frames and the vocabulary under ../intermediates.
intermediates_dir = '../intermediates'
# create the output directory if needed; writing into a missing directory
# would otherwise fail with FileNotFoundError
os.makedirs(intermediates_dir, exist_ok=True)

path_to_textf_train = os.path.join(intermediates_dir, 'cleaned_text_train.pkl')
path_to_textf_test = os.path.join(intermediates_dir, 'cleaned_text_test.pkl')
path_to_textf_val = os.path.join(intermediates_dir, 'cleaned_text_val.pkl')
train_df.to_pickle(path_to_textf_train)
test_df.to_pickle(path_to_textf_test)
val_df.to_pickle(path_to_textf_val)

# the vocabulary set is stored as a sorted list so the pickle has a stable order
path_to_vocabf = os.path.join(intermediates_dir, 'text_vocabulary.pkl')
with open(path_to_vocabf, 'wb') as pickle_file:
    pickle.dump(sorted(vocabulary), pickle_file)

In [47]:
# Sanity check: print the 'damaged_infrastructure' value of the first test row.
print(test_df['damaged_infrastructure'].iloc[0])

0
