In [15]:
import numpy as np
import pandas as pd
import os
import string
import pickle

# Root directory of the raw dataset, given relative to the notebook's working
# directory; the 'text_data' and 'img_data' folders are resolved beneath it.
BASE_DIR = '../data'

## Read Text Files into Pandas

In [16]:
# Module-level containers: one empty data frame per dataset split (all three
# share the same four columns) and an empty set for collecting vocabulary tokens.
train_df, test_df, val_df = (
    pd.DataFrame(columns=['file_name', 'text', 'image_loc', 'class'])
    for _ in range(3)
)
vocabulary = set()

def translate(text):
    """Normalize the text of a post.

    The text is converted to lower case and every character listed in
    ``string.punctuation`` is deleted. Whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The lower-cased text with ASCII punctuation removed.
    """
    # a translation table whose third argument lists the characters to delete
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)

Next, we iterate through all of the class directories, extract the text of each post, and store it in the data frame for its split (train, test, or validation) together with its class label and the path to its image.

In [17]:
# Walk BASE_DIR/text_data/<split>/<class>/<text file> and collect one record per
# text file, then build the train / test / val data frames from those records.
text_data_fn = os.path.join(BASE_DIR, 'text_data')
img_data_fn = os.path.join(BASE_DIR, 'img_data')

# Rows are gathered in plain lists and converted to data frames once at the end.
# Rebinding a temporary name with `df = df.append(...)` never updates
# train_df / test_df / val_df, and DataFrame.append was removed in pandas 2.0.
columns = ['file_name', 'text', 'image_loc', 'class']
records = {'train': [], 'test': [], 'val': []}

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# reproducible between runs.
for split in sorted(os.listdir(text_data_fn)):
    split_dir = os.path.join(text_data_fn, split)
    if not os.path.isdir(split_dir):
        continue

    # any split directory other than 'train' / 'test' is treated as validation data
    split_records = records[split if split in ('train', 'test') else 'val']

    for class_name in sorted(os.listdir(split_dir)):
        class_dir = os.path.join(split_dir, class_name)
        if not os.path.isdir(class_dir):
            continue

        for text_name in sorted(os.listdir(class_dir)):
            # construct image file name: same base name as the text file with a
            # .jpg extension (the path is built, not checked for existence)
            base_name = os.path.splitext(text_name)[0]
            image_file = os.path.join(img_data_fn, split, class_name, base_name + '.jpg')

            # get text data; the context manager closes the file handle
            text_file = os.path.join(class_dir, text_name)
            with open(text_file, 'r') as f:
                text = translate(f.read())
            vocabulary.update(text.split())

            # append to appropriate dataset
            split_records.append({'file_name': text_name,
                                  'text': text,
                                  'image_loc': image_file,
                                  'class': class_name})

# Building the frames from scratch also makes this cell safe to re-run:
# repeated execution does not duplicate rows.
train_df = pd.DataFrame(records['train'], columns=columns)
test_df = pd.DataFrame(records['test'], columns=columns)
val_df = pd.DataFrame(records['val'], columns=columns)

Save the cleaned text data frames and the sorted vocabulary list as pickle files.

In [18]:
# Persist the three split data frames and the vocabulary under ../cleaned_data.
cleaned_dir = '../cleaned_data'

# Neither DataFrame.to_pickle nor open() creates missing parent directories, so
# make sure the output folder exists (no-op when it is already there).
os.makedirs(cleaned_dir, exist_ok=True)

path_to_textf_train = os.path.join(cleaned_dir, 'cleaned_text_train.pkl')
path_to_textf_test = os.path.join(cleaned_dir, 'cleaned_text_test.pkl')
path_to_textf_val = os.path.join(cleaned_dir, 'cleaned_text_val.pkl')
train_df.to_pickle(path_to_textf_train)
test_df.to_pickle(path_to_textf_test)
val_df.to_pickle(path_to_textf_val)

path_to_vocabf = os.path.join(cleaned_dir, 'text_vocabulary.pkl')
with open(path_to_vocabf, 'wb') as pickle_file:
    # the vocabulary set is written as a sorted list
    pickle.dump(sorted(vocabulary), pickle_file)