# Data Transformation Steps

## Environment Setup

In [None]:
# Go to data location: switch the notebook's working directory to the
# BertSum raw_data folder (IPython `%cd` line magic).

%cd  ~/Desktop/DeepLearningProject/BertSum/raw_data

In [None]:
import os
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Load the tab-separated genre dataset into a DataFrame.
genre = pd.read_csv(
    '/content/drive/My Drive/Models Running/Summarization/datasets/genre_final_for_summarization.tsv',
    delimiter='\t',
)

In [None]:
# Drop the columns that are not used when exporting stories and expose the
# encoded genre under the shorter name `encoded`.
# Written as a chained, non-inplace expression with errors='ignore' so the
# cell can be re-run without raising KeyError on already-dropped columns.
genre = (
    genre
    .drop(columns=['song', 'artist', 'genre', 'lyrics'], errors='ignore')
    .rename(columns={'genre_encoded': "encoded"})
)

# Fail early if the columns read later by create_stories are absent.
_missing_cols = {'encoded', 'lyrics_nchar'} - set(genre.columns)
assert not _missing_cols, "genre is missing expected columns: {}".format(sorted(_missing_cols))

In [None]:
def create_stories(line, folder, root='/content/drive/My Drive/Models Running/BertSum/raw_data/'):
  """Write one row's text to ``<root>/<folder>/<encoded>/<row label>.story``.

  Args:
    line: a row object (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
      providing ``line['encoded']``, ``line['lyrics_nchar']`` and ``line.name``.
    folder: name of the sub-directory of ``root`` that receives the files.
    root: base output directory. Defaults to the location that was
      previously hard-coded, so existing two-argument calls are unchanged.

  Returns:
    None. The only effect is the file (and any missing directories) created
    on disk; an existing file with the same name is overwritten.
  """
  path = os.path.join(root, folder, str(line['encoded']))

  # exist_ok avoids the check-then-create race of exists() + makedirs().
  os.makedirs(path, exist_ok=True)

  path = os.path.join(path, str(line.name) + '.story')

  # The context manager closes the handle even if the write raises; the
  # explicit encoding keeps non-ASCII text independent of the locale default.
  with open(path, "w", encoding="utf-8") as file:
    file.writelines(line['lyrics_nchar'])

In [None]:
# Export every row of `genre` as a .story file under the 'genre' folder.
# The per-row return value is None, so `dummy` only absorbs the result.
print("Starting with Genre...\n\n")
dummy = genre.apply(create_stories, axis=1, args=('genre',))

In [None]:
# Insert needed path dependency: put the Stanford CoreNLP jar on CLASSPATH.
# The path is resolved from the current user's home directory instead of a
# hard-coded /Users/<name> prefix, matching the "~"-based paths used elsewhere
# in this notebook.

os.environ['CLASSPATH'] = os.path.expanduser(
    '~/Desktop/DeepLearningProject/BertSum/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar'
)

In [None]:
# Move to the BertSum source folder to begin transforming the data.
# Later cells invoke `preprocess.py` by relative name, so the working
# directory must be this folder when they run.

%cd /Users/johntzemos/Desktop/DeepLearningProject/BertSum/src

In [None]:
# Install Needed Packages

! pip install pytorch_pretrained_bert
! pip install tensorboardX
! pip install pyrouge
! pip install multiprocess 

In [None]:
import os
import sys

# Add the BertSum source directories to sys.path.
# sys.path entries are used verbatim (a leading "~" is never expanded by the
# import system), so resolve the home directory before appending. Skipping
# entries that are already present keeps the cell safe to re-run.
for _src_dir in ('~/Desktop/DeepLearningProject/BertSum/src',
                 '~/Desktop/DeepLearningProject/BertSum/src/prepro'):
    _resolved_dir = os.path.expanduser(_src_dir)
    if _resolved_dir not in sys.path:
        sys.path.append(_resolved_dir)

from tqdm import tqdm

# Step 1 - Sentence Splitting and Tokenization

In [None]:
import subprocess

# Step 1 driver: run `preprocess.py -mode tokenize` once per sub-directory of
# the raw genre folder, writing into a same-named folder under genre_tokenized.
rootdir = '/content/drive/My Drive/Models Running/BertSum/raw_data/genre'
tokenized_root = '/content/drive/My Drive/Models Running/BertSum/raw_data/genre_tokenized'
log_file = '/content/drive/My Drive/Models Running/BertSum/logs/genre.log'

# Sorted so the processing order is the same on every run.
subdirectories = sorted(
  os.path.join(rootdir, o) for o in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir, o))
)

for subdir in subdirectories:

  save_path = os.path.join(tokenized_root, os.path.basename(subdir))
  os.makedirs(save_path, exist_ok=True)

  # Argument list instead of a shell string: the paths contain spaces, and the
  # previous command began with a stray "!" (IPython syntax), which the shell
  # interprets as exit-status negation. `preprocess.py` is resolved relative
  # to the current working directory.
  result = subprocess.run([
    'python', 'preprocess.py',
    '-mode', 'tokenize',
    '-raw_path', subdir,
    '-save_path', save_path,
    '-log_file', log_file,
  ])

  # Report failures instead of discarding the exit status; keep going so one
  # bad folder does not stop the remaining ones.
  if result.returncode != 0:
    print("preprocess.py -mode tokenize exited with status {} for '{}'".format(result.returncode, subdir))

# Step 2 - Format to Simpler JSON Files

In [None]:
from data_builder import load_json
from pathlib import Path

import json

# Step 2 driver: for every tokenized *.json file, load it with load_json and
# re-save it as a {'src': ..., 'tgt': ...} JSON file under genre_json/<subdir>.
#
# os.listdir / os.makedirs / open do not expand "~", so resolve it up front.
# NOTE(review): Step 1 writes its output under /content/drive/...; this step
# reads from ~/Desktop/... -- confirm both point at the same data.
rootdir = os.path.expanduser('~/Desktop/DeepLearningProject/BertSum/raw_data/genre_tokenized')
json_root = os.path.expanduser('~/Desktop/DeepLearningProject/BertSum/raw_data/genre_json')

subdirectories = [os.path.join(rootdir, o) for o in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir,o))]

progress_bar = tqdm(subdirectories)

# Number of files written. It was previously incremented without ever being
# initialised, which raised NameError on the first file.
pc_t = 0

for subdir in progress_bar:

  save_path = os.path.join(json_root, os.path.basename(subdir))
  os.makedirs(save_path, exist_ok=True)

  for path in Path(subdir).glob('*.json'):

    # load_json is called with a string path, not a Path object.
    path_in_str = str(path)

    # NOTE(review): 'lower' is passed as a (truthy) string -- confirm this is
    # what data_builder.load_json expects for its second argument.
    source, tgt = load_json(path_in_str, 'lower')

    format_to_save = {'src': source, 'tgt': tgt}

    # Text before the first '.' of the file name.
    file_num = path.name.split('.')[0]

    pt_file = "{:s}/{:s}.{:s}.{:s}.json".format(save_path, 'story', 'test', file_num)
    pc_t += 1
    with open(pt_file, 'w') as save:
      save.write(json.dumps(format_to_save))
 

# Step 3 - Format to PyTorch Files

In [None]:
import subprocess

# Step 3 driver: run `preprocess.py -mode format_to_bert` once per
# sub-directory of genre_json, writing into a same-named folder under genre_bert.
#
# os.listdir / os.makedirs do not expand "~", and the command below no longer
# goes through a shell, so every "~" path is resolved here.
rootdir = os.path.expanduser('~/Desktop/DeepLearningProject/BertSum/raw_data/genre_json')
bert_root = os.path.expanduser('~/Desktop/DeepLearningProject/BertSum/raw_data/genre_bert')
log_file = os.path.expanduser('~/Desktop/DeepLearningProject/BertSum/logs/preprocess.log')

subdirectories = [os.path.join(rootdir, o) for o in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir,o))]

progress_bar = tqdm(subdirectories)

for subdir in progress_bar:

    save_path = os.path.join(bert_root, os.path.basename(subdir))
    os.makedirs(save_path, exist_ok=True)

    # Argument list instead of a shell string: no manual quoting, and no stray
    # leading "!" (IPython syntax that the shell reads as status negation).
    # `preprocess.py` is resolved relative to the current working directory.
    result = subprocess.run([
        'python', 'preprocess.py',
        '-mode', 'format_to_bert',
        '-dataset', 'test',
        '-raw_path', subdir,
        '-save_path', save_path,
        '-oracle_mode', 'greedy',
        '-n_cpus', '4',
        '-log_file', log_file,
    ])

    # Report failures instead of discarding the exit status; tqdm.write keeps
    # the progress bar intact.
    if result.returncode != 0:
        tqdm.write("preprocess.py -mode format_to_bert exited with status {} for '{}'".format(result.returncode, subdir))