In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import random
import pickle
import os

In [5]:
def readData(num_to_keep, path='abcnews.csv', seed=None):
    """Load the headlines CSV and keep a random subset of its rows.

    Args:
        num_to_keep: Number of rows to retain. Must be between 0 and the
            number of rows in the file (inclusive).
        path: CSV file to read; it must contain a 'headline_text' column.
            Defaults to 'abcnews.csv' in the working directory.
        seed: Optional seed for the row selection. When None the global
            NumPy random state is used, so np.random.seed(...) still applies.

    Returns:
        dict mapping each retained row's original index label to
        {'headline_text': <headline>}. Retained rows keep their file order.

    Raises:
        ValueError: if num_to_keep is negative or exceeds the row count.
    """
    data = pd.read_csv(path)
    data = data[['headline_text']]

    total_rows = data.shape[0]
    if not 0 <= num_to_keep <= total_rows:
        # Fail early with a readable message instead of NumPy's
        # "negative dimensions" / "larger sample than population" errors.
        raise ValueError(
            f'num_to_keep must be between 0 and {total_rows}, got {num_to_keep}'
        )

    # A dedicated RandomState makes the sample repeatable without reseeding
    # the global generator; without a seed we keep using the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    num_to_remove = total_rows - num_to_keep
    entries_to_remove = rng.choice(data.index, num_to_remove, replace=False)

    # orient='index' produces {row_label: {column: value}} directly.
    return data.drop(entries_to_remove).to_dict(orient='index')

def processData(data):
    """Tokenize each headline and wrap it in sentence-boundary markers.

    Args:
        data: dict whose values are dicts holding a 'headline_text' string
            (the structure readData returns).

    Returns:
        list of token lists, one per entry of data in iteration order. Each
        list is '<START>', the whitespace-split words, then '<END>'.
    """
    return [
        ['<START>'] + entry['headline_text'].split() + ['<END>']
        for entry in data.values()
    ]

# Borrowed from LIGN 167 homework 4 starter code
def make_word_to_ix(sents):
    """Assign every distinct word an integer id in order of first appearance.

    Args:
        sents: iterable of sentences, each an iterable of hashable words.

    Returns:
        dict mapping word -> id, where ids run 0, 1, 2, ... in the order the
        words are first encountered.
    """
    vocab = {}
    for sentence in sents:
        for token in sentence:
            # setdefault leaves known words untouched; a new word receives the
            # current vocabulary size, which is the next unused id.
            vocab.setdefault(token, len(vocab))
    return vocab

def encodeHeadlines(headlines, word_index):
    """Translate tokenized headlines into lists of integer ids.

    Args:
        headlines: iterable of token sequences.
        word_index: mapping from token to integer id; it is indexed directly,
            so a token missing from a plain dict raises KeyError.

    Returns:
        list with one list of ids per headline, in the same order.
    """
    return [[word_index[token] for token in headline] for headline in headlines]

def createData(num_sample, train_portion, data_version):
    """Sample headlines, encode them, and save a train/test split to disk.

    The following files are written to data/<data_version>/ (the directory is
    created if it does not exist):
        description.txt  one line with the train and test sizes
        word_ix.pkl      dict mapping word -> integer id
        ix_word.pkl      defaultdict(str) mapping integer id -> word
        train.pkl        the first int(num_sample * train_portion) encoded headlines
        test.pkl         the remaining encoded headlines

    Args:
        num_sample: number of headlines to sample (passed to readData).
        train_portion: fraction of num_sample that goes into the train split.
        data_version: name of the sub-directory of 'data' to write into.

    Returns:
        None. The vocabulary size and sample count are printed.
    """
    data = readData(num_sample)
    headlines = processData(data)
    word_ix = make_word_to_ix(headlines)
    encoded_headlines = encodeHeadlines(headlines, word_ix)

    print('length of word_ix: ', len(word_ix))
    print('length of data: ', len(data))

    # split train/test
    numTrain = int(num_sample * train_portion)
    train = encoded_headlines[:numTrain]
    test = encoded_headlines[numTrain:]

    # Reverse lookup (id -> word). Kept as defaultdict(str) so the pickled
    # object has the same type as before for anything that loads it.
    ix_word = defaultdict(str)
    for word, ix in word_ix.items():
        ix_word[ix] = word

    # create a folder for the version to store the related stuff;
    # exist_ok avoids the check-then-create race of a separate exists() test
    path = os.path.join('data', data_version)
    os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, 'description.txt'), 'w') as desc:
        desc.write(f'train: {len(train)}, test: {len(test)}' + '\n')

    artifacts = {
        'word_ix.pkl': word_ix,
        'ix_word.pkl': ix_word,
        'train.pkl': train,
        'test.pkl': test,
    }
    for filename, obj in artifacts.items():
        # Context manager guarantees each file is flushed and closed.
        with open(os.path.join(path, filename), 'wb') as fh:
            pickle.dump(obj, fh)

In [6]:
# Build dataset version 'v9': 15000 sampled headlines, 85% of them for training.
createData(num_sample=15000, train_portion=0.85, data_version='v9')

length of word_ix:  16217
length of data:  15000
