Dataset
==

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension ...
%reload_ext autoreload
# ... and re-import all modules before each cell runs, so edits to imported code are picked up.
%autoreload 2

In [2]:
#load needed packages
import os
import random
import warnings
from dataclasses import dataclass, field
from typing import List

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
# Folder layout for this notebook: everything lives under DATA_PATH,
# and the untouched downloads go into RAW_DATA.
DATA_PATH = 'data/'
RAW_DATA = f'{DATA_PATH}raw_data/'

# makedirs also creates missing parents, so this single call creates DATA_PATH too;
# exist_ok makes re-running the cell a no-op.
os.makedirs(RAW_DATA, exist_ok=True)

## 1. Loading Dataset


### 1.1. Download Dataset
First, we download and extract the Gigaword dataset (~3M) from [here](https://drive.google.com/file/d/0B6N7tANPyVeBNmlSX19Ld2xDU1E/view?usp=sharing).

In the meantime, we use the BBC News Summary dataset (7M), available here: https://www.kaggle.com/pariza/bbc-news-summary/download

The data is downloaded and stored under `data/raw_data/BBC News Summary`.

In [4]:
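#download data (Gigaword `summary.tar.gz` — same Drive file ID as the link in section 1.1)
# NOTE(review): the command below hard-codes a session cookie and a session-specific URL; regenerate both before use and do not commit live credentials.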
# !curl --header "Host: doc-0c-3o-docs.googleusercontent.com" --header "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" --header "Accept-Language: en-US,en;q=0.9" --header "Cookie: AUTH_s4a6oitvorbtivfcjm2pefm907l1ntir=08690680304265769485|1531648800000|pm1jthdhng09cikkb0pkdcqlod4d76p8" --header "Connection: keep-alive" "https://doc-0c-3o-docs.googleusercontent.com/docs/securesc/d1n9duui70mcvt9ph3953bv4foh1d3fm/pb1h3k6beg14nfv16poorm9mr6bl90e3/1531656000000/03129501499031348422/08690680304265769485/0B6N7tANPyVeBNmlSX19Ld2xDU1E?e=download" -o "summary.tar.gz" -L



In [5]:
#extract files 
# !tar -xzf summary.tar.gz -C {RAW_DATA} && mv summary.tar.gz {RAW_DATA}
# !gunzip {RAW_DATA}sumdata/train/*.*.txt.gz 

### 1.2. Import Datasets

In [6]:
@dataclass()
class BBCNewsDataReader:
    """Read the BBC News Summary corpus from disk into a pandas DataFrame.

    ``base_folder`` must contain a ``News Articles`` and a ``Summaries``
    folder. Each holds one sub-folder per category, and an article and its
    summary are matched by sharing the same category and filename.

    Attributes:
        base_folder: Root folder of the extracted dataset.
        exclusion: Category folder names to leave out when reading.
        encoding: Encoding used to read every text file. It is explicit so the
            set of readable files does not depend on the platform locale.
    """
    base_folder: str
    exclusion: List[str] = field(default_factory=list)
    encoding: str = 'utf-8'

    @property
    def news_articles_folder(self):
        """Path of the folder holding the per-category article folders."""
        return self.base_folder + '/News Articles'

    @property
    def summaries_folder(self):
        """Path of the folder holding the per-category summary folders."""
        return self.base_folder + '/Summaries'

    @property
    def categories(self):
        """Lazily yield the names of the category folders to read.

        Entries listed in ``exclusion``, the macOS ``.DS_Store`` file and any
        other non-directory entry are left out.
        """
        root = self.news_articles_folder
        skipped = ['.DS_Store', *self.exclusion]

        def is_category(name):
            # Only real folders are categories; a stray file would otherwise
            # make the later os.listdir() call on it fail.
            return name not in skipped and os.path.isdir(os.path.join(root, name))

        return filter(is_category, os.listdir(root))

    def to_df(self):
        """Load every article/summary pair into a DataFrame.

        Returns:
            A DataFrame with the columns ``article``, ``summary``, ``category``
            and ``filename``, one row per article file whose article and
            summary could both be decoded. The columns are present even when
            no file was read.

        Raises:
            FileNotFoundError: If an article has no summary file of the same
                name (raised by ``open``).

        Files that cannot be decoded with ``encoding`` are skipped, and a
        single warning listing them is emitted at the end.
        """
        columns = ['article', 'summary', 'category', 'filename']
        records = []
        undecodable = []
        for category, article_folder, summary_folder in self.__category_folders():
            for filename in os.listdir(article_folder):
                article_path = os.path.join(article_folder, filename)
                if not os.path.isfile(article_path):
                    continue
                try:
                    article = self.__read_file(article_path)
                    summary = self.__read_file(os.path.join(summary_folder, filename))
                except UnicodeDecodeError:
                    # Best effort: keep going, but remember what was dropped.
                    undecodable.append(f'{category}/{filename}')
                    continue
                records.append({
                    'article': article,
                    'summary': summary,
                    'category': category,
                    'filename': filename,
                })
        if undecodable:
            warnings.warn(
                f'Skipped {len(undecodable)} file(s) that could not be decoded as '
                f'{self.encoding}: {", ".join(undecodable)}',
                stacklevel=2,
            )
        # Build the frame once from the collected rows instead of appending
        # row by row (DataFrame.append no longer exists in pandas 2).
        return pd.DataFrame(records, columns=columns)

    def __category_folders(self):
        """Return ``(category, article_folder, summary_folder)`` for each category."""
        return [
            (category, f'{self.news_articles_folder}/{category}', f'{self.summaries_folder}/{category}')
            for category in self.categories
        ]

    def __read_file(self, filepath):
        """Return the full text of ``filepath`` decoded with ``self.encoding``."""
        with open(filepath, encoding=self.encoding) as file:
            return file.read()


In [9]:
# Read the BBC corpus from the raw-data folder configured above (RAW_DATA ends with '/').
data = BBCNewsDataReader(
    base_folder=f'{RAW_DATA}BBC News Summary',
    # exclusion=['entertainment', 'tech', 'sport', 'politics'],  # leave commented out to read all data
).to_df()

In [10]:
data.shape

(2224, 4)

In [11]:
data.head()

Unnamed: 0,article,summary,category,filename
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,289.txt
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment,262.txt
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,276.txt
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment,060.txt
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment,074.txt


In [None]:
#TODO: Add Validation Split (later, using train and test for now)

In [12]:
data.category.value_counts()

business         510
sport            510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [13]:
# 80/20 split, stratified on the category column; fixed seed for reproducibility
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(data, data['category']))
# split() returns row positions, not index labels, so select with .iloc
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]
# every row must land in exactly one of the two sets
assert len(strat_train_set) + len(strat_test_set) == len(data)

In [14]:
strat_test_set.category.value_counts()

business         102
sport            102
politics          84
tech              80
entertainment     77
Name: category, dtype: int64

In [15]:
strat_train_set.category.value_counts()

business         408
sport            408
politics         333
tech             321
entertainment    309
Name: category, dtype: int64

## 2. Preprocessing Dataset

In [16]:
#TODO: Add stratified sample by category

In [17]:
# for i in range(5):
#     r = random.randint(0,50)
#     display(data.iloc[r]['article'])
#     display(data.iloc[r]['summary'])


In [18]:
# Output folder for the train/validation CSVs. Unlike DATA_PATH and RAW_DATA it has
# no trailing slash, so paths built from it need an explicit '/' separator.
SAMPLE_DATA_PATH = 'data/sample_data'

In [20]:
# Save the train and validation datasets. No separate validation split exists yet,
# so the held-out test split is written out as the validation file.
os.makedirs(SAMPLE_DATA_PATH, exist_ok=True)  # to_csv does not create missing folders
strat_train_set.to_csv(f'{SAMPLE_DATA_PATH}/train_ds.csv', index=False)
strat_test_set.to_csv(f'{SAMPLE_DATA_PATH}/valid_ds.csv', index=False)

### Create Smaller Sample Data (optional)

In [22]:
# sample_train_ = strat_train_set.sample(64)
# sample_val_ = strat_test_set.sample(16)

In [23]:
# len(sample_train_), len(sample_val_)

In [11]:
# # save the smaller sample train and validation datasets
# sample_train_.to_csv(f'{SAMPLE_DATA_PATH}/train_ds_.csv', index=None)
# sample_val_.to_csv(f'{SAMPLE_DATA_PATH}/valid_ds_.csv', index=None)