In [1]:
from pathlib import Path
import os
from zipfile import ZipFile
import pandas as pd

In [2]:
def load_data():
    raw_zipfile_path = Path(os.getcwd()) / "data" / "news_aggregator.zip"
    raw_zipfile = ZipFile(raw_zipfile_path, metadata_encoding="utf-8")
    with raw_zipfile:
        with raw_zipfile.open("newsCorpora.csv") as news_corpora:
            df = pd.read_csv(
                news_corpora,
                sep="\t",
                header=None,
                names=[
                    "ID",
                    "TITLE",
                    "URL",
                    "PUBLISHER",
                    "CATEGORY",
                    "STORY",
                    "HOSTNAME",
                    "TIMESTAMP",
                ],
            )
            return df


# Load the corpus once into a module-level frame; p50() later in this
# notebook reads this `df` directly.
df = load_data()
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather, should not slow taper",http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change in pace of tapering,http://www.livemint.com/Politics/H2EvwJSK2VE6OF7iK1g3PP/Feds-Charles-Plosser-sees-high-bar-for-c...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints at accelerated tapering,http://www.ifamagazine.com/news/us-open-stocks-fall-after-fed-official-hints-at-accelerated-tape...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles Plosser says",http://www.ifamagazine.com/news/fed-risks-falling-behind-the-curve-charles-plosser-says-294430,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Growth,http://www.moneynews.com/Economy/federal-reserve-charles-plosser-weather-job-growth/2014/03/10/i...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
...,...,...,...,...,...,...,...,...
422414,422933,Surgeons to remove 4-year-old's rib to rebuild damaged throat - CBS 3 ...,http://www.cbs3springfield.com/story/26378648/surgeons-removed-4-year-olds-rib-to-rebuild-his-da...,WSHM-TV,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.cbs3springfield.com,1409229190251
422415,422934,Boy to have surgery on esophagus after battery burns hole in throat,http://www.wlwt.com/news/boy-to-have-surgery-on-esophagus-after-battery-burns-hole-in-throat/277...,WLWT Cincinnati,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wlwt.com,1409229190508
422416,422935,Child who swallowed battery to have reconstructive surgery at Cincinnati ...,http://www.newsnet5.com/news/local-news/child-who-swallowed-battery-to-have-reconstructive-surge...,NewsNet5.com,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.newsnet5.com,1409229190771
422417,422936,Phoenix boy undergoes surgery to repair throat damage - WFSB 3 Connecticut,http://www.wfsb.com/story/26368078/phoenix-boy-undergoes-surgery-to-repair-throat-damage-from-bu...,WFSB,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wfsb.com,1409229191071


In [15]:
from sklearn.model_selection import train_test_split

def p50(seed: int = 42):
    """Split the selected publishers' articles 80/10/10 and write them to disk.

    Reads the module-level ``df``, keeps the rows whose PUBLISHER is one of
    the five publishers listed below, shuffles them, and splits them into
    train / validation / test sets. The CATEGORY and TITLE columns of each
    split are written as tab-separated files without header or index to
    ``data/train.txt``, ``data/valid.txt`` and ``data/test.txt`` under the
    current working directory.

    Args:
        seed: Random state passed to the shuffle and to both splits so that
            repeated runs produce the same three files.

    Raises:
        AssertionError: if the number of selected rows or the shape of any
            split differs from the expected values.
    """
    selected_publishers = {
        "Reuters",
        "Huffington Post",
        "Businessweek",
        "Contactmusic.com",
        "Daily Mail",
    }
    selected_articles = df[df["PUBLISHER"].isin(selected_publishers)]
    shuffled_articles = selected_articles.sample(frac=1, random_state=seed)
    assert len(shuffled_articles) == 13340

    # https://stackabuse.com/scikit-learns-traintestsplit-training-testing-and-validation-sets/
    train_ratio = 0.8
    validation_ratio = 0.1
    test_ratio = 0.1

    # First carve off the test set from the full selection.
    train_and_validation, test_set = train_test_split(
        shuffled_articles,
        test_size=test_ratio,
        random_state=seed,
    )
    # What remains is train + validation, so the validation share must be
    # taken relative to that remainder (not relative to train + test).
    train_set, validation_set = train_test_split(
        train_and_validation,
        test_size=validation_ratio / (train_ratio + validation_ratio),
        random_state=seed,
    )

    assert train_set.shape == (10672, 8)
    assert validation_set.shape == (1334, 8)
    assert test_set.shape == (1334, 8)

    output_dir = Path(os.getcwd()) / "data"

    def dump(selected: pd.DataFrame, output_filename: str) -> None:
        """Write CATEGORY and TITLE of ``selected`` as a headerless TSV file."""
        selected.loc[:, ["CATEGORY", "TITLE"]].to_csv(
            output_dir / output_filename,
            sep="\t",
            index=False,
            header=False,
        )

    dump(train_set, "train.txt")
    dump(validation_set, "valid.txt")
    dump(test_set, "test.txt")


# Build the splits and write train.txt / valid.txt / test.txt under ./data.
p50()