In [2]:
!pip install datasets huggingface-hub pandas



In [3]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import list_repo_files

In [4]:
# Ask the Hub for the path of every file in the dataset repository and keep
# the result as a plain list; the cell then displays how many paths came back.
files = [
    *list_repo_files(
        repo_id='davanstrien/AmericanStories-parquet',
        repo_type='dataset',
    )
]
len(files)

266

In [5]:
# Bucket the repository's parquet shards by year, for 1938 through 1945.
# Shard paths look like 'data/<year>-<shard>-of-<total>-<hash>.parquet', so the
# year is the first '-'-separated token of the file's basename.
years = {str(year): [] for year in range(1938, 1946)}
for file in files:
    file_year = file.split('/')[-1].split("-")[0]
    # Exact key lookup instead of a substring test: a basename whose prefix
    # merely *contains* a year string must not be filed under that year.
    # Files outside the 1938-1945 range are skipped.
    if file_year in years:
        years[file_year].append(file)
print(years)

{'1938': ['data/1938-00000-of-00002-091c3285e9718147.parquet', 'data/1938-00001-of-00002-ee8604fd83ff776f.parquet'], '1939': ['data/1939-00000-of-00002-fc041e884dae3517.parquet', 'data/1939-00001-of-00002-00a4ab7e54bcd155.parquet'], '1940': ['data/1940-00000-of-00001-cc9e4c07d2868aac.parquet'], '1941': ['data/1941-00000-of-00002-90d725e865fcd277.parquet', 'data/1941-00001-of-00002-ff89c400ae6e2954.parquet'], '1942': ['data/1942-00000-of-00002-73a4a9ef208b48aa.parquet', 'data/1942-00001-of-00002-f2b2743c00a1e8b3.parquet'], '1943': ['data/1943-00000-of-00001-08d4eecdd9d63294.parquet'], '1944': ['data/1944-00000-of-00001-0898daee07832f8f.parquet'], '1945': ['data/1945-00000-of-00002-f78a1735b619bf2b.parquet', 'data/1945-00001-of-00002-eea83a5833687118.parquet']}


In [6]:
# Load one Dataset per year from that year's parquet shards.
# The loop variable is deliberately NOT called `files`: that name holds the full
# repository listing from an earlier cell, and rebinding it here would make the
# year-bucketing cell produce different results if it were re-run afterwards.
datasets = []
for year, year_files in years.items():
    dataset = load_dataset(
        "davanstrien/AmericanStories-parquet",
        # Expose this year's shards as a split named after the year...
        data_files={year: year_files},
        # ...and request exactly that split so a Dataset is returned.
        split=year,
        # Skip the library's verification checks for this partial file selection.
        verification_mode='no_checks'
    )
    datasets.append(dataset)

print(datasets)

[Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 665274
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 556283
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 496662
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 637200
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 523923
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 467200
}), Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 433769
})

In [7]:
# Stack the per-year datasets (in the order they were loaded) into one dataset.
combined_dataset = concatenate_datasets(datasets)
# Display the combined dataset's summary (features and row count).
combined_dataset

Dataset({
    features: ['article_id', 'newspaper_name', 'edition', 'date', 'page', 'headline', 'byline', 'article'],
    num_rows: 4368788
})

In [8]:
# Materialise the combined dataset as a pandas DataFrame.
# `Dataset.to_pandas()` is the library's own conversion; handing the Dataset to
# `pd.DataFrame(...)` instead makes pandas consume it as a generic Python
# iterable, which is far slower and more memory-hungry at this size.
df = combined_dataset.to_pandas()
df

Unnamed: 0,article_id,newspaper_name,edition,date,page,headline,byline,article
0,1_1938-11-08_p3_sn82014085_00393347429_1938110...,The Waterbury Democrat.,01,1938-11-08,p3,Fear Heavy Toll Among Civilians In Next Conflict,,Recognition of a probable heavy toll among non...
1,3_1938-11-08_p3_sn82014085_00393347429_1938110...,The Waterbury Democrat.,01,1938-11-08,p3,,,"Conforming to tradition, the Democratic candid..."
2,4_1938-11-08_p3_sn82014085_00393347429_1938110...,The Waterbury Democrat.,01,1938-11-08,p3,Audience Thrilled By\n\n Early Masters Works\n...,,second by Kasper Ferdinand Fisch- CT.\n\n FOlk...
3,5_1938-11-08_p3_sn82014085_00393347429_1938110...,The Waterbury Democrat.,01,1938-11-08,p3,Democrats Institute Court Action T oday\n\n To...,,in behalf of Charles Maloney of Se4 East Main ...
4,6_1938-11-08_p3_sn82014085_00393347429_1938110...,The Waterbury Democrat.,01,1938-11-08,p3,q WOMEN SEEK\n\n ELECTION JOBS\n\nNone Candida...,BY RUBY A. BLACK nifAd preea Sfaff Corresnovad...,NUnlte0 freSS Stam COrreSpOn0ent)\n\n Washingt...
...,...,...,...,...,...,...,...,...
4368783,28_1945-12-05_p7_sn88063294_00340589130_194512...,Detroit evening times.,01,1945-12-05,p7,,,Here's a contest you win! And what prize!\nA b...
4368784,10_1945-04-18_p35_sn83045462_00280604082_19450...,Evening star.,01,1945-04-18,p35,"ADVERTISEMENT,\n\nADNL n lSLhLt>1\nTorment OF ...",,II you can't get your feet of your mind\nbecau...
4368785,3_1945-04-18_p35_sn83045462_00280604082_194504...,Evening star.,01,1945-04-18,p35,Jury Rules Chaplin\nIs.Father;; Conference\nOn...,By the Associated Press.,"By the Associated Press.\n\n\nLOS ANGELES, Apr..."
4368786,21_1945-04-18_p35_sn83045462_00280604082_19450...,Evening star.,01,1945-04-18,p35,Use Your\nBeIdqet Account,,"It's easy as A-B-C to open a\nCharge, Budget o..."


In [10]:
# Persist the combined frame as a single parquet file. The output directory is
# created first so the write does not fail on a checkout without ../data.
output_path = Path('../data/american_stories_1938_1945.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path)