In [4]:
# Set the 'magasine' field of every article in the NEI file to the magazine's
# full name, then write the file back in place.
import json
import pandas as pd

filepath = 'articles/NEI_articles.json'

# Read the whole file and parse it as JSON.
with open(filepath, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

# Each entry is updated in place; the key spelling 'magasine' is the one the
# rest of the notebook groups by.
for article in data:
    article['magasine'] = 'Nuclear Engineering International'

# Overwrite the source file with the updated records (non-ASCII characters are
# written as-is, 4-space indentation).
with open(filepath, 'w', encoding='utf-8') as file:
    file.write(json.dumps(data, ensure_ascii=False, indent=4))


In [None]:
# Data exploration, check consistency etc.
# Prints every distinct value of the 'author' column, one per line.
# Relies on `pd` having been imported by an earlier cell.
# NOTE(review): this reads 'NCE_articles.json' from the working directory,
# while the other cells read from 'articles/' — confirm the path is intended.
df = pd.read_json('NCE_articles.json')
a = df['author'].unique()
for e in a:
    print(e)


In [1]:
# Merging articles: concatenate every per-magazine JSON file in 'articles/'
# into a single list and write it to one merged file.
import glob
import json
import os

# The merged file is written into the same directory that the glob pattern
# scans, so it must be excluded from the inputs. Otherwise re-running this cell
# would read the previous output back in and duplicate every article.
output_path = 'articles/all_articles_NEW.json'
_output_key = os.path.normcase(os.path.abspath(output_path))

# Sorted so the order of articles in the merged file does not depend on the
# order in which the filesystem happens to list the directory.
json_files = sorted(
    path for path in glob.glob('articles/*.json')
    if os.path.normcase(os.path.abspath(path)) != _output_key
)
# NOTE(review): any other previously merged file kept in 'articles/' would
# still match the pattern and be merged again — confirm the directory only
# holds per-magazine files.

all_articles = []

for filepath in json_files:
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Each input file is expected to hold a JSON array of article objects.
    all_articles.extend(data)

with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(all_articles, file, ensure_ascii=False, indent=4)

In [98]:
# Data exploration
# Configures pandas' display options and loads the merged articles file into
# `df`, which the following cells operate on.
import pandas as pd
# No limit on displayed columns and a 1000-character display width, so wide
# frames are not truncated or wrapped when printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# File written by the "Merging articles" cell.
df = pd.read_json('articles/all_articles_NEW.json')

In [99]:
def count_words(paragraphs):
    """Count the whitespace-separated words in an article's text.

    Args:
        paragraphs: An iterable of paragraph strings. A single string is
            treated as one paragraph. ``None`` and float NaN (how pandas
            represents a missing value) count as no text.

    Returns:
        int: Total number of words across all non-empty paragraphs.
    """
    if paragraphs is None:
        return 0
    # NaN is the only value that is not equal to itself.
    if isinstance(paragraphs, float) and paragraphs != paragraphs:
        return 0
    # Iterating a bare string would yield single characters and count
    # non-whitespace characters instead of words.
    if isinstance(paragraphs, str):
        paragraphs = [paragraphs]
    # Falsy entries (empty strings, None) are skipped.
    return sum(len(paragraph.split()) for paragraph in paragraphs if paragraph)

def get_stats(data):
    """Print summary statistics for a DataFrame of articles.

    Prints the total number of articles and words, the earliest and latest
    article dates, and a per-magazine breakdown (article count, word count and
    each one's share of the respective total).

    Args:
        data: DataFrame with the columns 'date', 'num_words', 'magasine' and
            'text'. 'date' must hold datetime values, since ``.date()`` is
            called on its minimum and maximum.

    Returns:
        None. The report is written to stdout.
    """
    num_articles_total = data.shape[0]
    num_words_total = data['num_words'].sum()

    # Builtin min()/max() compare element by element, and every comparison
    # against NaT is False, so a missing date could end up as the result.
    # Dropping missing dates first and using the Series methods avoids that.
    dates = data['date'].dropna()
    if dates.empty:
        earliest = latest = 'n/a'
    else:
        earliest = dates.min().date()
        latest = dates.max().date()

    # One row per magazine: number of articles and summed word count.
    article_breakdown = data.groupby('magasine').agg(
        num_articles=('text', 'size'),
        total_words=('num_words', 'sum')
    ).reset_index()
    article_breakdown['percentage_articles'] = (article_breakdown['num_articles'] / num_articles_total) * 100
    article_breakdown['percentage_words'] = (article_breakdown['total_words'] / num_words_total) * 100

    # Format numbers (done last: the columns become strings from here on)
    article_breakdown['num_articles'] = article_breakdown['num_articles'].apply(lambda x: f"{x:,}")
    article_breakdown['total_words'] = article_breakdown['total_words'].apply(lambda x: f"{x:,}")
    article_breakdown['percentage_articles'] = article_breakdown['percentage_articles'].apply(lambda x: f"{x:.2f}%")
    article_breakdown['percentage_words'] = article_breakdown['percentage_words'].apply(lambda x: f"{x:.2f}%")
    rows = f'{num_articles_total:,}'
    num_words_formatted = f'{num_words_total:,}'

    print(f'Total num articles: {rows:>20}')
    print(f'Total num words: {num_words_formatted:>23}')
    print(f'Earliest date: {str(earliest):>25}')
    print(f'Latest date: {str(latest):>27}\n')
    print(article_breakdown)

# Word counts are computed once on the full frame; the filtered frame below
# carries the column over, so it does not need to be recomputed.
df['num_words'] = df['text'].apply(count_words)

cutoff = '2022-06-14' # Exactly 2 years before the latest article date (2024-06-14)
# .copy() makes df_new an independent frame rather than a slice of df, so
# later assignments to it neither raise SettingWithCopyWarning nor depend on
# whether pandas returned a view.
df_new = df[df['date'] >= cutoff].copy()
get_stats(df_new)

# Presumably the train/test/validation split fractions; they are not used in
# this cell — TODO confirm where they are consumed.
training = .60
testing = .20
validation = .20

Total num articles:                3,542
Total num words:               2,072,201
Earliest date:                2022-06-14
Latest date:                  2024-06-14

                            magasine num_articles total_words percentage_articles percentage_words
0                 New Civil Engineer          137     111,629               3.87%            5.39%
1                             NucNet          376     179,656              10.62%            8.67%
2  Nuclear Engineering International        2,995   1,758,646              84.56%           84.87%
3                 World Nuclear News           34      22,270               0.96%            1.07%


In [34]:
# Sanity check of the word-counting approach on a sentence of known length.
text = 'hello this is a little test'

# split() with no argument splits on any run of whitespace, which is what
# count_words does; split(' ') would count an empty token for every repeated
# space.
print(len(text.split()))

6
