In [185]:
# Merge all
import glob
import json
import os
from datetime import datetime

def is_valid_date(date_str):
    """Report whether ``date_str`` is already in the "%d %B %Y" format.

    ``None`` (a missing date) counts as valid, so callers that only reformat
    invalid values leave it untouched.

    Returns:
        True when ``date_str`` is None or parses with "%d %B %Y"
        (e.g. "14 June 2022"); False when parsing raises ValueError.
    """
    if date_str is not None:
        try:
            # Only the success/failure of the parse matters; the value is discarded.
            datetime.strptime(date_str, "%d %B %Y")
        except ValueError:
            return False
    return True

def make_date_format(date_str):
    """Convert a "Month D, YYYY" date string to the "DD Month YYYY" format.

    Args:
        date_str: A date such as "June 14, 2022", a date already in the
            target format such as "14 June 2022", or None.

    Returns:
        The date rendered with "%d %B %Y" (day zero-padded, e.g.
        "05 January 1999"), or None when ``date_str`` is None.

    Raises:
        ValueError: If ``date_str`` matches neither "%B %d, %Y" nor "%d %B %Y".
    """
    if date_str is None:
        return None
    try:
        date_obj = datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # Accept input that is already in the target format so the function is
        # safe to call on normalised data; anything else still raises ValueError.
        date_obj = datetime.strptime(date_str, "%d %B %Y")
    return date_obj.strftime("%d %B %Y")

# Merged output file. It lives in the same directory the inputs are globbed
# from, so it has to be excluded below; otherwise re-running this cell would
# read the previous output back in and duplicate every article.
output_path = 'articles/all_articles.json'

# glob returns matches in arbitrary order; sorting makes the article order in
# the merged file reproducible across runs and machines.
json_files = sorted(
    path for path in glob.glob('articles/*.json')
    if os.path.normpath(path) != os.path.normpath(output_path)
)
print(json_files)

all_articles = []

# Each input file is loaded as one JSON value and its items are appended.
for filepath in json_files:
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    all_articles.extend(data)

# Rewrite dates that are not already in "%d %B %Y"; None dates count as valid
# in is_valid_date and are left untouched.
for article in all_articles:
    if not is_valid_date(article['date']):
        article['date'] = make_date_format(article['date'])

with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(all_articles, file, ensure_ascii=False, indent=4)

['articles/NCE_articles_NEW.json', 'articles/NucNet_articles_NEW.json', 'articles/WNN_articles.json', 'articles/NEI_articles_NEW.json']


In [216]:
# Data exploration
import pandas as pd
import random
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

def count_words(paragraphs):
    """Count whitespace-separated words across an article's paragraphs.

    Args:
        paragraphs: An iterable of paragraph strings (falsy entries such as
            "" or None are skipped), a single string, or a missing value
            (None or float NaN).

    Returns:
        The total number of words as an int; 0 for a missing value.
    """
    if paragraphs is None:
        return 0
    # NaN is the only value not equal to itself; treat it like None (missing).
    if isinstance(paragraphs, float) and paragraphs != paragraphs:
        return 0
    if isinstance(paragraphs, str):
        # Iterating a bare string would walk it character by character and
        # return a character count instead of a word count.
        return len(paragraphs.split())
    return sum(len(paragraph.split()) for paragraph in paragraphs if paragraph)

def get_stats(data):
    """Print corpus-wide totals and a per-magazine breakdown for ``data``.

    Side effect: writes a ``num_words`` column onto ``data`` itself, holding
    ``count_words`` applied to each row's ``text``.

    Args:
        data: DataFrame with the columns 'text', 'date' and 'magasine'.
            ``.date()`` is called on the earliest/latest 'date' value, so the
            column is expected to hold Timestamp-like values.

    Returns:
        None. Everything is printed.
    """
    # Deliberately kept as an in-place assignment: the caller's frame gains the
    # column, exactly as before.
    data.loc[:,'num_words'] = data['text'].apply(count_words)

    rows = data.shape[0]
    num_articles_total = f'{rows:,}'
    num_words_total = int(data['num_words'].sum())

    # Missing dates are dropped before taking min/max. The built-in min()/max()
    # compare element by element, and every comparison with NaT is False, so
    # their result depended on where the NaT rows happened to sit.
    valid_dates = data['date'].dropna()
    if valid_dates.empty:
        earliest = latest = 'n/a'
    else:
        earliest = valid_dates.min().date()
        latest = valid_dates.max().date()

    article_breakdown = data.groupby('magasine').agg(
        num_articles=('text', 'size'),
        total_words=('num_words', 'sum')
    ).reset_index()
    # Percentages must be computed while the count columns are still numeric.
    article_breakdown['percentage_articles'] = (article_breakdown['num_articles'] / rows) * 100
    article_breakdown['percentage_words'] = (article_breakdown['total_words'] / num_words_total) * 100

    # Format numbers
    for column in ('num_articles', 'total_words'):
        article_breakdown[column] = article_breakdown[column].apply("{:,}".format)
    for column in ('percentage_articles', 'percentage_words'):
        article_breakdown[column] = article_breakdown[column].apply("{:.2f}%".format)
    num_words_formatted = f'{num_words_total:,}'

    print('-'*40)
    print(f'Total num articles: {num_articles_total:>20}')
    print(f'Total num words: {num_words_formatted:>23}')
    print(f'Earliest date: {str(earliest):>25}')
    print(f'Latest date: {str(latest):>27}\n')
    print(article_breakdown)
    print('-'*98)
    print('\n')

In [217]:
import pandas as pd

file_path = 'articles/all_articles.json'

# Try loading the JSON file
try:
    df = pd.read_json(file_path)
except ValueError as e:
    print(f"Error loading JSON: {e}")
    # Re-raise after reporting: continuing would either hit a NameError on
    # `df` (fresh kernel) or silently reuse a stale `df` from an earlier run.
    raise
else:
    print("JSON loaded successfully.")

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

JSON loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7929 entries, 0 to 7928
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       7929 non-null   object        
 1   magasine  7929 non-null   object        
 2   title     7904 non-null   object        
 3   author    7488 non-null   object        
 4   date      7905 non-null   datetime64[ns]
 5   text      7904 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 371.8+ KB
None


In [218]:
# Statistics for the full corpus first.
get_stats(df)

# Then restrict to articles dated 2022-06-14 through 2024-06-14, both bounds
# included; rows with a missing date fail the range test and are dropped.
df['date'] = pd.to_datetime(df['date'])
earliest, latest = pd.to_datetime('2022-06-14'), pd.to_datetime('2024-06-14')

df_updated = df[df['date'].between(earliest, latest)]

# Same statistics for the windowed subset.
get_stats(df_updated)

----------------------------------------
Total num articles:                7,929
Total num words:               4,259,281
Earliest date:                1999-01-14
Latest date:                  2024-06-25

                            magasine num_articles total_words percentage_articles percentage_words
0                 New Civil Engineer          174     139,671               2.19%            3.28%
1                             NucNet          892     367,935              11.25%            8.64%
2  Nuclear Engineering International        6,451   3,494,769              81.36%           82.05%
3                 World Nuclear News          412     256,906               5.20%            6.03%
--------------------------------------------------------------------------------------------------


----------------------------------------
Total num articles:                3,597
Total num words:               2,102,514
Earliest date:                2022-06-14
Latest date:                  2024