<a href="https://colab.research.google.com/github/tanawinvisa/nature-webscrape-data/blob/main/DataForNetworkViz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import os
import zipfile

In [None]:
# Download one zip archive per year into the current working directory.
years = range(2018, 2024)  # 2018 through 2023 inclusive (the range end is exclusive)
for year in years:
    # Remote archives are named "<year>_test.zip" in the DSDE_Project repository.
    url = f"https://github.com/nnatchy/DSDE_Project/raw/main/{year}_test.zip"
    # IPython shell escape: {url} and {year} are interpolated from the Python
    # variables above; -O saves the download locally as "<year>.zip".
    !wget {url} -O {year}.zip

--2024-05-06 19:53:43--  https://github.com/nnatchy/DSDE_Project/raw/main/2018_test.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nnatchy/DSDE_Project/main/2018_test.zip [following]
--2024-05-06 19:53:43--  https://raw.githubusercontent.com/nnatchy/DSDE_Project/main/2018_test.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6913340 (6.6M) [application/zip]
Saving to: ‘2018.zip’


2024-05-06 19:53:43 (241 MB/s) - ‘2018.zip’ saved [6913340/6913340]

--2024-05-06 19:53:43--  https://github.com/nnatchy/DSDE_Project/raw/main/2019_test.zip
Resolving github.com (github.com)... 140.82.114.4
Con

In [None]:
def process_files(year_range, base_path):
    """Unpack the per-year zip archives found under ``base_path``.

    For every year in ``year_range`` the directory ``{base_path}/{year}/`` is
    created if it does not exist yet, and ``{base_path}/{year}.zip`` (when
    present) is extracted into it. Years without an archive still get their
    (empty) directory but are otherwise skipped.

    Args:
        year_range: Iterable of years to look for.
        base_path: Directory that holds the ``<year>.zip`` archives and that
            receives the per-year output directories.

    Returns:
        list: The years whose archive was found and extracted, in iteration
        order.
    """
    batch_years = []
    for year in year_range:
        # Every year uses the same layout, so no per-year special case is needed.
        year_directory = f"{base_path}/{year}/"

        # Ensure the target directory exists before extracting into it.
        os.makedirs(year_directory, exist_ok=True)

        zip_file_path = os.path.join(base_path, f'{year}.zip')
        if not os.path.exists(zip_file_path):
            continue

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(year_directory)
        print(f"Extracted {zip_file_path} to {year_directory}")
        batch_years.append(year)

    return batch_years

# Unpack each downloaded "<year>.zip" found in the current directory into "./<year>/".
by = process_files(range(2018, 2024), ".")

Extracted ./2018.zip to ./2018/
Extracted ./2019.zip to ./2019/
Extracted ./2020.zip to ./2020/
Extracted ./2021.zip to ./2021/
Extracted ./2022.zip to ./2022/
Extracted ./2023.zip to ./2023/


In [None]:
def process_author(bibrecord_data):
    """Flatten the author information of one bibliographic record.

    Reads ``bibrecord_data["head"]`` and emits one entry per author found in
    its ``"author-group"`` value, each entry carrying the affiliation of the
    group the author was listed under. ``"author-group"`` and ``"author"`` may
    each be a single dict or a list of dicts; a single dict is treated as a
    one-element list.

    Args:
        bibrecord_data: Mapping holding a ``"head"`` mapping.

    Returns:
        dict with the keys ``author_groups`` (list of author entries),
        ``correspondence`` and ``enhancement`` (always empty lists),
        ``citation_title`` and ``abstracts`` (copied from the head, ``""``
        when absent).
    """
    head = bibrecord_data.get("head", {})
    title = head.get("citation-title", "")
    abstracts = head.get("abstracts", "")

    groups = head.get("author-group", [])
    if isinstance(groups, dict):
        groups = [groups]

    flattened = []
    for group in groups:
        affiliation = group.get("affiliation", {})

        # "organization" is either one {"$": name} dict or a list of them;
        # list items that are not dicts are ignored.
        organization = affiliation.get("organization", [])
        if isinstance(organization, dict):
            org_names = [organization.get("$", "")]
        else:
            org_names = [
                entry.get("$", "")
                for entry in organization
                if isinstance(entry, dict)
            ]

        members = group.get("author", [])
        if isinstance(members, dict):
            members = [members]

        for member in members:
            preferred = member.get("preferred-name", {})
            flattened.append({
                "indexed-name": preferred.get("ce:indexed-name", ""),
                "seq": member.get("@seq", ""),
                "auid": member.get("@auid", ""),
                "affiliation": {
                    "affiliation_id": affiliation.get("@afid", ""),
                    "dpt_id": affiliation.get("@dptid", ""),
                    "country": affiliation.get("country", ""),
                    "organization": org_names,
                },
            })

    return {
        "author_groups": flattened,
        "correspondence": [],
        "enhancement": [],
        "citation_title": title,
        "abstracts": abstracts,
    }

In [None]:
# with open("./2018/2018 copy/201800010", 'r') as file:
#     json_data = json.load(file)
# abstracts_info = json_data.get("abstracts-retrieval-response", {})["item"]["bibrecord"]
# bibliographic_data = process_author(abstracts_info)

In [None]:
def append_network(nodes, edges, seen_pairs, seen_nodes, author_groups):
    """Add the authors of one record, and the links between them, to the graph.

    All four accumulators are modified in place; nothing is returned.

    Args:
        nodes: List receiving one ``{'Id', 'Label'}`` dict per new author id.
        edges: List receiving one undirected, weight-1 edge dict per new pair
            of distinct author ids.
        seen_pairs: Set of sorted ``(id, id)`` tuples already turned into edges.
        seen_nodes: Set of author ids already turned into nodes.
        author_groups: Sequence of author dicts with ``'auid'`` and
            ``'indexed-name'`` keys.
    """
    for position, first in enumerate(author_groups):
        first_id = first['auid']

        # Register the author as a node the first time its id shows up.
        if first_id not in seen_nodes:
            seen_nodes.add(first_id)
            nodes.append({'Id': first_id, 'Label': first['indexed-name']})

        # Pair this author with every author listed after it.
        for second in author_groups[position + 1:]:
            second_id = second['auid']
            if first_id == second_id:
                continue  # same person listed twice: no self-loop

            # Sorting makes (a, b) and (b, a) the same undirected edge.
            pair = tuple(sorted([first_id, second_id]))
            if pair in seen_pairs:
                continue

            seen_pairs.add(pair)
            edges.append({
                'Source': pair[0],
                'Target': pair[1],
                'Type': 'Undirected',
                'Weight': 1,
            })

In [None]:
import chardet

# Accumulators for the co-authorship graph. They are passed to
# process_directory below and filled in place; re-running this cell resets them.
nodes = []          # node rows ({'Id', 'Label'} dicts) in first-seen order
edges = []          # edge rows ({'Source', 'Target', 'Type', 'Weight'} dicts)
seen_nodes = set()  # author ids that already have a node row
seen_pairs = set()  # sorted (id, id) tuples that already have an edge row

def _load_bibrecord(file_path):
    """Return the ``bibrecord`` mapping stored in ``file_path``, or ``None`` if unusable.

    The file is read once as bytes, its encoding is guessed with chardet, and
    the decoded text is parsed as JSON. Files that cannot be decoded, are not
    valid JSON, or lack the expected structure are reported and skipped.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    # chardet reports ``None`` when it cannot guess (e.g. an empty file).
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

    try:
        json_data = json.loads(raw_data.decode(encoding))
        return json_data.get("abstracts-retrieval-response", {})["item"]["bibrecord"]
    except (ValueError, LookupError, TypeError, AttributeError) as exc:
        # ValueError covers JSONDecodeError and UnicodeDecodeError; LookupError
        # covers missing keys and unknown codec names; TypeError/AttributeError
        # cover JSON whose top-level shape is not the expected nested mapping.
        print(f'Skipped {file_path}: {exc!r}')
        return None


def process_directory(base_path, nodes, edges, seen_pairs, seen_nodes, years=range(2018, 2024)):
    """Build the co-authorship graph from the extracted files and export it.

    For every year, each regular file in ``{base_path}/{year}/{year} copy`` is
    parsed and its authors are added to the graph accumulators. One unusable
    file no longer aborts the run: it is reported and skipped. Afterwards the
    accumulated rows are written to ``nodes.csv`` and ``edges.csv`` in the
    current working directory.

    Args:
        base_path: Directory containing the per-year folders.
        nodes: List of node rows, extended in place.
        edges: List of edge rows, extended in place.
        seen_pairs: Set of author-id pairs already present in ``edges``.
        seen_nodes: Set of author ids already present in ``nodes``.
        years: Iterable of years to scan; defaults to 2018 through 2023.
    """
    skipped = []
    for year in years:
        year_path = os.path.join(base_path, str(year), f'{year} copy')
        if not os.path.isdir(year_path):
            continue

        # os.listdir order is arbitrary; sorting makes the CSV row order reproducible.
        for file_name in sorted(os.listdir(year_path)):
            file_path = os.path.join(year_path, file_name)
            if not os.path.isfile(file_path):
                continue  # nested directories cannot be opened as records

            abstracts_info = _load_bibrecord(file_path)
            if abstracts_info is None:
                skipped.append(file_path)
                continue

            bibliographic_data = process_author(abstracts_info)
            append_network(nodes, edges, seen_pairs, seen_nodes, bibliographic_data['author_groups'])

            print(f'Processed {file_name}.')

    if skipped:
        print(f'Skipped {len(skipped)} unusable file(s).')

    # After processing all files
    nodes_df = pd.DataFrame(nodes)
    edges_df = pd.DataFrame(edges)

    nodes_df.to_csv('nodes.csv', index=False)
    edges_df.to_csv('edges.csv', index=False)

# Scan the extracted year folders under the current directory, fill the graph
# accumulators defined above, and write nodes.csv / edges.csv.
process_directory('./', nodes, edges, seen_pairs, seen_nodes)


Processed 201800253.
Processed 201800190.
Processed 201800165.
Processed 201800060.
Processed 201800134.
Processed 201800331.
Processed 201800175.
Processed 201800202.
Processed 201800105.
Processed 201800018.
Processed 201800256.
Processed 201800075.
Processed 201800121.
Processed 201800137.
Processed 201800284.
Processed 201800155.
Processed 201800036.
Processed 201800044.
Processed 201800309.
Processed 201800275.
Processed 201800236.
Processed 201800307.
Processed 201800278.
Processed 201800102.
Processed 201800193.
Processed 201800151.
Processed 201800244.
Processed 201800010.
Processed 201800192.
Processed 201800239.
Processed 201800322.
Processed 201800032.
Processed 201800312.
Processed 201800045.
Processed 201800000.
Processed 201800274.
Processed 201800016.
Processed 201800116.
Processed 201800185.
Processed 201800261.
Processed 201800107.
Processed 201800220.
Processed 201800084.
Processed 201800304.
Processed 201800308.
Processed 201800288.
Processed 201800210.
Processed 201

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Export whatever the accumulators currently hold as two CSV files
# (nodes.csv and edges.csv) in the working directory, without the index column.
nodes_df, edges_df = (pd.DataFrame(records) for records in (nodes, edges))

nodes_df.to_csv(path_or_buf='nodes.csv', index=False)
edges_df.to_csv(path_or_buf='edges.csv', index=False)