In [None]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
from PyPDF2 import PdfReader
from google.colab import files

def extract_metadata(pdf_path):
    """Read title, author and creation year from a PDF's document information.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict with the keys 'title', 'author' and 'year'. An entry is None when
        the PDF does not provide it. 'year' is an int when '/CreationDate'
        starts with a four-digit year, with or without the "D:" prefix.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfReader(f)
        # metadata can be None when the PDF has no document-information
        # dictionary; fall back to an empty mapping so the lookups still work.
        info = pdf.metadata or {}
        title = info.get('/Title', None)
        author = info.get('/Author', None)
        created = info.get('/CreationDate', None)

    year = None
    if created:
        # PDF dates are normally written "D:YYYYMMDD...", but the "D:" prefix
        # may be absent, so strip it only when it is actually there.
        digits = created[2:6] if created.startswith('D:') else created[:4]
        # Leave the year as None instead of raising on free-form date text.
        if len(digits) == 4 and digits.isdecimal():
            year = int(digits)

    return {'title': title, 'author': author, 'year': year}

# Prompt for PDF uploads through Colab, then print each file's metadata dict.
uploaded = files.upload()

for filename in uploaded:
    metadata = extract_metadata(filename)
    print(metadata)


Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1).pdf
{'title': 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'author': 'Lise Jaillant ', 'year': 2022}


In [None]:
from PyPDF2 import PdfReader
from google.colab import files

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    The per-page strings are joined without any separator.
    """
    with open(pdf_path, 'rb') as pdf_file:
        document = PdfReader(pdf_file)
        # Join the per-page text while the file handle is still open.
        return ''.join(page.extract_text() for page in document.pages)

# Prompt for PDF uploads through Colab, then print the full text of each file.
uploaded = files.upload()

for filename in uploaded:
    pdf_text = extract_text_from_pdf(filename)
    print(pdf_text)


Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (1).pdf
Vol.:(0123456789)Archival Science (2022) 22:285–291
https://doi.org/10.1007/s10502-022-09396-1
1 3
EDITORIAL
Introduction: challenges and prospects of born‑digital 
and digitized archives in the digital humanities
Lise Jaillant1  · Katie Aske2 · Eirini Goudarouli3 · Natasha Kitcher2
Published online: 26 May 2022 
© The Author(s), under exclusive licence to Springer Nature B.V. 2022, corrected publication 2022
The scale and complexity of digital archives, both born-digital and digitized, are 
posing enormous challenges for both researchers and memory institutions. In the 
world of archives, these new types of records are fundamentally changing the land-
scape as well as the role of archivists and archival institutions. The emergence of 
new generation technologies also brings a variety of complexities and challenges 
to archival frameworks, requiring new capabilities and approaches on how best to 
capture, preserve, con

In [None]:
from transformers import pipeline
from PyPDF2 import PdfReader
from google.colab import files

def extract_text_from_pdf(pdf_path):
    """Collect the extracted text of each PDF page and return it as one string.

    Pages are concatenated in order with no separator between them.
    """
    page_texts = []
    with open(pdf_path, 'rb') as handle:
        for page in PdfReader(handle).pages:
            page_texts.append(page.extract_text())
    return ''.join(page_texts)

def extract_information(text, question="What is the study about?",
                        model="distilbert/distilbert-base-cased-distilled-squad",
                        revision="626af31"):
    """Answer a question about `text` with an extractive question-answering model.

    Args:
        text: Context passage in which to look for the answer.
        question: Question to ask of the context.
        model: Hugging Face model id to load. The default is the checkpoint
            transformers falls back to when no model is given; naming it here
            keeps results stable if that fallback ever changes.
        revision: Model revision to load.

    Returns:
        The 'answer' string from the pipeline result, or '' when `text` is
        empty or whitespace-only (e.g. a PDF with no extractable text).
    """
    # There is nothing to answer from; skip loading the model altogether.
    if not text or not text.strip():
        return ''
    nlp = pipeline("question-answering", model=model, revision=revision)
    result = nlp(question=question, context=text)
    return result['answer']

# Prompt for PDF uploads through Colab, then print the model's answer for each file.
uploaded = files.upload()

for filename in uploaded:
    pdf_text = extract_text_from_pdf(filename)
    study_description = extract_information(pdf_text)
    print(study_description)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (2).pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Archival Science


In [None]:
import re

def validate_year(year):
    """Return True when `year` is exactly four ASCII digits, otherwise False.

    Args:
        year: Candidate year. A str is checked as given; an int is checked
            through its decimal text. Any other type is rejected.

    Returns:
        bool: True for values such as '2023' or 2023, False otherwise.
    """
    if isinstance(year, int) and not isinstance(year, bool):
        year = str(year)
    if not isinstance(year, str):
        return False
    # fullmatch rejects a trailing newline, which '^...$' with match() would
    # let through, and re.ASCII stops \d from matching non-ASCII digits.
    return re.fullmatch(r'\d{4}', year, flags=re.ASCII) is not None

# Quick check of the validator with a well-formed four-digit year string.
year = '2023'
is_valid = validate_year(year)
print(is_valid)


True


In [None]:
import sqlite3
from sqlite3 import Error
import re
from PyPDF2 import PdfReader
from google.colab import files
from transformers import pipeline

def create_connection(db_file):
    """Open a SQLite connection to `db_file`.

    Returns:
        The sqlite3 connection, or None when sqlite3 raises an Error (the
        error is printed rather than propagated).
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        print(exc)
        return None

def create_table(conn, create_table_sql):
    """Execute `create_table_sql` on `conn`.

    A sqlite3 Error is printed instead of raised; nothing is returned.
    """
    try:
        cursor = conn.cursor()
        cursor.execute(create_table_sql)
    except Error as exc:
        print(exc)

def insert_article(conn, article):
    """Insert one (title, author, year) row into `articles`.

    The statement is executed but not committed here.

    Returns:
        The row id reported by the cursor for the inserted row.
    """
    insert_sql = ''' INSERT INTO articles(title,author,year)
              VALUES(?,?,?) '''
    return conn.cursor().execute(insert_sql, article).lastrowid

def extract_text_from_pdf(pdf_path):
    """Extract the text of all pages of the PDF at `pdf_path` as a single string.

    Page texts are concatenated in page order with nothing inserted between them.
    """
    with open(pdf_path, 'rb') as stream:
        pages = PdfReader(stream).pages
        chunks = [pg.extract_text() for pg in pages]
    return ''.join(chunks)

def extract_information(text, question="What is the study about?",
                        model="distilbert/distilbert-base-cased-distilled-squad",
                        revision="626af31"):
    """Ask an extractive question-answering model a question about `text`.

    Args:
        text: Context passage in which to look for the answer.
        question: Question to ask of the context.
        model: Hugging Face model id to load. The default is the checkpoint
            transformers uses when no model is supplied, named explicitly so
            the result does not depend on that library default.
        revision: Model revision to load.

    Returns:
        The 'answer' string from the pipeline result, or '' when `text` is
        empty or whitespace-only.
    """
    # With no context there is nothing to answer; avoid loading the model.
    if not text or not text.strip():
        return ''
    qa = pipeline("question-answering", model=model, revision=revision)
    return qa(question=question, context=text)['answer']

def extract_metadata(pdf_path):
    """Read title, author and creation year from a PDF's document information.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict with the keys 'title', 'author' and 'year'. An entry is None when
        the PDF does not provide it. 'year' is the first run of four digits
        found in '/CreationDate', as an int.
    """
    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        # metadata can be None when the PDF has no document-information
        # dictionary; fall back to an empty mapping so the lookups still work.
        info = reader.metadata or {}
        title = info.get('/Title', None)
        author = info.get('/Author', None)
        created = info.get('/CreationDate', None)

    year = None
    if created:
        found = re.search(r'\d{4}', created)
        # A date string without four consecutive digits yields no year
        # instead of failing on `.group()` of a missing match.
        if found:
            year = int(found.group())

    return {'title': title, 'author': author, 'year': year}

# Prompt for PDF uploads through Colab.
uploaded = files.upload()

database = 'articles.db'
conn = create_connection(database)
create_table_sql = """
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    author TEXT NOT NULL,
    year INTEGER
);
"""
if conn is None:
    # Without a connection there is nothing to insert into; stop here rather
    # than failing later on `conn.cursor()`.
    print("Error! cannot create the database connection.")
else:
    try:
        create_table(conn, create_table_sql)

        for filename in uploaded.keys():
            # NOTE: the extracted text is not stored by this cell; only the
            # metadata tuple below is written to the database.
            pdf_text = extract_text_from_pdf(filename)
            metadata = extract_metadata(filename)
            metadata['year'] = metadata['year'] if metadata['year'] else 0
            metadata_tuple = (metadata['title'], metadata['author'], metadata['year'])
            try:
                article_id = insert_article(conn, metadata_tuple)
            except sqlite3.IntegrityError as e:
                # title and author are NOT NULL, so a PDF lacking either one
                # cannot be stored; skip it and keep processing the rest.
                print(f"Skipped {filename}: {e}")
                continue
            print("Inserted article ID:", article_id)

        conn.commit()
    finally:
        # Release the database file even if a PDF fails to parse.
        conn.close()


Saving s10502-022-09396-1 (1).pdf to s10502-022-09396-1 (1) (3).pdf
Inserted article ID: 1


In [None]:
import sqlite3

# Open the SQLite file used by this notebook's `articles` table.
database = 'articles.db'
conn = sqlite3.connect(database)

def show_table(conn, table_name):
    """Print every row of `table_name`, one row tuple per line.

    `table_name` is interpolated into the SQL text as-is, so it must be a
    trusted identifier.
    """
    query = f"SELECT * FROM {table_name}"
    for record in conn.cursor().execute(query).fetchall():
        print(record)

# Close the connection even when the query fails (for example when the
# `articles` table has not been created yet).
try:
    show_table(conn, 'articles')
finally:
    conn.close()


(1, 'Sample Title', 'John Doe', 2023)
(2, 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'Lise Jaillant ', 2022)
(3, 'Introduction: challenges and prospects of born-digital and digitized archives in the digital humanities', 'Lise Jaillant ', 2022)


In [None]:
import sqlite3

# Function to create a database connection
def create_connection(db_file):
    """Connect to the SQLite database at `db_file`.

    Returns:
        The open connection, or None if sqlite3 raised an error (which is
        printed).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except sqlite3.Error as err:
        print(err)
    return conn

# Function to create a table in the database
def create_table(conn, create_table_sql):
    """Run the given CREATE TABLE statement on `conn`.

    sqlite3 errors are printed, not raised; the function returns None.
    """
    try:
        conn.cursor().execute(create_table_sql)
    except sqlite3.Error as err:
        print(err)

# Connect to the database
database = 'articles.db'
conn = create_connection(database)

# Define the SQL statement to create the articles table
create_table_sql = """
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    year INTEGER,
    journal TEXT,
    authors TEXT,
    abstract TEXT,
    introduction TEXT,
    methodology TEXT,
    results TEXT,
    conclusions TEXT,
    keywords TEXT
);
"""

# Create the articles table
if conn is not None:
    try:
        create_table(conn, create_table_sql)

        # CREATE TABLE IF NOT EXISTS does nothing when a table named `articles`
        # already exists, even with other columns, and create_table only
        # prints errors. Check the real columns before reporting success.
        expected_columns = [
            'id', 'title', 'year', 'journal', 'authors', 'abstract',
            'introduction', 'methodology', 'results', 'conclusions', 'keywords',
        ]
        # Each PRAGMA table_info row is (cid, name, type, notnull, dflt_value, pk).
        actual_columns = [row[1] for row in conn.execute("PRAGMA table_info(articles)")]
        if actual_columns == expected_columns:
            print("Table created successfully.")
        elif actual_columns:
            print("Warning! Table 'articles' already exists with a different schema:", actual_columns)
        else:
            print("Error! Table 'articles' was not created.")
    finally:
        # Close the database connection
        conn.close()
else:
    print("Error! Cannot create the database connection.")


Table created successfully.


In [None]:
import sqlite3

# Function to create a database connection
def create_connection(db_file):
    """Return a sqlite3 connection to `db_file`, or None on a sqlite3 error.

    The error, if any, is printed rather than raised.
    """
    try:
        connection = sqlite3.connect(db_file)
    except sqlite3.Error as err:
        print(err)
        return None
    else:
        return connection

# Function to create a table in the database
def create_table(conn, create_table_sql):
    """Execute a CREATE TABLE statement through a cursor on `conn`.

    Any sqlite3 error is printed and swallowed; there is no return value.
    """
    try:
        table_cursor = conn.cursor()
        table_cursor.execute(create_table_sql)
    except sqlite3.Error as problem:
        print(problem)

# Function to display the schema of the database table
def display_table_schema(conn, table_name):
    """Print a "Table Schema:" header followed by one PRAGMA table_info row per line.

    `table_name` is interpolated into the PRAGMA text as-is, so it must be a
    trusted identifier.
    """
    # Run the query before printing so a failure produces no partial output.
    columns = conn.cursor().execute(f"PRAGMA table_info({table_name})").fetchall()
    print("Table Schema:")
    for column in columns:
        print(column)

# Function to display the contents of the database table
def display_table_contents(conn, table_name):
    """Print a blank line, a "Table Contents:" header, then every row of the table.

    `table_name` is interpolated into the SQL text as-is, so it must be a
    trusted identifier.
    """
    # Run the query before printing so a failure produces no partial output.
    records = conn.cursor().execute(f"SELECT * FROM {table_name}").fetchall()
    print("\nTable Contents:")
    for record in records:
        print(record)

# Connect to the database
database = 'articles.db'
conn = create_connection(database)

# Define the SQL statement to create the articles table
create_table_sql = """
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    year INTEGER,
    journal TEXT,
    authors TEXT,
    abstract TEXT,
    introduction TEXT,
    methodology TEXT,
    results TEXT,
    conclusions TEXT,
    keywords TEXT
);
"""

# Create the articles table
if conn is not None:
    try:
        create_table(conn, create_table_sql)

        # CREATE TABLE IF NOT EXISTS does nothing when a table named `articles`
        # already exists, even with other columns, and create_table only
        # prints errors. Check the real columns before reporting success.
        expected_columns = [
            'id', 'title', 'year', 'journal', 'authors', 'abstract',
            'introduction', 'methodology', 'results', 'conclusions', 'keywords',
        ]
        # Each PRAGMA table_info row is (cid, name, type, notnull, dflt_value, pk).
        actual_columns = [row[1] for row in conn.execute("PRAGMA table_info(articles)")]
        if actual_columns == expected_columns:
            print("Table created successfully.")
        elif actual_columns:
            print("Warning! Table 'articles' already exists with a different schema.")
        else:
            print("Error! Table 'articles' was not created.")

        # Display the table schema
        display_table_schema(conn, 'articles')
    finally:
        # Close the database connection
        conn.close()
else:
    print("Error! Cannot create the database connection.")


Table created successfully.
Table Schema:
(0, 'id', 'INTEGER', 0, None, 1)
(1, 'title', 'TEXT', 1, None, 0)
(2, 'author', 'TEXT', 1, None, 0)
(3, 'year', 'INTEGER', 0, None, 0)
