In [1]:
import pandas as pd

In [3]:
!pip install lxml

Collecting lxml
  Downloading lxml-5.2.1-cp310-cp310-macosx_10_9_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 1.2 MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.2.1
You should consider upgrading via the '/Users/yonatanlou/.pyenv/versions/3.10.0/envs/QumranNLP/bin/python3.10 -m pip install --upgrade pip' command.[0m


In [4]:


# Load all tables from the Wikipedia page
wikipedia_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_the_Dead_Sea_Scrolls#Qumran_Cave_4")
# Drop the first and the last table returned by read_html; only the tables in
# between are labelled and aggregated below.
wikipedia_tables = wikipedia_tables[1:-1]
# Check how many tables were loaded
print(f"Total tables loaded: {len(wikipedia_tables)}")

# Table labels; the label at position i is applied to wikipedia_tables[i].
tables = "Cave 1,Cave 2,Cave 3,Cave 4Q1–4Q100,Cave 4Q101–4Q200,Cave 4Q201–4Q300,4Q301-,Cave 5,Cave 6,Cave 7,Cave 8,Cave 9,Cave 10,Cave 11,Wadi Murabba'at Cave 1,Nahal Hever Cave 8,Masada".split(",")

# Column names every aggregated table is renamed to (matched by position).
standard_columns = ['Fragment or scroll identifier', 'Fragment or scroll name', 'Alternative identifier', 'English Bible Association', 'Language', 'Date/script', 'Description', 'Reference']

# Labels are looked up by position, so a different number of tables than labels
# means rows could end up under the wrong label. Report it instead of staying silent.
if len(wikipedia_tables) != len(tables):
    print(f"Warning: loaded {len(wikipedia_tables)} tables but have {len(tables)} labels; table_name values may be misaligned")

# Collect the usable tables first and concatenate once at the end. This avoids
# re-copying the growing result on every iteration and avoids concatenating with
# an empty placeholder frame.
matched_tables = []
for i, table in enumerate(wikipedia_tables):
    # Skip tables whose column count differs from the standard layout, but say so.
    if len(table.columns) != len(standard_columns):
        print(f"Skipping table {i}: found {len(table.columns)} columns, expected {len(standard_columns)}")
        continue
    # Rename the columns of the current table (in place) to the standard names
    table.columns = standard_columns
    # Record which table each row came from
    table['table_name'] = f"Table {tables[i]}"
    matched_tables.append(table)

if matched_tables:
    wikipedia_data = pd.concat(matched_tables, ignore_index=True)
else:
    # Nothing matched: keep an empty frame with the expected columns so the CSV
    # still gets its header row.
    wikipedia_data = pd.DataFrame(columns=standard_columns + ['table_name'])
wikipedia_data.to_csv("DSS_origin_of_each_book.csv", index=False)

Total tables loaded: 17


In [5]:
# Read the aggregated scroll table back from the CSV file in the working directory.
wikipedia_data = pd.read_csv("DSS_origin_of_each_book.csv")

In [6]:
# Display the scroll table that was reloaded from DSS_origin_of_each_book.csv.
wikipedia_data

Unnamed: 0,Fragment or scroll identifier,Fragment or scroll name,Alternative identifier,English Bible Association,Language,Date/script,Description,Reference,table_name
0,1QIsaa,Great Isaiah Scroll,,Isaiah 1:1–31; 2:1–22; 3:1–5:30; 6:1–13; 7:1–2...,Hebrew,356–103 BCE/150–100 BCE,Contains all 66 chapters with occasional lacun...,[9][10],Table Cave 1
1,1QIsab,Isaiah,cf. 1Q8,The Book of Isaiah,Hebrew,Hasmonean/Herodian,A second copy of portions of the Book of Isaiah,[11][12],Table Cave 1
2,1QS,"Serekh ha-Yahad or ""Community Rule""",,,Hebrew,,"cf. 4QSa-j = 4Q255–64, 5Q11",[13],Table Cave 1
3,1QpHab,Pesher on Habakkuk,,Habakkuk 1–2,Hebrew,Later half of the 1st century BC,Commentary on Habakkuk 1:2–17; 2:1–20,[14][15],Table Cave 1
4,1QM,Milhamah or War Scroll,,,Hebrew,,"cf. 4Q491, 4Q493; 11Q14?",,Table Cave 1
...,...,...,...,...,...,...,...,...,...
432,Masada Deuteronomy,Deuteronomy,MasDeut,Deuteronomy 33:17–24; 34:2–6,Hebrew,,,,Table Masada
433,Masada Psalmsa,Psalms,MasPsalmsa,"Psalms 81–85, ending with 85:5",Hebrew,,,,Table Masada
434,Masada Psalmsb,Psalms,MasPsalmsb,Psalms 150,Hebrew,,,,Table Masada
435,Masada Ezekiel,Ezekiel,MasEzek,"Ezekiel 35:11–15; 36:1–10, 13–14, 17–35; 37:1–...",Hebrew,,,,Table Masada


In [8]:
# Show only the rows whose scroll name is exactly the Community Rule title.
community_rule_mask = wikipedia_data["Fragment or scroll name"] == 'Serekh ha-Yahad or "Community Rule"'
wikipedia_data[community_rule_mask]

Unnamed: 0,Fragment or scroll identifier,Fragment or scroll name,Alternative identifier,English Bible Association,Language,Date/script,Description,Reference,table_name
2,1QS,"Serekh ha-Yahad or ""Community Rule""",,,Hebrew,,"cf. 4QSa-j = 4Q255–64, 5Q11",[13],Table Cave 1


In [16]:
import os

import yaml

# The project root can be overridden with the QUMRAN_NLP_ROOT environment variable
# so the notebook also runs outside the original author's machine; when the
# variable is unset the original absolute location is used.
qumran_nlp_root = os.environ.get("QUMRAN_NLP_ROOT", "/Users/yonatanlou/dev/QumranNLP")
composition_yaml_path = os.path.join(qumran_nlp_root, "data", "yamls", "all_texts_by_composition.yaml")

# Parse the YAML file with the safe loader; the explicit encoding keeps the read
# independent of the platform default. The displayed result is a dict mapping
# composition names to lists of text identifiers.
with open(composition_yaml_path, encoding="utf-8") as f:
    all_texts_by_composition = yaml.safe_load(f)

all_texts_by_composition

{'4QS': ['4Q255',
  '4Q256',
  '4Q257',
  '4Q258',
  '4Q259',
  '4Q260',
  '4Q261',
  '4Q262',
  '4Q263',
  '4Q264',
  '5Q11'],
 'CD': ['CD'],
 '4QD': ['4Q266',
  '4Q267',
  '4Q268',
  '4Q269',
  '4Q270',
  '4Q271',
  '4Q272',
  '4Q273',
  '6Q15'],
 'Hodayot': ['1QHa', '1Q35'],
 '4QH': ['4Q427', '4Q428', '4Q429', '4Q430', '4Q431', '4Q432'],
 '1QM': ['1QM'],
 '4QM': ['4Q491', '4Q492', '4Q493', '4Q494', '4Q495', '4Q496'],
 'Songs_of_Maskil': ['4Q510', '4Q511'],
 'Pesharim': ['1QpHab',
  '1Q14',
  '1Q15',
  '1Q16',
  '4Q161',
  '4Q162',
  '4Q163',
  '4Q164',
  '4Q165',
  '4Q166',
  '4Q167',
  '4Q168',
  '4Q169',
  '4Q170',
  '4Q171',
  '4Q173'],
 'Mysteries': ['1Q27', '4Q299', '4Q300', '4Q301'],
 'Catena_Florilegium': ['4Q174', '4Q177'],
 'Berakhot': ['4Q286', '4Q287', '4Q288', '4Q289'],
 'Instruction': ['4Q415',
  '4Q416',
  '4Q417',
  '4Q418',
  '4Q418a',
  '4Q418b',
  '4Q418c',
  '4Q423'],
 'Book_of_Jubilees': ['1Q17',
  '1Q18',
  '4Q216',
  '4Q217',
  '4Q218',
  '4Q219',
  '4Q220',
  

In [21]:
# Build one small frame per composition: a 'book' column holding its texts and a
# 'composition' column repeating the composition name.
per_composition_frames = [
    pd.DataFrame(books, columns=['book']).assign(composition=composition_name)
    for composition_name, books in all_texts_by_composition.items()
]

# Stack the per-composition frames into a single table and write it to disk.
df = pd.concat(per_composition_frames, ignore_index=True)
df.to_csv("composition_to_book.csv", index=False)


In [22]:
# Display the scroll table again; none of the visible cells since the CSV reload reassign it.
wikipedia_data

Unnamed: 0,Fragment or scroll identifier,Fragment or scroll name,Alternative identifier,English Bible Association,Language,Date/script,Description,Reference,table_name
0,1QIsaa,Great Isaiah Scroll,,Isaiah 1:1–31; 2:1–22; 3:1–5:30; 6:1–13; 7:1–2...,Hebrew,356–103 BCE/150–100 BCE,Contains all 66 chapters with occasional lacun...,[9][10],Table Cave 1
1,1QIsab,Isaiah,cf. 1Q8,The Book of Isaiah,Hebrew,Hasmonean/Herodian,A second copy of portions of the Book of Isaiah,[11][12],Table Cave 1
2,1QS,"Serekh ha-Yahad or ""Community Rule""",,,Hebrew,,"cf. 4QSa-j = 4Q255–64, 5Q11",[13],Table Cave 1
3,1QpHab,Pesher on Habakkuk,,Habakkuk 1–2,Hebrew,Later half of the 1st century BC,Commentary on Habakkuk 1:2–17; 2:1–20,[14][15],Table Cave 1
4,1QM,Milhamah or War Scroll,,,Hebrew,,"cf. 4Q491, 4Q493; 11Q14?",,Table Cave 1
...,...,...,...,...,...,...,...,...,...
432,Masada Deuteronomy,Deuteronomy,MasDeut,Deuteronomy 33:17–24; 34:2–6,Hebrew,,,,Table Masada
433,Masada Psalmsa,Psalms,MasPsalmsa,"Psalms 81–85, ending with 85:5",Hebrew,,,,Table Masada
434,Masada Psalmsb,Psalms,MasPsalmsb,Psalms 150,Hebrew,,,,Table Masada
435,Masada Ezekiel,Ezekiel,MasEzek,"Ezekiel 35:11–15; 36:1–10, 13–14, 17–35; 37:1–...",Hebrew,,,,Table Masada
