In [51]:
import pandas as pd
import os
import pathlib as Path
import matplotlib.pyplot as plt
import glob
import ast
import re

In [11]:

# Folder with the per-source CSV exports, relative to the notebooks directory
path = os.path.join('..', 'results')

# glob returns matches in arbitrary order; sort so the row order of the
# combined frame is the same on every run and on every machine
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Fail early with a clear message instead of pd.concat's
# "No objects to concatenate" when the folder is missing or empty
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {os.path.abspath(path)!r}")

# One DataFrame per file, each tagged with the file it came from
dfs = []
for file in all_files:
    df_temp = pd.read_csv(file)
    df_temp['source_file'] = os.path.basename(file)
    dfs.append(df_temp)

# Stack everything into one frame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Check the result
print(f"Total number of files processed: {len(all_files)}")
print(f"Shape of combined DataFrame: {combined_df.shape}")
print("\nColumns in combined DataFrame:")
print(combined_df.columns.tolist())

Total number of files processed: 49
Shape of combined DataFrame: (397495, 17)

Columns in combined DataFrame:
['person_id', 'name', 'gender', 'religion_ethnicity', 'social_status_job', 'role_in_case', 'titles', 'date', 'calendar', 'place_name', 'place_type', 'legal_case_type', 'case_result', 'case_result_type', 'row_index', 'case_unique_id', 'source_file']


In [23]:
# Distinct source file names as a plain Python list; the cell displays how many there are
result = combined_df['source_file'].unique().tolist()
len(result)

49

In [24]:
# Row count per gender label (value_counts leaves out missing values by default)
combined_df['gender'].value_counts()

gender
Man      232512
Woman     22949
Name: count, dtype: int64

In [25]:
# Row count per legal case type, most frequent first
combined_df['legal_case_type'].value_counts()

legal_case_type
Property Disputes               6178
Debt/Loan Agreements            5448
Inheritance Cases               4421
Slavery and Manumission         2389
Criminal Cases                  2142
                                ... 
Lease Agreement                    1
Guardianship and supervision       1
Tebennî                            1
Gift                               1
Medical Liability                  1
Name: count, Length: 73, dtype: int64

In [25]:
# Row count per legal case type, most frequent first
# NOTE(review): this cell repeats the legal_case_type count from the cell directly above
combined_df['legal_case_type'].value_counts()

legal_case_type
Property Disputes               6178
Debt/Loan Agreements            5448
Inheritance Cases               4421
Slavery and Manumission         2389
Criminal Cases                  2142
                                ... 
Lease Agreement                    1
Guardianship and supervision       1
Tebennî                            1
Gift                               1
Medical Liability                  1
Name: count, Length: 73, dtype: int64

In [27]:
# Remove pandas' row-display limit so long Series/DataFrames print in full
# (a session-wide setting: it affects every later cell)
pd.set_option('display.max_rows', None)

In [50]:
# Frequency of each distinct social_status_job value (missing values excluded by default)
social_status_counts = combined_df['social_status_job'].value_counts()

In [30]:
# Preview the first five rows of the combined frame
combined_df.head()

Unnamed: 0,person_id,name,gender,religion_ethnicity,social_status_job,role_in_case,titles,date,calendar,place_name,place_type,legal_case_type,case_result,case_result_type,row_index,case_unique_id,source_file
0,beşiktaş mahkemesi_2_hüküm no: 1_1,Mehmed Bey b. Abdullah,Man,Muslim,,Borrower,['Bey'],,,,,,,,9516,beşiktaş mahkemesi_2_hüküm no: 1,Beşiktaş Mahkemesi_2.csv
1,beşiktaş mahkemesi_2_hüküm no: 1_2,Keyvan Kethüdâ b. Abdurrahman,Man,Muslim,,Buyer,['Kethüdâ'],,,,,,,,9516,beşiktaş mahkemesi_2_hüküm no: 1,Beşiktaş Mahkemesi_2.csv
2,beşiktaş mahkemesi_2_hüküm no: 1_3,Devlet Hatun,Woman,Muslim,,Neighbor,[],,,,,,,,9516,beşiktaş mahkemesi_2_hüküm no: 1,Beşiktaş Mahkemesi_2.csv
3,beşiktaş mahkemesi_2_hüküm no: 1_4,Muharrem Usta,Man,Muslim,Artisan/Craftsperson,Neighbor,['Usta'],,,,,,,,9516,beşiktaş mahkemesi_2_hüküm no: 1,Beşiktaş Mahkemesi_2.csv
4,beşiktaş mahkemesi_2_hüküm no: 1_5,Pîr b. Hürrem,Man,Muslim,,Neighbor,[],,,,,,,,9516,beşiktaş mahkemesi_2_hüküm no: 1,Beşiktaş Mahkemesi_2.csv


In [39]:
# Unique titles
# Each 'titles' cell holds the repr of a Python list (e.g. "['Bey']").
# Parse every *distinct* cell string once rather than once per row, and
# collect straight into a set so no all-rows intermediate list is built.
unique_titles = sorted({
    title
    for titles_str in combined_df['titles'].dropna().unique()
    for title in ast.literal_eval(titles_str)
})

In [42]:
# Number of distinct raw title strings before any cleaning
len(unique_titles)

5211

In [55]:
def normalize_characters(text):
    """Return ``text`` with accented vowels and dotted transliteration consonants folded to plain letters.

    Only the lowercase characters listed below are touched; every other
    character (including Turkish letters such as ş, ç, ğ, ö, ü, ı) is kept.
    """
    # Single-character substitutions applied in one pass over the string
    folding_table = str.maketrans({
        'â': 'a', 'ā': 'a', 'á': 'a', 'à': 'a',
        'î': 'i', 'ī': 'i', 'í': 'i', 'ì': 'i',
        'û': 'u', 'ū': 'u', 'ú': 'u', 'ù': 'u',
        'ô': 'o', 'ō': 'o', 'ó': 'o', 'ò': 'o',
        'ê': 'e', 'ē': 'e', 'é': 'e', 'è': 'e',
        'ḫ': 'h',
        'ḥ': 'h',
        'ṣ': 's',
        'ṭ': 't',
    })
    return text.translate(folding_table)

def clean_title_advanced(title):
    """Reduce a raw title string to a canonical lowercase form for de-duplication.

    Steps, in order: lowercase and trim; fold accented characters with
    ``normalize_characters``; drop one leading ``el-`` / ``el `` prefix;
    remove parenthesised text; remove digits; collapse runs of whitespace.

    Parameters
    ----------
    title : str
        Raw title as it appears in the data.

    Returns
    -------
    str
        The cleaned title (may be empty if nothing is left).
    """
    # Initial basic cleaning
    title = title.lower().strip()
    # str.lower() maps Turkish capital 'İ' (U+0130) to 'i' followed by a
    # combining dot above (U+0307), so "İmam" would never match "imam".
    # Drop that stray combining mark.
    title = title.replace('i\u0307', 'i')
    title = normalize_characters(title)

    # Remove 'el-' or 'el ' prefix which is common in Arabic names
    if title.startswith(('el-', 'el ')):
        title = title[3:]

    # Remove parentheses and their contents (non-greedy: each pair separately)
    title = re.sub(r'\(.*?\)', '', title)

    # Remove numbers
    title = re.sub(r'\d+', '', title)

    # Final cleanup of any extra spaces
    return ' '.join(title.split())

# Run every raw title through the cleaner, then keep the distinct results in sorted order
cleaned_titles_advanced = list(map(clean_title_advanced, unique_titles))
cleaned_unique_titles_advanced = sorted(set(cleaned_titles_advanced))

# Report how much the cleaning reduced the title vocabulary
print(f"Original number of titles: {len(unique_titles)}")
print(f"Number of titles after advanced cleaning: {len(cleaned_unique_titles_advanced)}")

Original number of titles: 5211
Number of titles after advanced cleaning: 3984


In [66]:
# Count distinct social_status_job values.
# NOTE: Series.unique() keeps NaN as one of the values, so the printed number
# includes the missing-value entry if the column has any missing values.
print(f"Number of unique social status values: {len(combined_df['social_status_job'].unique())}")

Number of unique social status values: 1078


In [74]:
# Frequencies of the non-missing social_status_job values, listed alphabetically
# so that spelling/case variants of the same label end up next to each other
print(combined_df['social_status_job'].value_counts(dropna=True).sort_index())

social_status_job
40th Yaya Bölüğü Yaya                              1
Abacı                                              1
Accountant                                        10
Accountant of Waqfs                                1
Acemi Oğlan                                        1
Acemi Oğlanı                                       4
Acemi oğlanı                                       3
Acemioğlan                                         2
Acemioğlanı                                        5
Agent                                              2
Akideci (Trader/Merchant)                          1
Ambassador                                         2
Amil                                               1
Anadolu Ağası                                      1
Apprentice                                         4
Arab Ağa                                           1
Arabacı                                            1
Architect                                        215
Armaşi                      

In [76]:
# Broad occupational / status categories, each with the raw labels assigned to it
categories = {
    'Sufi': ['Sufi', 'Sufi Leader', 'Derviş', 'Şeyh', 'Dede', 'Baba', 'Mevlevi', 'Bektaşi', 'Nakşibendi', 'Halveti'],

    'Religious': ['Scholar/Ulama', 'Imam', 'Müezzin', 'Religious Leader', 'Clergy', 'Priest', 'Bishop', 'Metropolitan', 'Hatib', 'Vaiz'],

    'Military': ['Janissary', 'Soldier', 'Military', 'Sipahi', 'Commander', 'Military Officer', 'Military Official', 'Cündî', 'Serdengeçdi'],

    # A comma was missing after 'Mütevellî', which silently fused it with
    # 'Treasurer' into the single string 'MütevellîTreasurer'
    'Administrative': ['Bureaucrat', 'Judge', 'Scribe', 'Tax Collector', 'Governor', 'Trustee', 'Kethüda', 'Steward', 'Kadı', 'Nâib', 'Mütevellî',
                      'Treasurer'],

    'Commercial': ['Trader/Merchant', 'Money Changer', 'Sarraf', 'Merchant', 'Bakkal', 'Attâr'],

    'Artisanal': ['Artisan/Craftsperson', 'Carpenter', 'Blacksmith', 'Baker', 'Butcher', 'Tailor', 'Debbağ', 'Sarrâc'],

    'Service': ['Servant', 'Porter', 'Guard', 'Doorman', 'Gatekeeper', 'Cook', 'Bevvâb', 'Hamal', 'Tabbâh'],

    'Maritime': ['Sailor', 'Ship Captain', 'Captain', 'Boatman', 'Reis', 'Kayıkçı'],

    'Agricultural': ['Farmer/Agriculturist', 'Shepherd', 'Gardener', 'Bostancı'],

    'Professional': ['Physician/Healer', 'Architect', 'Engineer', 'Teacher', 'Translator', 'Muallim'],

    'Social Status': ['Slave/Enslaved Person', 'Freed Slave', 'Sultan', 'Royalty', 'Minor', 'Orphan', 'Müdebber']
}

In [77]:
# Add a lowercased copy of social_status_job as a new column so that labels
# differing only in letter case collapse to one value (missing values stay missing).
# Only the case is changed here; surrounding whitespace is not stripped.
combined_df['cleaned_social_status'] = combined_df['social_status_job'].str.lower()

In [79]:
# Lowercased raw social-status / job label -> broad category.
# Built once at definition time instead of on every call of
# standardize_social_status (the function is meant to be applied row by row).
# NOTE(review): the 'el-' variants 'el-bevvâb', 'el-câbî' and 'el-cündî' are
# labelled 'artisanal' while 'bevvâb' / 'cündî' are 'military' and 'câbî' is
# 'administrative' — kept as found, please confirm the intended category.
# We can add more cases as we find them.
_SOCIAL_STATUS_MAP = {
    'acemi oğlan': 'military',
    'acemi oğlanı': 'military',
    'acemioğlan': 'military',
    'acemioğlanı': 'military',
    '40th yaya bölüğü yaya': 'military',
    'abacı': 'artisanal',
    'accountant': 'administrative',
    'accountant of waqfs': 'administrative',
    'agent': 'administrative',
    'akideci (trader/merchant)': 'artisanal',
    'ambassador': 'administrative',
    'amil': 'religious',
    'anadolu ağası': 'military',
    'apprentice': 'artisanal',
    'arab ağa': 'military',
    'arabacı': 'artisanal',
    'architect': 'professional',
    'armaşi': 'administrative',
    'arpa emîni': 'administrative',
    'arpacı': 'artisanal',
    'artisan/craftsperson': 'artisanal',
    'artisan/craftsperson (grocer)': 'artisanal',
    'artisan/craftsperson (perfumer)': 'artisanal',
    'ases': 'military',
    'asesbaşı': 'military',
    'asesbaşı (chief of night watchmen)': 'military',
    'asesler kethüdâsı': 'military',
    'askerî kassâm': 'military',
    'assistant': 'administrative',
    'attar': 'artisanal',
    'attâr': 'artisanal',
    'auctioneer': 'artisanal',
    'azeb': 'military',
    'ağa': 'military',
    # This entry had a key but no value, which made the whole dict literal a
    # SyntaxError. NOTE(review): 'military' chosen to match plain 'ağa' — confirm.
    'ağa of the saadetlu kaymakam pasha': 'military',
    'aşçı': 'service',
    'aşçı-i sultânî': 'service',
    'aşçıbaşı': 'service',
    'bahriye kâtibi': 'administrative',
    'bahçevân': 'service',
    'bakara-hân': 'religious',
    'baker': 'artisanal',
    'bakkal': 'artisanal',
    'bakkal (grocer)': 'artisanal',
    'bakkalân pazarbaşı': 'artisanal',
    'baltacı': 'artisanal',
    'baltacılar kethudası': 'artisanal',
    'ban': 'artisanal',
    'barber': 'artisanal',
    'barber and grocer': 'artisanal',
    'bathhouse attendant': 'artisanal',
    'bathhouse keeper': 'artisanal',
    'bayrakdâr': 'military',
    'bayraktar': 'military',
    'baytar': 'service',
    'baş defterdâr': 'administrative',
    'başbâkīkulu': 'administrative',
    'başkorucu': 'military',
    'baştezkireci': 'military',
    'başyedekçi': 'military',
    'bekçi': 'military',
    'bevvâb': 'military',
    'bevvâb (doorkeeper)': 'military',
    'bevvâb (gatekeeper)': 'military',
    'bevvâb-ı sultanî': 'military',
    'bevvâb-ı sultanî (gatekeeper)': 'military',
    'bevvâb-ı sultânî': 'military',
    'bevvâbü dârü’s-saâde': 'military',
    'bevvâbü’s-sultânî': 'military',
    'bey': 'administrative',
    'bey of antakya': 'administrative',
    'beylerbeyi': 'administrative',
    'beytülmal emîni': 'administrative',
    'beytülmal officer': 'administrative',
    'beytülmal-i hassa emîni': 'administrative',
    'beytülmâl': 'administrative',
    'beytülmâl emin': 'administrative',
    'beytülmâl emîni': 'administrative',
    'beytülmâl emîni (treasurer)': 'administrative',
    'beytülmâl emîni (treasury official)': 'administrative',
    'beytülmâl emîni (treasury trustee)': 'administrative',
    'beytülmâl mültezimi': 'administrative',
    'beytülmâl-i hâs emîni': 'administrative',
    'beytülmâl-i hâssı emîni': 'administrative',
    'beytülmâl-i mezbûrun nâzırı': 'administrative',
    'beytülmâl-i âmme emîni': 'administrative',
    'beytülmâl-i âmmın emîni': 'administrative',
    'beytü’l-mâl emîni': 'administrative',
    'beytü’l-mâl officer': 'administrative',
    'bezci': 'artisanal',
    'bezzâzistân kethüdâsı': 'artisanal',
    'bishop': 'religious',
    'blacksmith': 'artisanal',
    'boatman': 'artisanal',
    'boatmen guild leader': 'artisanal',
    'bookbinder': 'artisanal',
    'bostancı': 'military',
    'bostancı kethüdâsı': 'military',
    'bostancıbaşı': 'military',
    'bostancılar kethüdâsı başhasekisi': 'military',
    'bostancılar odabaşısı': 'military',
    'bostânî': 'military',
    'bostânî (gardener)': 'military',
    'bozacı': 'artisanal',
    'bureaucrat': 'administrative',
    'bureaucrat (former judge)': 'administrative',
    'bursa kadısı': 'administrative',
    'butcher': 'artisanal',
    'butcher guild steward': 'artisanal',
    'button maker': 'artisanal',
    'buzcu': 'artisanal',
    'bâb-ı sa‘âdet bevvâb': 'military',
    'bölükbaşı': 'military',
    'börekçi': 'artisanal',
    'cabi': 'administrative',
    'calligrapher': 'artisanal',
    'camel driver': 'artisanal',
    'candle maker guild steward': 'artisanal',
    'captain': 'military',
    'caretaker': 'artisanal',
    'carpenter': 'artisanal',
    'carriage driver': 'artisanal',
    'cebeci': 'military',
    'cebecibaşı': 'military',
    'celeb tâ’ifesinden': 'artisanal',
    'cerrâh (surgeon)': 'artisanal',
    'chef': 'artisanal',
    'chief butler': 'artisanal',
    'chief clerk': 'administrative',
    'chief of bailiffs': 'administrative',
    'chief usher': 'administrative',
    'city officer': 'administrative',
    'cizye officer': 'administrative',
    'clergy': 'religious',
    'clergyman': 'religious',
    'cleric': 'administrative',
    'clockmaker': 'artisanal',
    'collector': 'administrative',
    'commander': 'military',
    'community leader': 'administrative',
    'convert': 'religious',
    'cook': 'artisanal',
    'cook at the new imperial palace': 'artisanal',
    'cotton carder': 'artisanal',
    'court clerk': 'administrative',
    'court messenger': 'administrative',
    'court officer': 'administrative',
    'court official': 'administrative',
    'court scribe': 'administrative',
    'court servant': 'administrative',
    'court summoner': 'administrative',
    'court trustee': 'administrative',
    'crafts guild member': 'artisanal',
    'crafts guild member (carpenter)': 'artisanal',
    'craftsman': 'artisanal',
    'craftsperson': 'artisanal',
    'crier': 'artisanal',
    'current sâhib-i ayâr': 'administrative',
    'customs officer': 'administrative',
    'customs official': 'administrative',
    'câbi': 'administrative',
    'câbi (tax collector)': 'administrative',
    'câbi and kāimmakām-ı mütevellî': 'administrative',
    'câbî': 'administrative',
    'câbî (collector)': 'administrative',
    'câbî and kāimmakām-ı mütevellî': 'administrative',
    'câbî-i evkāf': 'administrative',
    'câbî-i kârbânsaray': 'administrative',
    'câbî-i vakf': 'administrative',
    'câbî-i âsiyâb': 'administrative',
    'cündi': 'military',
    'cündî': 'military',
    'cündî (military personnel)': 'military',
    'cündî (soldier)': 'military',
    'cüzhân': 'artisanal',
    'danişmend': 'military',
    'davud paşa mütevellîsi': 'administrative',
    'debbağ': 'artisanal',
    'debbağ (tanner)': 'artisanal',
    'defterdar': 'administrative',
    'defterdâr': 'administrative',
    'dellâl': 'administrative',
    'dellâl (auctioneer)': 'administrative',
    'demircibaşı': 'artisanal',
    'deputy': 'religious',
    'deputy imam': 'religious',
    'deputy imam and teacher': 'religious',
    'deputy judge': 'administrative',
    'deputy trustee': 'administrative',
    'dergâh-ı âlî solak': 'sufi',
    'dergâh-ı âlî çorbacı': 'artisanal',
    'derviş': 'sufi',
    'devir-hân': 'sufi',
    'diplomat/envoy': 'administrative',
    'divan kâtibi': 'administrative',
    'doorkeeper': 'artisanal',
    'doorman': 'artisanal',
    'driver': 'artisanal',
    'dârü’s-sa‘âde ağa': 'administrative',
    'dârü’ş-şifâ ağa': 'administrative',
    'dîvân efendisi': 'administrative',
    'dîvân-ı âlî kâtibi': 'administrative',
    'düğmeci': 'artisanal',
    'eczâ-hân': 'artisanal',
    'eczâ-hân (pharmacist)': 'artisanal',
    'eflak voyvoda': 'administrative',
    'ehl-i hibre': 'artisanal',
    'ekmekci': 'artisanal',
    'ekmekçiler kethüdâsı': 'artisanal',
    'el-bevvâb': 'artisanal',
    'el-câbî': 'artisanal',
    'el-cündî': 'artisanal',
    'el-kâtib': 'administrative',
    'el-muhzır': 'administrative',
    'el-mübâşir': 'administrative',
    'el-mülâzım': 'administrative',
    'el-münâdî': 'administrative',
    'elder': 'administrative',
    'emekdar': 'artisanal',
    'emin': 'administrative',
    'emin-i beytü’l-mâl': 'administrative',
    'emir-i ahur': 'military',
    'emir-i alay': 'military',
    'emânet-i beytü’l-mâl zâbiti': 'administrative',
    'emîn': 'administrative',
    'emîn-i beytülmâl': 'administrative',
    'emîn-i beytülmâl-i hâs': 'administrative',
    'emîn-i darbhâne': 'administrative',
    'emîn-i defterî': 'administrative',
    'emîn-i matbah-ı mahmud paşa': 'administrative',
    'emîni and mültezimi': 'administrative',
    'engineer': 'artisanal',
    'enslaved person': 'slavery',
    'enslaved person (formerly)': 'slavery',
    'erbâb-ı timar': 'administrative',
    'erbâb-ı tımar': 'administrative',
    'es-silâhî': 'military',
    'esirci (slave trader)': 'slavery',
    'esîrci': 'slavery',
    'executor': 'administrative',
    'executor of will': 'administrative',
    'expert witness': 'administrative',
    'ez-zevvâk': 'artisanal',
    'fahrü’l-a‘yân': 'administrative',
    'fahrü’l-cuyûş': 'military',
    'fahrü’l-cüyûş': 'military',
    'fahrü’l-eimme': 'administrative',
    'fahrü’l-kuzât': 'administrative',
    'fahrü’l-küttâb': 'administrative',
    'fahrü’l-muhadderât': 'administrative',
    'fahrü’l-müderrisîn': 'religious',
    'fahrü’l-ümenâ': 'administrative',
    'fahrü’l-ümerâ': 'administrative',
    'fahrü’ş-şüyûh': 'religious',
    'farm manager': 'farming',
    'farmer': 'farming',
    'farmer/agriculturist': 'farming',
    'ferrâş': 'artisanal',
    'ferrâş (caretaker)': 'artisanal',
    'ferrâş (caretaker/sweeper)': 'artisanal',
    # 'İ'.lower() yields 'i' + U+0307 (combining dot above); the escape makes
    # that otherwise invisible code point explicit in the key
    'fethiye i\u0307mamı': 'religious',
    'fisherman': 'artisanal',
    'flower seller': 'artisanal',
    'foot soldier': 'military',
    'former anatolian kadıaskeri': 'administrative',
    'former bey of peçuy': 'administrative',
    'former beytü’l-mâl emîni': 'administrative',
    'former bosnian judge': 'administrative',
    'former bostancıbaşı': 'military',
    'former collector': 'administrative',
    'former customs officer': 'administrative',
    'former fish commissioner': 'administrative',
    'former grocer': 'artisanal',
    'former imam': 'religious',
    'former inspector': 'administrative',
    'former janissary': 'military',
    'former janissary kethüda': 'military',
    'former judge': 'administrative',
    'former judge of aleppo': 'administrative',
    'former judge of baghdad': 'administrative',
    'former judge of damascus': 'administrative',
    'former judge of ebâ eyyûb-i ensârî': 'administrative',
    'former judge of erzurum': 'administrative',
    'former judge of kuds-i şerîf': 'administrative',
    'former judge of mahmiye-i galata': 'administrative',
    'former judge of manisa': 'administrative',
    'former judge of medina': 'administrative',
    'former judge of mekke-i mükerreme': 'administrative',
    'former judge of tavas': 'administrative',
    'former kadı': 'administrative',
    'former kadı of yenişehir': 'administrative',
    'former maraş kadı': 'administrative',
    'former military judge': 'administrative',
    'former mütesellim': 'administrative',
    'former qadi of jerusalem': 'administrative',
    'former rumeli kazasker': 'administrative',
    'former ship captain': 'military',
    'former slave': 'slavery',
    'former sofya mütesellim': 'administrative',
    'former sürekçi, current meyhâneci': 'artisanal',
    'former tokat judge': 'administrative',
    'former trustee': 'administrative',
    'former turnacıbaşı': 'artisanal',
    'former voivode of boğdan': 'administrative',
    'former voivode of moldavia': 'administrative',
    'former waqf trustee': 'administrative',
    'formerly enslaved person': 'slavery',
    'freed person': 'slavery',
    'freed slave': 'slavery',
    'freedman': 'slavery',
    'freedwoman': 'slavery',
    'furrier': 'artisanal',
}


def standardize_social_status(value):
    """Map a raw social-status / job label to its broad category.

    Parameters
    ----------
    value : str or missing
        Raw label. Missing values (anything ``pd.isna`` accepts as missing)
        are returned unchanged.

    Returns
    -------
    str or missing
        The category from ``_SOCIAL_STATUS_MAP`` when the lowercased, stripped
        label is a known key; otherwise the lowercased, stripped label itself.
    """
    if pd.isna(value):
        return value

    # Keys in the map are lowercase with no surrounding whitespace
    value = value.lower().strip()

    # Unknown labels fall through unchanged (in their normalised form)
    return _SOCIAL_STATUS_MAP.get(value, value)

In [78]:
# Alphabetical frequency table of the lowercased labels, so near-duplicate
# spellings that should share one category end up next to each other
duplicates_check = combined_df['cleaned_social_status'].value_counts().sort_index()

print("Sample of values that need to be standardized:")
print(duplicates_check)

Sample of values that need to be standardized:
cleaned_social_status
40th yaya bölüğü yaya                              1
abacı                                              1
accountant                                        10
accountant of waqfs                                1
acemi oğlan                                        1
acemi oğlanı                                       7
acemioğlan                                         2
acemioğlanı                                        5
agent                                              2
akideci (trader/merchant)                          1
ambassador                                         2
amil                                               1
anadolu ağası                                      1
apprentice                                         4
arab ağa                                           1
arabacı                                            1
architect                                        215
armaşi                        