In [1]:
import pandas as pd, numpy as np, os, sys
import matplotlib.pyplot as plt
import seaborn as sns
from funcs import *
from extract_const import DECISION, AFFECTATION, GRADE, grade_norme_dico
from mpf_funcs import check_labat_ids, extract_year, update_grade
from collections import Counter

In [2]:
# CSV des registres matricules de la Seine
df = pd.read_csv('data/primary/big_bertha.csv', sep=',')

# Information geographique
df_other = pd.read_csv('data/primary/edited_geotagged_bertha.csv', sep=',') #On recupure l'information geocodée
df = pd.merge(df, df_other[['pays']], left_index=True, right_index=True)
df = df.apply(update_vienne, axis=1)
df = df.apply(update_colony, axis=1)
df['etranger'] = df['pays'].apply(categorize_country)

# Information anthropologique
df = extract_height(df)
df = extract_prenom(df)
df['age'] = df['Date de naissance'].apply(lambda x: extract_age(x)if isinstance(x, str) else x)
df['année_classe'] = df['num_mat'].apply(lambda x: extract_classe(x) if isinstance(x, str) else x)
df = df.apply(fill_classe, axis=1)
df['année_classe'] = pd.to_numeric(df['année_classe'], errors='coerce')

# Information sociale
df['niveau_dinst'] = df['instruction'].apply(lambda x: extract_inst(x) if isinstance(x, str) else x)
df['inst_mili'] = df['instruction'].apply(lambda x: extract_inst_militaire(x) if isinstance(x, str) else x)
df['junior'] = df.apply(extract_junior, axis=1)
df['famille'] = df.apply(extract_famille, axis=1)
df['permis'] = df.apply(extract_permis, axis=1)
df['criminel'] = df.apply(check_criminel, axis=1)
df['etudiant'] = df.apply(check_etudiant, axis=1)

# Information militaire
df['classement'] = df['decision'].apply(lambda x: extract_decision(x, DECISION) if isinstance(x, str) else x)
df['arme'] = df['affectation'].apply(lambda x: extract_arme(x.lower(), AFFECTATION) if isinstance(x, str) else x)
df['certificat'] = df['details'].apply(lambda x: extract_certificat(x.lower()) if isinstance(x, str) else x)
df['grade'] = df['details'].apply(lambda x: extract_grade(x.lower(), GRADE) if isinstance (x, str) else x)
df['grade'] = df['grade'].map(grade_norme_dico) #We do this depending on whether we want to visualise the ranks
df['promotion'] = df['details'].apply(lambda x: extract_promotion(x.lower(), GRADE) if isinstance (x, str) else x)
df['promotion_list'] = df['details'].apply(lambda x: extract_promotion_list(x.lower(), GRADE) if isinstance (x, str) else x)
df['promotion_start'] = df['promotion'].apply(lambda x: int(x.split('-')[0]) if isinstance(x, str) else x)
df['promotion_end'] = df['promotion'].apply(lambda x: int(x.split('-')[1]) if isinstance(x, str) and '-' in x else x)

# Information disciplinaire
df['insoumis'] = df.apply(check_insoumis, axis=1)
df['conseil'] = df.apply(check_conseil, axis=1)

# Information sur la guerre
df['prisonnier'] = df.apply(check_prisonnier, axis=1)
df['mitrailleur'] = df.apply(check_mitrailleur, axis=1)
df['mort'] = df['details'].apply(lambda x: extract_mort(x.lower()) if isinstance(x, str) else x)
df['blesse'] = df.apply(check_blessure, axis=1)
df = find_blessure(df)

# Information sur la post-guerre
df['pension'] = df['details'].apply(lambda x: extract_pension(x) if isinstance(x, str) else x)
df['pension'] = df['pension'].apply(lambda x: clean_pension(x))
df['reforme'] = df['details'].apply(lambda x: find_reforme(x) if isinstance(x, str) else x)
df['citation'] = df.apply(check_citation, axis=1)
df['legion dhonneur'] = df.apply(check_legion_dhonneur, axis=1)
df['medailles'] = df.apply(check_medals, axis=1)

# On rajoute les informations de la base MPF
merged_df = pd.read_csv('data/primary/merged_mpf.csv', sep=',')
merged_df = merged_df[['labat_id', 'id_mention_intitule', 'classe', 'recrutement_matricule', 'id_recrutement_bureau_intitule',
                  'id_grade_intitule', 'id_unite_intitule', 'deces_jour_mois_annee', 'id_deces_lieu_intitule',
                  'id_deces_departement_intitule', 'id_deces_pays_intitule']]
merged_df.columns = ['mpf_' + col if col != 'labat_id' else col for col in merged_df.columns]
check_labat_ids(merged_df, df)
merged_df['labat_id'] = merged_df['labat_id'].astype(df['labat_id'].dtype)
df = df.join(merged_df.set_index('labat_id'), on='labat_id', how='left')

# On nettoie les données de la base MPF
df['mpf_classe_cleaned'] = df['mpf_classe'].apply(lambda x: extract_year(x) if isinstance(x, str) else x)
df['mpf_classe_cleaned'] = pd.to_numeric(df['mpf_classe_cleaned'], errors='coerce')
df['année_classe'] = df.apply(lambda row: row['mpf_classe_cleaned'] if pd.notna(row['mpf_classe_cleaned']) else row['année_classe'], axis=1)

# On met a jour les grades avec la base MPF
df['mpf_id_grade_intitule'] = df['mpf_id_grade_intitule'].apply(lambda x: 'soldat de 2e classe' if x == 'soldat' else x)
df['mpf_grade_normalized'] = df['mpf_id_grade_intitule'].apply(lambda x: extract_grade(x.lower(), GRADE) if isinstance (x, str) else x)
df['mpf_grade_normalized'] = df['mpf_grade_normalized'].map(grade_norme_dico)
df = update_grade(df)

# Separation des etrangers
foreign = df[(df['etranger'] == "Etranger") | (df['famille'] == "etranger")]
foreign.dropna(subset=['Lieu de naissance'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  foreign.dropna(subset=['Lieu de naissance'], inplace=True)


In [3]:
df.to_csv("data/primary/final_df.csv", sep=',', index=False)