In [1]:
import pandas as pd 
import geopandas as gpd 
from shapely.geometry import Point
import numpy as np 
import jellyfish

In [2]:
# Read the 48-wilaya communes table from CSV into a DataFrame,
# then preview its first five rows.
communes48_csv_df = pd.read_csv(filepath_or_buffer='communes_48.csv')
communes48_csv_df.head(n=5)

Unnamed: 0,num,code,code_5,nom,nom_maj,code_wil
0,1,101,1001,Adrar,ADRAR,1
1,2,102,1002,Tamest,TAMEST,1
2,3,103,1003,Charouine,GHAROUINE,1
3,4,104,1004,Reggane,REGGANE,1
4,5,105,1005,Inzegmir,INZEGMIR,1


In [3]:
# Read the 58-wilaya communes table from CSV into a DataFrame,
# then preview its first five rows.
communes58_csv_df = pd.read_csv(filepath_or_buffer='communes_58.csv')
communes58_csv_df.head(n=5)

Unnamed: 0,code_wil,code_48,code_58,nom,nom_maj,nom_ar
0,1,101,101,Adrar,ADRAR,أدرار
1,1,115,102,Fenoghil,FENOGHIL,فنوغيل
2,1,102,103,Tamest,TAMEST,تاماست
3,1,104,104,Reggane,REGGANE,رقان
4,1,118,105,Sali,SALI,سالي


In [4]:
# Read the 48-wilaya communes GeoJSON (attributes + geometry) into a
# GeoDataFrame, then preview its first five features.
communes_json_gdf = gpd.read_file(filename='communes_48.geojson')
communes_json_gdf.head(n=5)

Unnamed: 0,code,code_5,nom,nom_maj,code_wil,geometry
0,101,1001,Adrar,ADRAR,1,"MULTIPOLYGON (((-0.22565 28.08656, -0.04751 28..."
1,102,1002,Tamest,TAMEST,1,"MULTIPOLYGON (((-2.98197 27.50450, -2.49629 27..."
2,103,1003,Charouine,GHAROUINE,1,"MULTIPOLYGON (((-0.16269 28.96347, -0.21556 28..."
3,104,1004,Reggane,REGGANE,1,"MULTIPOLYGON (((0.51798 26.92294, 0.90476 26.2..."
4,105,1005,Inzegmir,INZEGMIR,1,"MULTIPOLYGON (((0.47629 27.06526, 0.28392 27.0..."


In [5]:
# Left-join the 58-wilaya communes onto the 48-wilaya CSV table, matching
# code_48 (58 table) with code (48 table). Columns present in both tables
# receive the default _x (58) / _y (48) suffixes.
merged_df = pd.merge(
    communes58_csv_df,
    communes48_csv_df,
    how='left',
    left_on='code_48',
    right_on='code',
)
# Non-null count per column: a quick check that every row found a match.
merged_df.count()

code_wil_x    1541
code_48       1541
code_58       1541
nom_x         1541
nom_maj_x     1541
nom_ar         202
num           1541
code          1541
code_5        1541
nom_y         1541
nom_maj_y     1541
code_wil_y    1541
dtype: int64

In [6]:
# Display the merged table (pandas shows a truncated view of the rows).
merged_df

Unnamed: 0,code_wil_x,code_48,code_58,nom_x,nom_maj_x,nom_ar,num,code,code_5,nom_y,nom_maj_y,code_wil_y
0,1,101,101,Adrar,ADRAR,أدرار,1,101,1001,Adrar,ADRAR,1
1,1,115,102,Fenoghil,FENOGHIL,فنوغيل,15,115,1015,Fenoughil,FENOUGHIL,1
2,1,102,103,Tamest,TAMEST,تاماست,2,102,1002,Tamest,TAMEST,1
3,1,104,104,Reggane,REGGANE,رقان,4,104,1004,Reggane,REGGANE,1
4,1,118,105,Sali,SALI,سالي,18,118,1018,Sali,SALI,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1536,57,3924,5707,Tendla,TENDLA,تندلة,1301,3924,39024,Tendla,TENDLA,39
1537,57,3922,5708,M'Rara.,M'RARA.,مرارة,1299,3922,39022,Mrara,MRARA,39
1538,58,4702,5801,El Meniaâ,EL MENIAÂ,المنيعة,1492,4702,47002,El Meniaa,EL MENIAA,47
1539,58,4712,5802,Hassi Gara,HASSI GARA,حاسي القارة,1502,4712,47012,Hassi El Garaa,HASSI EL GARAA,47


In [7]:
# Compare the 58-table name (nom_x) with the 48-table name (nom_y) for each
# row, case-insensitively. Despite its name, the 'similarity' column holds the
# Levenshtein edit distance, so larger values mean the names differ MORE.
merged_df['similarity'] = [
    jellyfish.levenshtein_distance(str(name_58).lower(), str(name_48).lower())
    for name_58, name_48 in zip(merged_df['nom_x'], merged_df['nom_y'])
]
# Show only the rows whose names differ by more than two edits.
merged_df.loc[merged_df['similarity'] > 2]

Unnamed: 0,code_wil_x,code_48,code_58,nom_x,nom_maj_x,nom_ar,num,code,code_5,nom_y,nom_maj_y,code_wil_y,similarity
5,1,105,106,In Zghmir,IN ZGHMIR,إن زغمير,5,105,1005,Inzegmir,INZEGMIR,1,3
1443,47,4713,4710,Mansoura,MANSOURA,المنصورة,1503,4713,47013,El Mansoura,EL MANSOURA,47,3
1498,51,710,5105,Chaïba,CHAÏBA,الشعيبة,239,710,7010,Ech Chaiba,ECH CHAIBA,7,5
1511,53,1110,5302,Foggaret Ezzaouia,FOGGARET EZZAOUIA,فقارت الزاوية,363,1110,11010,Foggarat Ez Zouaia,FOUGGARAT EZ ZOUAIA,11,4
1529,56,3305,5602,Bordj El Haouasse,BORDJ EL HAOUASSE,برج الحواس,1162,3305,33005,Bordj El Haoues,BORDJ EL HAOUES,33,3
1530,57,3927,5701,El Megaier,EL MEGAIER,المغير,1304,3927,39027,El Mghair,EL MGHAIR,39,3
1531,57,3929,5702,Oum Touyour,OUM TOUYOUR,أم الطيور,1306,3929,39029,Oum Tiour,OUM TIOUR,39,3
1539,58,4712,5802,Hassi Gara,HASSI GARA,حاسي القارة,1502,4712,47012,Hassi El Garaa,HASSI EL GARAA,47,4


In [8]:
# Left-join the 58-wilaya communes (CSV) onto the 48-wilaya GeoJSON features,
# matching code_48 (58 table) with code (GeoJSON) to attach each geometry.
# Columns present in both tables receive the default _x (58) / _y (48) suffixes.
#
# Because the merge is called from the plain DataFrame (GeoDataFrame on the
# right), pandas returns a plain DataFrame. Wrap the result back into a
# GeoDataFrame, reusing the source CRS, so that merged_gdf really is a
# GeoDataFrame for the downstream cells.
merged_gdf = gpd.GeoDataFrame(
    communes58_csv_df.merge(communes_json_gdf, left_on='code_48', right_on='code', how='left'),
    geometry='geometry',
    crs=communes_json_gdf.crs,
)
# Non-null count per column: a quick check that every row received a geometry.
merged_gdf.count()

code_wil_x    1541
code_48       1541
code_58       1541
nom_x         1541
nom_maj_x     1541
nom_ar         202
code          1541
code_5        1541
nom_y         1541
nom_maj_y     1541
code_wil_y    1541
geometry      1541
dtype: int64

In [9]:
# Compare the 58-table name (nom_x) with the GeoJSON name (nom_y) for each
# row, case-insensitively. Despite its name, the 'similarity' column holds the
# Levenshtein edit distance, so larger values mean the names differ MORE.
merged_gdf['similarity'] = [
    jellyfish.levenshtein_distance(str(name_58).lower(), str(name_48).lower())
    for name_58, name_48 in zip(merged_gdf['nom_x'], merged_gdf['nom_y'])
]
# Show only the rows whose names differ by more than two edits.
merged_gdf.loc[merged_gdf['similarity'] > 2]

Unnamed: 0,code_wil_x,code_48,code_58,nom_x,nom_maj_x,nom_ar,code,code_5,nom_y,nom_maj_y,code_wil_y,geometry,similarity
5,1,105,106,In Zghmir,IN ZGHMIR,إن زغمير,105,1005,Inzegmir,INZEGMIR,1,"MULTIPOLYGON (((0.47629 27.06526, 0.28392 27.0...",3
1443,47,4713,4710,Mansoura,MANSOURA,المنصورة,4713,47013,El Mansoura,EL MANSOURA,47,"MULTIPOLYGON (((2.27777 32.54921, 2.34106 32.4...",3
1498,51,710,5105,Chaïba,CHAÏBA,الشعيبة,710,7010,Ech Chaiba,ECH CHAIBA,7,"MULTIPOLYGON (((4.92603 34.86042, 4.92670 34.8...",5
1511,53,1110,5302,Foggaret Ezzaouia,FOGGARET EZZAOUIA,فقارت الزاوية,1110,11010,Foggarat Ez Zouaia,FOUGGARAT EZ ZOUAIA,11,"MULTIPOLYGON (((5.72264 29.75771, 5.72269 29.6...",4
1529,56,3305,5602,Bordj El Haouasse,BORDJ EL HAOUASSE,برج الحواس,3305,33005,Bordj El Haoues,BORDJ EL HAOUES,33,"MULTIPOLYGON (((9.05073 25.14706, 9.01148 24.9...",3
1530,57,3927,5701,El Megaier,EL MEGAIER,المغير,3927,39027,El Mghair,EL MGHAIR,39,"MULTIPOLYGON (((5.27105 33.97620, 5.27604 33.9...",3
1531,57,3929,5702,Oum Touyour,OUM TOUYOUR,أم الطيور,3929,39029,Oum Tiour,OUM TIOUR,39,"MULTIPOLYGON (((5.18354 34.41286, 5.18422 34.4...",3
1539,58,4712,5802,Hassi Gara,HASSI GARA,حاسي القارة,4712,47012,Hassi El Garaa,HASSI EL GARAA,47,"MULTIPOLYGON (((4.03471 30.60033, 4.03410 30.5...",4


In [10]:
# Build the final communes table from the merged frame:
#   1. drop the 48-side duplicate columns and the helper 'similarity' column,
#   2. strip the _x suffix from the 58-side columns that are kept,
#   3. put the remaining columns in their final order.
communes_gdf = (
    merged_gdf
    .drop(columns=['code', 'nom_y', 'nom_maj_y', 'code_wil_y', 'similarity'])
    .rename(columns={"code_wil_x": "code_wil", 'nom_x' : 'nom', 'nom_maj_x' : 'nom_maj'})
    .loc[:, ['code_48', 'code_58', 'code_5', 'nom', 'nom_maj', 'nom_ar', 'code_wil', 'geometry']]
)
communes_gdf.head()

Unnamed: 0,code_48,code_58,code_5,nom,nom_maj,nom_ar,code_wil,geometry
0,101,101,1001,Adrar,ADRAR,أدرار,1,"MULTIPOLYGON (((-0.22565 28.08656, -0.04751 28..."
1,115,102,1015,Fenoghil,FENOGHIL,فنوغيل,1,"MULTIPOLYGON (((-2.91984 27.69890, -2.49431 27..."
2,102,103,1002,Tamest,TAMEST,تاماست,1,"MULTIPOLYGON (((-2.98197 27.50450, -2.49629 27..."
3,104,104,1004,Reggane,REGGANE,رقان,1,"MULTIPOLYGON (((0.51798 26.92294, 0.90476 26.2..."
4,118,105,1018,Sali,SALI,سالي,1,"MULTIPOLYGON (((0.47629 27.06526, 0.51713 26.9..."


In [11]:
# Sort by the 58-wilaya commune code. ignore_index=True already renumbers the
# rows 0..N-1, so no separate reset_index call is needed (a bare
# reset_index(drop=True) returns a new frame and has no effect unless its
# result is assigned).
communes_gdf = communes_gdf.sort_values(by=['code_58'], ignore_index=True)
communes_gdf.head()

Unnamed: 0,code_48,code_58,code_5,nom,nom_maj,nom_ar,code_wil,geometry
0,101,101,1001,Adrar,ADRAR,أدرار,1,"MULTIPOLYGON (((-0.22565 28.08656, -0.04751 28..."
1,115,102,1015,Fenoghil,FENOGHIL,فنوغيل,1,"MULTIPOLYGON (((-2.91984 27.69890, -2.49431 27..."
2,102,103,1002,Tamest,TAMEST,تاماست,1,"MULTIPOLYGON (((-2.98197 27.50450, -2.49629 27..."
3,104,104,1004,Reggane,REGGANE,رقان,1,"MULTIPOLYGON (((0.51798 26.92294, 0.90476 26.2..."
4,118,105,1018,Sali,SALI,سالي,1,"MULTIPOLYGON (((0.47629 27.06526, 0.51713 26.9..."


In [12]:
# Write the communes to a GeoJSON file.
# NOTE(review): the original heading also mentioned a CSV export, but only the
# GeoJSON write is visible in this cell — confirm whether a to_csv call is missing.
# Re-base the index so rows are labelled 1..N instead of 0..N-1.
communes_gdf.index = np.arange(1, len(communes_gdf) + 1)
# Wrapping in gpd.GeoDataFrame ensures the GeoDataFrame .to_file writer is
# available whatever the concrete type of communes_gdf is at this point.
gpd.GeoDataFrame(communes_gdf).to_file("communes_58.geojson", driver='GeoJSON')