In [1]:
from scipy.stats import kstest
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
import chardet
import pycountry
import csv
from pathlib import Path
from all_functions import refactorGiniWorldBankDf

In [2]:
revenuMondial = pd.read_csv("../data/data-projet7.csv", parse_dates=True)
revenuMondial

Unnamed: 0,country,year_survey,quantile,nb_quantiles,income,gdpppp
0,ALB,2008,1,100,72889795,7297
1,ALB,2008,2,100,91666235,7297
2,ALB,2008,3,100,1010916,7297
3,ALB,2008,4,100,10869078,7297
4,ALB,2008,5,100,11326997,7297
...,...,...,...,...,...,...
11594,COD,2008,96,100,8106233,30319305
11595,COD,2008,97,100,9117834,30319305
11596,COD,2008,98,100,10578074,30319305
11597,COD,2008,99,100,12866029,30319305


In [3]:
# Ci-dessus, les revenues mondiale pour une période donnée

In [4]:
# Premère étape. Identifiquation des pays

In [5]:
isoUri = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'
codes = pd.read_csv(isoUri)
countryCodes = codes[['name','alpha-3', 'region', 'sub-region']]
countryCodes = countryCodes.rename(columns={"alpha-3":"iso"})
countryCodes

Unnamed: 0,name,iso,region,sub-region
0,Afghanistan,AFG,Asia,Southern Asia
1,Åland Islands,ALA,Europe,Northern Europe
2,Albania,ALB,Europe,Southern Europe
3,Algeria,DZA,Africa,Northern Africa
4,American Samoa,ASM,Oceania,Polynesia
...,...,...,...,...
244,Wallis and Futuna,WLF,Oceania,Polynesia
245,Western Sahara,ESH,Africa,Northern Africa
246,Yemen,YEM,Asia,Western Asia
247,Zambia,ZMB,Africa,Sub-Saharan Africa


In [6]:
merged_df = pd.merge(revenuMondial, countryCodes, left_on='country', right_on='iso', how='left')
incomes = merged_df[['name','country', 'year_survey', 'quantile','nb_quantiles','income','gdpppp']]
incomes = incomes.rename(columns={"name":"nation"})
incomes = incomes.rename(columns={"year_survey":"year"})
incomesStatus = incomes.copy()
incomesStatus.replace(',', '.', regex=True, inplace=True)
incomesStatus['income'] = incomesStatus['income'].astype(float)
incomesStatus['gdpppp'] = incomesStatus['gdpppp'].astype(float)
incomesStatus['year'] = incomesStatus['year'].astype(int)
incomesStatus

Unnamed: 0,nation,country,year,quantile,nb_quantiles,income,gdpppp
0,Albania,ALB,2008,1,100,728.89795,7297.00000
1,Albania,ALB,2008,2,100,916.66235,7297.00000
2,Albania,ALB,2008,3,100,1010.91600,7297.00000
3,Albania,ALB,2008,4,100,1086.90780,7297.00000
4,Albania,ALB,2008,5,100,1132.69970,7297.00000
...,...,...,...,...,...,...,...
11594,Congo. Democratic Republic of the,COD,2008,96,100,810.62330,303.19305
11595,Congo. Democratic Republic of the,COD,2008,97,100,911.78340,303.19305
11596,Congo. Democratic Republic of the,COD,2008,98,100,1057.80740,303.19305
11597,Congo. Democratic Republic of the,COD,2008,99,100,1286.60290,303.19305


In [7]:
# Information dataFrame

In [8]:
incomesStatus["year"] = incomesStatus["year"].astype(int)

# Résumé des dates minimales et maximales
date_min = incomesStatus['year'].min()
date_max = incomesStatus['year'].max()

# Résumé des revenus minimales et maximales
incomeMin = incomesStatus['income'].min()
incomeMax = incomesStatus['income'].max()
rowMinIncome = incomesStatus.loc[incomesStatus['income'] == incomeMin]
rowMaxIncome = incomesStatus.loc[incomesStatus['income'] == incomeMax]

# Récupérer le pays associé aux revenus minimaux et maximaux
countryMinIncome = rowMinIncome['nation'].values[0]  # Prendre la première valeur car il pourrait y avoir plusieurs lignes
countryMaxIncome = rowMaxIncome['nation'].values[0]

# Affichage des résultats
print(f"Pays avec le revenu minimal : {countryMinIncome} {incomeMin}")
print(f"Pays avec le revenu maximal : {countryMaxIncome} {incomeMax}")

# Affichage des résultats
print(f"Date minimale : {date_min}")
print(f"Date maximale : {date_max}")


Pays avec le revenu minimal : China 16.719418
Pays avec le revenu maximal : United States of America 176928.55
Date minimale : 2004
Date maximale : 2011


In [17]:
incomesStatus['year'].sort_values(ascending=True).unique()

array([2004, 2006, 2007, 2008, 2009, 2010, 2011])

In [9]:
nombrePays = incomesStatus['country'].nunique()
print(f"Nombre de pays par année: {nombrePays}")

Nombre de pays par année: 116


Traitement des donnée manquante

In [10]:
# Grouper par pays
groupedByCountry = incomesStatus.groupby('country')
# Liste des quantiles attendus (1 à 100)
expectedQuantiles = set(range(1, 101))
# Créer une liste pour stocker les pays avec des quantiles manquants
countriesWithMissingQuantiles = []

# Parcourir chaque groupe (pays) et vérifier les quantiles manquants
for country, group in groupedByCountry:
    actualQuantiles = set(group['quantile'])
    missingQuantiles = expectedQuantiles - actualQuantiles
    
    if missingQuantiles:
        countriesWithMissingQuantiles.append((country, missingQuantiles))

# Afficher les résultats
if countriesWithMissingQuantiles:
    print("Pays avec des quantiles manquants :")
    for country, missingQuantiles in countriesWithMissingQuantiles:
        print(f"{country}: Quantiles manquants {missingQuantiles}")
else:
    print("Aucun pays avec des quantiles manquants.")

Pays avec des quantiles manquants :
LTU: Quantiles manquants {41}


In [11]:
LTU = incomesStatus[incomesStatus['country'] == 'LTU']
LTU

Unnamed: 0,nation,country,year,quantile,nb_quantiles,income,gdpppp
6200,Lithuania,LTU,2008,1,100,566.3453,17571.0
6201,Lithuania,LTU,2008,2,100,1147.0587,17571.0
6202,Lithuania,LTU,2008,3,100,1490.8408,17571.0
6203,Lithuania,LTU,2008,4,100,1746.5715,17571.0
6204,Lithuania,LTU,2008,5,100,1842.9542,17571.0
...,...,...,...,...,...,...,...
6294,Lithuania,LTU,2008,96,100,14597.0360,17571.0
6295,Lithuania,LTU,2008,97,100,15659.0080,17571.0
6296,Lithuania,LTU,2008,98,100,17883.8770,17571.0
6297,Lithuania,LTU,2008,99,100,22551.1700,17571.0


In [13]:
ltu = 'LTU'
moyenneLTUIncome = incomesStatus.loc[(incomesStatus['country'] == ltu) & (incomesStatus['quantile'] == 40) 
            | (incomesStatus['country'] == ltu) & (incomesStatus['quantile'] == 42)]['income'].mean();
moyenneLTUIncome

4882.14065

In [14]:
dfMissingRowLTU = pd.DataFrame({'nation':'Lithuania','country': [ltu], 'year': [2008], 'quantile': [41], 
                           'nb_quantiles': [100], 'income': [moyenneLTUIncome], 'gdpppp': 17571.0});

dfMissingRowLTU

Unnamed: 0,nation,country,year,quantile,nb_quantiles,income,gdpppp
0,Lithuania,LTU,2008,41,100,4882.14065,17571.0


In [15]:
incomesStatus = pd.concat([incomesStatus, dfMissingRowLTU], ignore_index=True, sort=True)
display(incomesStatus.shape)
incomesStatus.head()

(11600, 7)

Unnamed: 0,country,gdpppp,income,nation,nb_quantiles,quantile,year
0,ALB,7297.0,728.89795,Albania,100,1,2008
1,ALB,7297.0,916.66235,Albania,100,2,2008
2,ALB,7297.0,1010.916,Albania,100,3,2008
3,ALB,7297.0,1086.9078,Albania,100,4,2008
4,ALB,7297.0,1132.6997,Albania,100,5,2008


In [16]:
incomesStatus[incomesStatus['country'] == 'LTU']

Unnamed: 0,country,gdpppp,income,nation,nb_quantiles,quantile,year
6200,LTU,17571.0,566.34530,Lithuania,100,1,2008
6201,LTU,17571.0,1147.05870,Lithuania,100,2,2008
6202,LTU,17571.0,1490.84080,Lithuania,100,3,2008
6203,LTU,17571.0,1746.57150,Lithuania,100,4,2008
6204,LTU,17571.0,1842.95420,Lithuania,100,5,2008
...,...,...,...,...,...,...,...
6295,LTU,17571.0,15659.00800,Lithuania,100,97,2008
6296,LTU,17571.0,17883.87700,Lithuania,100,98,2008
6297,LTU,17571.0,22551.17000,Lithuania,100,99,2008
6298,LTU,17571.0,38836.53000,Lithuania,100,100,2008


In [18]:
print(incomesStatus.loc[incomesStatus['gdpppp'].isnull(),:])

      country  gdpppp      income               nation  nb_quantiles  \
5800      XKX     NaN   437.89370                  NaN           100   
5801      XKX     NaN   508.17133                  NaN           100   
5802      XKX     NaN   591.82820                  NaN           100   
5803      XKX     NaN   668.00000                  NaN           100   
5804      XKX     NaN   730.40220                  NaN           100   
...       ...     ...         ...                  ...           ...   
11294     PSE     NaN  2763.88480  Palestine. State of           100   
11295     PSE     NaN  3077.83330  Palestine. State of           100   
11296     PSE     NaN  3449.22240  Palestine. State of           100   
11297     PSE     NaN  4165.99700  Palestine. State of           100   
11298     PSE     NaN  6343.87550  Palestine. State of           100   

       quantile  year  
5800          1  2008  
5801          2  2008  
5802          3  2008  
5803          4  2008  
5804          5

In [24]:
pse = 'PSE'
xkx = 'XKX'

In [19]:
PSE = incomesStatus[incomesStatus['country'] == 'PSE']
PSE

Unnamed: 0,country,gdpppp,income,nation,nb_quantiles,quantile,year
11199,PSE,,195.28990,Palestine. State of,100,1,2009
11200,PSE,,264.36533,Palestine. State of,100,2,2009
11201,PSE,,301.44672,Palestine. State of,100,3,2009
11202,PSE,,329.83392,Palestine. State of,100,4,2009
11203,PSE,,348.76495,Palestine. State of,100,5,2009
...,...,...,...,...,...,...,...
11294,PSE,,2763.88480,Palestine. State of,100,96,2009
11295,PSE,,3077.83330,Palestine. State of,100,97,2009
11296,PSE,,3449.22240,Palestine. State of,100,98,2009
11297,PSE,,4165.99700,Palestine. State of,100,99,2009


In [20]:
XKX = incomesStatus[incomesStatus['country'] == 'XKX']
XKX

Unnamed: 0,country,gdpppp,income,nation,nb_quantiles,quantile,year
5800,XKX,,437.89370,,100,1,2008
5801,XKX,,508.17133,,100,2,2008
5802,XKX,,591.82820,,100,3,2008
5803,XKX,,668.00000,,100,4,2008
5804,XKX,,730.40220,,100,5,2008
...,...,...,...,...,...,...,...
5895,XKX,,5155.36470,,100,96,2008
5896,XKX,,5689.52930,,100,97,2008
5897,XKX,,6233.73930,,100,98,2008
5898,XKX,,7366.67700,,100,99,2008


Je récupére le dataset de la worlbank afin de récuperer les gd pppp 


In [26]:
gdpppp = pd.read_csv("../data/worldbank-gdpppp.csv", parse_dates=True, sep=';')
copyGd = gdpppp.copy
gdpppp

Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 66
0,Aruba,ABW,GDP per capita (constant LCU),,,,,,,,...,46386.672241,45804.246592,46317.795605,49316.830489,50242.212175,48863.866312,37095.214575,47369.425924,52368.693789,
1,Africa Eastern and Southern,AFE,GDP per capita (constant LCU),,,,,,,,...,,,,,,,,,,
2,Afghanistan,AFG,GDP per capita (constant LCU),,,,,,,,...,38847.426029,38200.066145,38067.969362,37971.431359,37330.254657,37678.483737,35657.158957,27467.799942,,
3,Africa Western and Central,AFW,GDP per capita (constant LCU),,,,,,,,...,,,,,,,,,,
4,Angola,AGO,GDP per capita (constant LCU),,,,,,,,...,59302.603031,57735.249863,54264.269701,52294.099553,49848.477305,47846.007064,43696.594575,42842.501809,42801.049662,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,Kosovo,XKX,GDP per capita (constant LCU),,,,,,,,...,2532.382836,2719.065761,2887.746831,3004.373544,3096.207170,3258.368018,3082.199861,3421.228675,3648.909798,
262,"Yemen, Rep.",YEM,GDP per capita (constant LCU),,,,,,,,...,13139.014541,9207.589974,8128.458740,7520.847253,7391.358212,,,,,
263,South Africa,ZAF,GDP per capita (constant LCU),50671.710277,51167.241062,52733.930548,54930.561464,57504.188684,59167.327958,59911.860526,...,79721.430691,79117.199555,78872.591986,79477.498714,79731.644247,78910.302314,73302.471622,75987.078108,76790.166294,
264,Zambia,ZMB,GDP per capita (constant LCU),7063.137721,6936.871069,6552.465091,6553.995782,7123.281483,8047.315901,7358.359443,...,7717.537014,7693.391834,7736.572581,7762.202615,7831.856543,7709.347755,7277.953234,7515.175042,7694.522329,


In [25]:
gdpppp[gdpppp['Country Code'] == pse]

Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 66
196,West Bank and Gaza,PSE,GDP per capita (constant LCU),,,,,,,,...,3227.849345,3272.154324,3483.099035,3462.979861,3417.794408,3378.434621,2922.468011,3051.48607,3095.499812,


In [30]:
copyGd = copyGd.drop(columns=['Unnamed: 66'])

# Utilisez la fonction melt pour regrouper les colonnes de dates en une seule colonne
df_melted = pd.melt(copyGd, id_vars=['Country Name', 'Country Code', 'Indicator Name'], var_name='Date', value_name='gdpppp')

# Supprimez les lignes avec des valeurs NaN dans la colonne 'gdpppp'
df_melted = df_melted.dropna(subset=['gdpppp'])
df_melted

AttributeError: 'function' object has no attribute 'drop'