# Ethnicity and Religion

In [76]:
# Import packages
import os

import numpy as np
import pandas as pd

In [77]:
# Load the two raw CSV files: `epr` from EPR-2021.csv and `er` from ER-2021.csv.
epr, er = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/dataset/ethnicity/EPR-2021.csv',
        '/dataset/ethnicity/ER-2021.csv',
    )
)

# Check features and remove the features of each dataset that are not needed
def check_features(df, name="Dataset"):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: a separator line, the dataset label, the column index,
    the ``(rows, columns)`` shape and the ``DataFrame.info()`` summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str, default "Dataset"
        Label shown in the "Dataset: ..." header line.

    Returns
    -------
    None
    """
    print("---------------------------------- \n")
    print(f"Dataset: {name} \n")
    print(df.columns)
    print(df.shape)
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line after the summary.
    df.info()

# Print the columns, shape and info() summary of both raw frames.
check_features(epr, "epr")
check_features(er, "er")

---------------------------------- 

Dataset: epr 

Index(['gwid', 'statename', 'from', 'to', 'group', 'groupid', 'gwgroupid',
       'umbrella', 'size', 'status', 'reg_aut'],
      dtype='object')
(4339, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4339 entries, 0 to 4338
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gwid       4339 non-null   int64  
 1   statename  4339 non-null   object 
 2   from       4339 non-null   int64  
 3   to         4339 non-null   int64  
 4   group      4339 non-null   object 
 5   groupid    4339 non-null   int64  
 6   gwgroupid  4339 non-null   int64  
 7   umbrella   306 non-null    float64
 8   size       4339 non-null   float64
 9   status     4339 non-null   object 
 10  reg_aut    3813 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 373.0+ KB
None
---------------------------------- 

Dataset: er 

Index(['year', 'coa', 'coo', 'totalrefugees'

In [78]:
### EPR dataset ###
# Columns of the raw frame, as reported by epr.info() right after loading:
#  0   gwid       4339 non-null   int64
#  1   statename  4339 non-null   object
#  2   from       4339 non-null   int64
#  3   to         4339 non-null   int64
#  4   group      4339 non-null   object
#  5   groupid    4339 non-null   int64
#  6   gwgroupid  4339 non-null   int64
#  7   umbrella   306 non-null    float64
#  8   size       4339 non-null   float64
#  9   status     4339 non-null   object
#  10  reg_aut    3813 non-null   object

# Rename 'statename' to 'country' and keep only the columns used downstream,
# which discards gwid, groupid, gwgroupid, umbrella, reg_aut and status.
# Selecting the kept columns (instead of dropping the others in place) lets
# the cell run a second time: rename() ignores an already-renamed
# 'statename', whereas dropping already-removed columns raises KeyError.
epr = epr.rename(columns={"statename": "country"})[["country", "from", "to", "group", "size"]]
epr.head(10)

Unnamed: 0,country,from,to,group,size
0,United States of America,1946,1965,Whites,0.691
1,United States of America,1946,1965,African Americans,0.124
2,United States of America,1946,1965,American Indians,0.0078
3,United States of America,1966,2008,Whites,0.691
4,United States of America,1966,2008,Latinos,0.125
5,United States of America,1966,2008,African Americans,0.124
6,United States of America,1966,2008,Asian Americans,0.036
7,United States of America,1966,2008,American Indians,0.0078
8,United States of America,1966,2008,Arab Americans,0.0042
9,United States of America,2009,2014,Whites,0.66


In [79]:
# NOTE(review): this cell is disabled -- the whole experiment is wrapped in a
# string literal, so none of it runs; the cell only echoes the string.
# The code inside reads epr["status"], a column the EPR cleaning cell removes,
# so it would have to run against the raw frame if it is ever re-enabled.
'''
# check the proportion of 'status' and unique values
print(epr["status"].value_counts(normalize=True) * 100)
print(epr["status"].nunique())

# only keep the rows with the 'status' == 'DOMINANT' or 'MONOPOLY' as the others does not have the political power or does not have proper government
epr = epr[epr["status"].isin(['DOMINANT', 'MONOPOLY'])]
epr.head(10)
'''

'\n# check the proportion of \'status\' and unique values\nprint(epr["status"].value_counts(normalize=True) * 100)\nprint(epr["status"].nunique())\n\n# only keep the rows with the \'status\' == \'DOMINANT\' or \'MONOPOLY\' as the others does not have the political power or does not have proper government\nepr = epr[epr["status"].isin([\'DOMINANT\', \'MONOPOLY\'])]\nepr.head(10)\n'

In [80]:
# Turn every [from, to] period of epr into one record per calendar year.
# Both end points are inclusive, hence the `+ 1` on the upper bound.
expanded_rows = [
    {"country": country, "year": year, "group": group, "size": size}
    for country, first_year, last_year, group, size in zip(
        epr["country"], epr["from"], epr["to"], epr["group"], epr["size"]
    )
    for year in range(first_year, last_year + 1)
]

# Build the expanded frame from the per-year records.
expanded_epr = pd.DataFrame(expanded_rows)

# Preview the first 30 expanded rows.
print(expanded_epr.head(30))

                     country  year              group   size
0   United States of America  1946             Whites  0.691
1   United States of America  1947             Whites  0.691
2   United States of America  1948             Whites  0.691
3   United States of America  1949             Whites  0.691
4   United States of America  1950             Whites  0.691
5   United States of America  1951             Whites  0.691
6   United States of America  1952             Whites  0.691
7   United States of America  1953             Whites  0.691
8   United States of America  1954             Whites  0.691
9   United States of America  1955             Whites  0.691
10  United States of America  1956             Whites  0.691
11  United States of America  1957             Whites  0.691
12  United States of America  1958             Whites  0.691
13  United States of America  1959             Whites  0.691
14  United States of America  1960             Whites  0.691
15  United States of Ame

In [81]:
# Count the missing values in each column of expanded_epr.
print(expanded_epr.isna().sum())

country    0
year       0
group      0
size       0
dtype: int64


In [82]:
################# FUNCTIONS ################
# Keep only the OECD countries.
# This helper selects the rows whose country name matches an OECD member;
# it is reused for the other datasets as well.
def filter_countries(df, country_column="country"):
    """Keep the rows that refer to an OECD country and add placeholders for absent ones.

    A row is kept when its country label contains the name of any country in
    the reference list, compared case-insensitively (so "United States of
    America" is kept because it contains "United States"). Kept rows retain
    their original label.

    The reference countries whose exact name does not occur among the kept
    labels are then appended as placeholder rows, in reference-list order:
    every other numeric column is set to 0 and every non-numeric column to
    "N/A". Because this second check is an exact match, labels that only match
    as a variant should be normalised before calling this function, otherwise
    the canonical name is appended as a placeholder as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    country_column : str, default "country"
        Name of the column holding the country labels.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, followed by the placeholder rows (index reset) when
        at least one reference country is absent.

    Raises
    ------
    ValueError
        If ``country_column`` is not a column of ``df``.
    """
    if country_column not in df.columns:
        raise ValueError(f"Column '{country_column}' not found in DataFrame")

    oecd_countries = [
        'Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Colombia', 'Costa Rica',
        'Czechia', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
        'Hungary', 'Iceland', 'Ireland', 'Israel', 'Italy', 'Japan', 'Korea', 'Latvia',
        'Lithuania', 'Luxembourg', 'Mexico', 'Netherlands', 'New Zealand', 'Norway',
        'Poland', 'Portugal', 'Slovenia', 'Spain', 'Sweden', 'Switzerland',
        'Turkey', 'United Kingdom', 'United States'
    ]  # 37

    # Lower-case the reference names once instead of once per row.
    oecd_lowercase = [country.lower() for country in oecd_countries]

    def _mentions_oecd_country(label):
        """Return True when the label contains any reference name (case-insensitive)."""
        label = label.lower()
        return any(country in label for country in oecd_lowercase)

    df_filtered = df[df[country_column].astype(str).apply(_mentions_oecd_country)]

    # Walk the reference list rather than taking a set difference, so the
    # placeholder rows are appended in the same order on every run.
    present = set(df_filtered[country_column])
    missing_countries = [country for country in oecd_countries if country not in present]

    if missing_countries:
        missing_data = pd.DataFrame({country_column: missing_countries})
        for col in df.columns:
            if col != country_column:
                if pd.api.types.is_numeric_dtype(df[col]):
                    missing_data[col] = 0
                else:
                    missing_data[col] = "N/A"
        df_filtered = pd.concat([df_filtered, missing_data], ignore_index=True)

    return df_filtered

# Restrict a dataset to a year range and fill in the missing country-year rows.
# This helper is reused for the other datasets as well.
def filter_fill_years(df, start_year=1990, end_year=2021, year_column="year", country_column="country"):
    """Restrict a frame to a year range and complete it to one row per OECD country and year.

    Steps:
    1. keep the rows whose year lies in ``[start_year, end_year]`` (inclusive);
    2. keep only the first row of every (country, year) pair -- later rows
       with the same pair are discarded;
    3. reindex on the full product of the reference country list and the year
       range, which drops countries outside the list and creates empty rows
       for absent pairs;
    4. fill the gaps with 0 for columns that are numeric in ``df`` and with
       "N/A" for all other columns.

    Rows come back ordered by the reference list, then by year, so the result
    is identical on every run.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    start_year, end_year : int, default 1990 and 2021
        Inclusive bounds of the year range.
    year_column : str, default "year"
        Name of the year column.
    country_column : str, default "country"
        Name of the country column.

    Returns
    -------
    pandas.DataFrame
        Frame with ``country_column`` and ``year_column`` first, followed by
        the remaining columns of ``df``.

    Raises
    ------
    ValueError
        If ``year_column`` or ``country_column`` is not a column of ``df``.
    """
    if year_column not in df.columns or country_column not in df.columns:
        raise ValueError("The dataset must contain 'year' and 'country' columns.")

    in_range = (df[year_column] >= start_year) & (df[year_column] <= end_year)
    # reindex() below needs a unique index, so only the first row of each
    # (country, year) pair survives.
    df_filtered = df[in_range].drop_duplicates(subset=[country_column, year_column])

    oecd_countries = ['Australia', 'Austria', 'Belgium', 'Canada', 'Chile', 'Colombia', 'Costa Rica',
    'Czechia', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
    'Ireland', 'Israel', 'Italy', 'Japan', 'Korea', 'Latvia', 'Lithuania', 'Luxembourg', 'Mexico', 'Netherlands',
    'New Zealand', 'Norway', 'Poland', 'Portugal', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
    'United Kingdom', 'United States']

    all_years = list(range(start_year, end_year + 1))
    # Use the ordered list, not a set: set iteration order for strings varies
    # between interpreter runs and would shuffle the output rows.
    multi_index = pd.MultiIndex.from_product([oecd_countries, all_years], names=[country_column, year_column])

    df_filtered = df_filtered.set_index([country_column, year_column]).reindex(multi_index).reset_index()

    for col in df.columns:
        if col not in [year_column, country_column]:
            if pd.api.types.is_numeric_dtype(df[col]):
                df_filtered[col] = df_filtered[col].fillna(0)
            else:
                df_filtered[col] = df_filtered[col].fillna("N/A")

    return df_filtered


# check the year range of each country and unique country names in each dataset
def check_year_country(df):
    """Summarise the year coverage per country and list the distinct countries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'country' and a 'year' column.

    Returns
    -------
    tuple
        ``(year_summary, unique_countries, unique_country_count)`` where
        ``year_summary`` has one row per country with the columns 'country',
        'min_year', 'max_year' and 'year_count' (number of non-null years),
        ``unique_countries`` is the array returned by ``Series.unique()`` and
        ``unique_country_count`` is its length.

    Raises
    ------
    ValueError
        If 'country' or 'year' is not a column of ``df``.
    """
    if not ("country" in df.columns and "year" in df.columns):
        raise ValueError("The dataset must contain 'country' and 'year' columns.")

    # Named aggregation yields the final column labels directly.
    year_summary = (
        df.groupby("country")["year"]
        .agg(min_year="min", max_year="max", year_count="count")
        .reset_index()
    )

    unique_countries = df["country"].unique()

    return year_summary, unique_countries, len(unique_countries)

In [83]:
### ER dataset ###
# Columns of the raw frame, as reported by er.info() right after loading:
#  0   year                    4862 non-null   int64   // year
#  1   coa                     4862 non-null   object  // country of asylum
#  2   coo                     4862 non-null   object  // country of origin
#  3   totalrefugees           4858 non-null   float64 // total number of refugees
#  4   ccode_coo               4862 non-null   int64   // country code of origin
#  5   ccode_coa               4862 non-null   int64   // country code of asylum
#  6   minimal_distance        4734 non-null   float64 // minimal distance between coa and coo in km
#  7   groupname1              4680 non-null   object  // name of the first ethnic refugee group
#  8   gwgroupid1              4219 non-null   float64 // id of the first ethnic refugee group
#  9   groupname2              2488 non-null   object  // name of the second ethnic refugee group
#  10  gwgroupid2              2270 non-null   float64 // id of the second ethnic refugee group
#  11  groupname3              986 non-null    object  // name of the third ethnic refugee group
#  12  gwgroupid3              819 non-null    float64 // id of the third ethnic refugee group
#  13  sources                 2783 non-null   object  // reference
#  14  quote                   2015 non-null   object  // quote or reference phrase for the coding decision
#  15  groupshare1_num         4431 non-null   object  // categorical size of the first ethnic refugee group
#  16  groupshare2_num         2489 non-null   object  // categorical size of the second ethnic refugee group
#  17  groupshare3_num         949 non-null    object  // categorical size of the third ethnic refugee group
#  18  group1share_multiplier  4416 non-null   float64 // estimated share of the first ethnic refugee group
#  19  group2share_multiplier  2477 non-null   float64 // estimated share of the second ethnic refugee group
#  20  group3share_multiplier  942 non-null    float64 // estimated share of the third ethnic refugee group
#  21  group1_size             4416 non-null   float64 // estimated number of refugees belonging to the first ethnic refugee group
#  22  group2_size             2477 non-null   float64 // estimated number of migrants belonging to the second ethnic refugee group
#  23  group3_size             942 non-null    float64 // estimated number of migrants belonging to the third ethnic refugee group

# check the total number of records
print(er.shape[0])

# Drop everything that is not needed: country codes, group ids, sources,
# quotes, distance, group names, categorical shares, share multipliers and the
# country of origin ('coo'). The country of asylum ('coa') becomes 'country'.
# The three group*_size columns are kept for now so that they still appear in
# the missing-value report below.
er = er.drop(columns=[
    'ccode_coo', 'ccode_coa', 'gwgroupid1', 'gwgroupid2', 'gwgroupid3',
    'sources', 'quote', 'minimal_distance',
    'groupname1', 'groupname2', 'groupname3',
    'groupshare1_num', 'groupshare2_num', 'groupshare3_num',
    'group1share_multiplier', 'group2share_multiplier', 'group3share_multiplier',
    'coo',
]).rename(columns={'coa': 'country'})

# check the proportion of nan values of each column
print(er.isna().sum() / er.shape[0] * 100)

# Author's note on the raw data: the gaps are not random. A numeric value is
# missing because it is meant to be 0, or a group column is empty because that
# group does not exist (e.g. only group 1 is present when there was no second
# or third refugee group). Columns with 50-80% missing values are therefore
# not dropped for being sparse; missing values are filled with 0 if numerical
# and with 'N/A' if categorical.
# NOTE: no 'total size' column is computed here; the group size columns are
# removed right after the fill.
er = er.apply(lambda x: x.fillna(0) if x.dtype.kind in 'buif' else x.fillna('N/A'))

# drop the group size columns
er = er.drop(columns=['group1_size', 'group2_size', 'group3_size'])

# info() prints its own report and returns None, so it is not wrapped in print().
er.info()
print(er.head(5))

4862
year              0.000000
country           0.000000
totalrefugees     0.082271
group1_size       9.173180
group2_size      49.053887
group3_size      80.625257
dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4862 entries, 0 to 4861
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           4862 non-null   int64  
 1   country        4862 non-null   object 
 2   totalrefugees  4862 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 114.1+ KB
None
   year        country  totalrefugees
0  1988  United States         3863.0
1  1989  United States         7668.0
2  1990  United States        12450.0
3  1991  United States        16317.0
4  1992  United States        19934.0


In [84]:
# check er dataset again
# (after the cleaning cell only year, country and totalrefugees remain)
check_features(er, "er")

---------------------------------- 

Dataset: er 

Index(['year', 'country', 'totalrefugees'], dtype='object')
(4862, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4862 entries, 0 to 4861
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           4862 non-null   int64  
 1   country        4862 non-null   object 
 2   totalrefugees  4862 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 114.1+ KB
None


In [85]:
# change er['country']='United States of America' to 'United States'
# NOTE: DataFrame.replace is applied to every column of the frame, not only
# to 'country'.
er = er.replace({'United States of America': 'United States'})

In [86]:
# Harmonise the EPR country labels with the names used in the OECD list:
#   - remove the "Korea, People's Republic of" rows, whose label contains the
#     word "Korea" and would otherwise pass the substring match in
#     filter_countries;
#   - map 'Korea, Republic of', 'Italy/Sardinia', 'Turkey (Ottoman Empire)'
#     and 'United States of America' to their short names.
expanded_epr = (
    expanded_epr
    .loc[expanded_epr['country'] != "Korea, People's Republic of"]
    .replace({
        'Korea, Republic of': 'Korea',
        'Italy/Sardinia': 'Italy',
        'Turkey (Ottoman Empire)': 'Turkey',
        'United States of America': 'United States',
    })
)

In [87]:
# manipulate the country names and range of each dataset
# Show the distinct country labels of both frames before filtering.
print(sorted(er["country"].unique()))
print("\n----------------------\n")
print(sorted(expanded_epr["country"].unique()))
print("\n----------------------\n")

# keep only the OECD countries, including name variations
# (filter_countries keeps every label that contains an OECD name, compared
# case-insensitively, and appends placeholder rows for absent countries)
er_oecd = filter_countries(er, country_column="country")
expanded_epr_oecd = filter_countries(expanded_epr, country_column="country")
# Show the remaining labels and how many there are, for each frame.
print("after filtering: ")
print(sorted(er_oecd["country"].unique()))
print("\n----------------------\n")
print(len(er_oecd["country"].unique()))
print("\n----------------------\n")
print(sorted(expanded_epr_oecd["country"].unique()))
print("\n----------------------\n")
print(len(expanded_epr_oecd["country"].unique()))


['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Austria', 'Azerbaijan', 'Bangladesh', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Burkina Faso', 'Burundi', 'Cameroon', 'Canada', 'Central African Rep.', 'Central African Republic', 'Chad', 'China', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Rep.', "CÌ«te d'Ivoire", 'Dem. Rep. of the Congo', 'Democratic Republic of Congo', 'Djibouti', 'Dominican Rep.', 'Ecuador', 'Egypt', 'Eritrea', 'Ethiopia', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Honduras', 'Hungary', 'India', 'Indonesia', 'Iran', 'Iraq', 'Islamic Rep. of Iran', 'Israel', 'Italy', 'Ivory Coast', 'Jordan', 'Kazakhstan', 'Kenya', 'Kuwait', 'Kyrgyzstan', "Lao People's Dem. Rep.", 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Libyan Arab Jamahiriya', 'Malawi', 'Malaysia', 'Mali', 'Malta', 'Mauritania', 'Mexico', 'Montenegro', 'Mozambiqu

In [88]:
# now filter the year range
# Both frames are cut to 1990-2021 and completed to one row per OECD country
# and year; gaps are filled with 0 (numeric) or "N/A" (other columns).
# NOTE(review): filter_fill_years keeps only the first row of every
# (country, year) pair. expanded_epr holds several ethnic groups per country
# and year, so epr_final retains just the first listed group for each pair --
# confirm this is intended.
er_final = filter_fill_years(er_oecd, start_year=1990, end_year=2021)
epr_final = filter_fill_years(expanded_epr_oecd, start_year=1990, end_year=2021)

In [89]:
# check again the number of years of each unique country has in each dataset
# Each call prints the tuple (per-country year summary, array of distinct
# countries, number of distinct countries).
print(check_year_country(er_final))
print("\n----------------------\n")
print(check_year_country(epr_final))

(           country  min_year  max_year  year_count
0        Australia      1990      2021          32
1          Austria      1990      2021          32
2          Belgium      1990      2021          32
3           Canada      1990      2021          32
4            Chile      1990      2021          32
5         Colombia      1990      2021          32
6       Costa Rica      1990      2021          32
7          Czechia      1990      2021          32
8          Denmark      1990      2021          32
9          Estonia      1990      2021          32
10         Finland      1990      2021          32
11          France      1990      2021          32
12         Germany      1990      2021          32
13          Greece      1990      2021          32
14         Hungary      1990      2021          32
15         Iceland      1990      2021          32
16         Ireland      1990      2021          32
17          Israel      1990      2021          32
18           Italy      1990  

In [91]:
# check again the number of unique countries in each dataset without using the function
print(len(er_final["country"].unique()))
print("\n----------------------\n")
print(len(epr_final["country"].unique()))

# info() prints its own report and returns None; wrapping it in print() would
# add a stray "None" line after each summary.
er_final.info()
print("\n----------------------\n")
epr_final.info()

37

----------------------

37
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country        1184 non-null   object 
 1   year           1184 non-null   int64  
 2   totalrefugees  1184 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 27.9+ KB
None

----------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1184 entries, 0 to 1183
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  1184 non-null   object 
 1   year     1184 non-null   int64  
 2   group    1184 non-null   object 
 3   size     1184 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 37.1+ KB
None


In [93]:
# save the cleaned datasets
def save_cleaned_datasets(df, filename="cleaned_data.csv", file_format="csv", save_path="/work/Yeni/ethnic/"):
    """Write a cleaned DataFrame to ``save_path/filename`` as a CSV file.

    The target directory is created when it does not exist yet, the frame is
    written without its index, and a confirmation line with the final path is
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.
    filename : str, default "cleaned_data.csv"
        Name of the file inside ``save_path``.
    file_format : str, default "csv"
        Output format; "csv" is the only accepted value.
    save_path : str, default "/work/Yeni/ethnic/"
        Directory the file is written to.

    Raises
    ------
    ValueError
        If ``file_format`` is anything other than "csv".
    """
    # Only CSV is supported. The former "excel" branch could never run because
    # this guard rejects every other format first, so it has been removed.
    if file_format != "csv":
        raise ValueError("The file format must be 'csv'.")

    os.makedirs(save_path, exist_ok=True)
    file_path = os.path.join(save_path, filename)
    df.to_csv(file_path, index=False)

    print(f"Saved the cleaned dataset to {file_path}.")


# Write both cleaned frames as CSV files into the function's default save_path.
save_cleaned_datasets(er_final, filename="er_cleaned.csv")
save_cleaned_datasets(epr_final, filename="epr_cleaned.csv")

Saved the cleaned dataset to /work/Yeni/ethnic/yeni_er.csv.
Saved the cleaned dataset to /work/Yeni/ethnic/yeni_epr.csv.
