# Country features

In this notebook, the country of each customer is taken from the "customer_countries" dataset (loaded here from `data/countries.csv`) and added to the invoices.

- Markdown Cell 1: Explore the distribution of invoices across countries with a map
- Rest: clean the countries and store the new DataFrame

In [6]:
import pandas as pd
import pycountry
import plotly.express as px

# Load the invoice table and the customer -> country lookup table.
df_country = pd.read_csv('data/invoices_basic.csv')
countries = pd.read_csv("data/countries.csv")


# Store original country dataframe (exactly as read from disk)
original_df_country = df_country.copy()

# The last cell of this notebook writes its result back to
# 'data/invoices_basic.csv'. On a re-run the file therefore already contains
# the columns this notebook adds; merging again would produce
# 'isocode_x'/'isocode_y' and every later df_country['isocode'] access would
# fail with a KeyError. Dropping them first keeps the notebook re-runnable.
df_country = df_country.drop(columns=['isocode', 'country_enc'], errors='ignore')

# Keep a single country per customer: a customer_id that appears more than
# once in the lookup would otherwise duplicate that customer's invoice rows
# in the left merge. The first listed country wins.
country_lookup = countries[['customer_id', 'isocode']].drop_duplicates(subset='customer_id')

# Merge the dataframes on the 'customer_id' column; invoices whose customer
# has no lookup entry are kept and get a missing 'isocode'.
df_country = pd.merge(df_country, country_lookup, on='customer_id', how='left')

# Map ISO codes to country names
def get_country_name(iso_code):
    """Return the country name for an alpha-2 ISO code, or "Unknown".

    Parameters
    ----------
    iso_code : str or any
        Two-letter country code as looked up via pycountry's ``alpha_2``
        field. Surrounding whitespace is ignored. Non-string values
        (e.g. NaN for customers without a match in the left merge) are
        treated as unknown instead of being passed on to pycountry.

    Returns
    -------
    str
        The ``name`` of the matching pycountry record, or "Unknown" when the
        code cannot be resolved.
    """
    # Missing values are not codes; answer directly rather than let the
    # lookup fail on a float/None.
    if not isinstance(iso_code, str):
        return "Unknown"
    try:
        country = pycountry.countries.get(alpha_2=iso_code.strip())
    except LookupError:
        # NOTE(review): depending on the pycountry release an unresolved code
        # may raise (KeyError is a LookupError) instead of returning None.
        return "Unknown"
    return country.name if country else "Unknown"

# Tally the invoices per ISO code. value_counts() orders the result by
# frequency (highest first) and leaves out missing codes.
isocode_counts = df_country['isocode'].value_counts().reset_index()
isocode_counts.columns = ['isocode', 'count']

# Add a readable country name and each country's share (in %) of all
# counted invoices.
total_counted = isocode_counts['count'].sum()
isocode_counts = isocode_counts.assign(
    country_name=isocode_counts['isocode'].map(get_country_name),
    percentage_total=isocode_counts['count'] / total_counted * 100,
)

# Number of different customers behind the invoices of each ISO code
distinct_customers = (
    df_country.groupby('isocode')['customer_id']
    .nunique()
    .rename('distinct_customers')
    .reset_index()
)

# Attach the customer count to the per-country summary
isocode_counts = isocode_counts.merge(distinct_customers, how='left', on='isocode')


# Create a choropleth map of the number of invoices per country.
def _to_alpha3(alpha2_code):
    """Return the three-letter ISO code for a two-letter one, or None if unresolved."""
    if not isinstance(alpha2_code, str):
        return None
    try:
        match = pycountry.countries.get(alpha_2=alpha2_code)
    except LookupError:
        # NOTE(review): some pycountry releases may raise instead of returning None.
        return None
    return match.alpha_3 if match else None


# Locations are handed to plotly as ISO-3 codes instead of free-text country
# names: a code lookup is unambiguous, whereas name matching depends on how
# plotly spells/recognises each country. The summary table itself is left
# untouched; the extra column only lives in the plotting copy.
map_data = (
    isocode_counts
    .assign(iso_alpha3=isocode_counts['isocode'].map(_to_alpha3))
    .dropna(subset=['iso_alpha3'])  # codes that cannot be resolved cannot be drawn
)

fig = px.choropleth(
    map_data,
    locations='iso_alpha3',
    locationmode='ISO-3',
    color='count',
    hover_name='isocode',
    # show the readable name in the tooltip, hide the helper column
    hover_data={'iso_alpha3': False, 'country_name': True},
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Choropleth Map of Entries by Country'
)

# Show the plot
fig.show()


# Tabular view of the (up to) 30 countries with the most invoices
print(isocode_counts.head(30))

  isocode  count    country_name  percentage_total  distinct_customers
0      US      1   United States              25.0                   1
1      DE      1         Germany              25.0                   1
2      GB      1  United Kingdom              25.0                   1
3      RS      1          Serbia              25.0                   1


## Dropping countries with few invoices



In [7]:
# Keep only countries with strictly more than MIN_INVOICES_PER_COUNTRY invoices.
# 0 keeps every country, which is needed for the dummy data set (only 4
# invoices); the previously commented-out filter used 100 as the threshold.
MIN_INVOICES_PER_COUNTRY = 0
isocode_counts = isocode_counts[isocode_counts['count'] > MIN_INVOICES_PER_COUNTRY]


print("Invoices before: ", len(df_country))
subset_list = isocode_counts['isocode'].tolist()
# Invoices with a missing isocode are dropped here as well, because a missing
# code is never part of subset_list. .copy() makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_country = df_country[df_country['isocode'].isin(subset_list)].copy()

print("Invoices after: ", len(df_country))
print("Number of countries: ", len(df_country['isocode'].unique()))

Invoices before:  4
Invoices after:  4
Number of countries:  4


## Analyze payment differences between countries

In [10]:
# Encode country column for model

# Integer-encode the ISO codes. pd.factorize numbers the codes in order of
# first appearance; `uniques[code]` maps an encoded value back to its ISO code.
labels, uniques = pd.factorize(df_country['isocode'])
# assign() returns a new frame instead of writing into df_country, which may
# be a filtered slice of an earlier frame (avoids SettingWithCopyWarning).
df_country = df_country.assign(country_enc=labels)

# Drop every row that has a missing value in any column
df_country = df_country.dropna()

# Save csv. NOTE: this overwrites the same file the notebook loaded as input.
df_country.to_csv('data/invoices_basic.csv', index=False)

# Preview the first rows only instead of rendering the whole frame
df_country.head()


Unnamed: 0,amount,currency,payment_term,customer_id,receipt_date,due_date,weighted_payment_date,Currency,Rate,amount_euro,isocode,country_enc
0,1000,USD,60,abc123,2023-01-01,2023-03-02,2023-03-20,USD,1.1155,896.458987,US,0
1,1500,EUR,15,acb345,2023-01-01,2023-01-16,2023-01-25,EUR,1.0,1500.0,DE,1
2,2000,GBP,30,xyz678,2023-01-01,2023-01-31,2023-01-25,GBP,0.83428,2397.276694,GB,2
3,2500,RSD,30,abc987,2023-01-01,2023-01-01,2023-01-01,RSD,117.0955,21.350095,RS,3
