In [None]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt

### Simpson's index

Formula: <br>
$$D = 1 - \frac{\sum n(n-1)}{N(N-1)}$$

<br>
<br>
n = number of individuals of each species (the sum runs over all species) <br>
N = total number of individuals of all species

# Occupation

In [None]:
# Load the cleaned occupation table from CSV and preview its first rows
occupation_dataset = pd.read_csv('./data/cleaned/occupation_cleaned.csv')
occupation_dataset.head()

In [None]:
# Show the column labels of the loaded occupation table
occupation_dataset.columns

In [None]:
# Number of individuals (n) for each occupation within each state.
# Only 'Persons' is aggregated: 'Main/Marginal' is presumably a category label
# (TODO confirm), and including it in .sum() would either concatenate its text
# into a meaningless column or raise, depending on the pandas version.
# The result is indexed by (State, Occupation classification).
occupation_dataset = (
    occupation_dataset
    .groupby(['State', 'Occupation classification'])[['Persons']]
    .sum()
)
occupation_dataset.head()

In [None]:
# n - 1 for every row, where n is that row's 'Persons' count
occupation_dataset['n-1'] = occupation_dataset['Persons'].sub(1)
occupation_dataset.head()

In [None]:
# n(n - 1) per row: the 'Persons' count multiplied by the stored 'n-1' column
occupation_dataset['n(n-1)'] = occupation_dataset['Persons'].mul(
    occupation_dataset['n-1']
)

occupation_dataset.head()

In [None]:
# Sum n(n - 1) over the first index level (the state) and turn the result into
# a flat table: one row per state, with the state moved from the index into a column
computations = (
    occupation_dataset['n(n-1)']
    .groupby(level=0)
    .sum()
    .to_frame()
    .rename(columns={"n(n-1)": "sum_n(n-1)"})
    .reset_index()
)
computations.head()

In [None]:
# N: total persons per state, summed across all occupations.
# The values are assigned positionally (as a plain array), which relies on this
# groupby listing the states in the same order as the one that built `computations`.
computations['N'] = (
    occupation_dataset.groupby(level=0)['Persons'].sum().to_numpy()
)
computations.head()

In [None]:
# N(N - 1): denominator of the Simpson's index fraction, one value per state
computations['N(N-1)'] = computations['N'].mul(computations['N'].sub(1))
computations.head()

In [None]:
# Simpson's index per state: 1 - sum_n(n-1) / N(N-1)
computations['SDI'] = 1 - computations['sum_n(n-1)'].div(computations['N(N-1)'])
computations.head()

In [None]:
# Keep an independent copy of the occupation results; the name `computations`
# is re-bound when the next section builds its own table
occupation_sdi = computations.copy()
occupation_sdi.head()

In [None]:
# Write the occupation results to occupation_sdi.csv; index=False leaves out the row index
occupation_sdi.to_csv('occupation_sdi.csv', index=False)

# Migrations

In [None]:
# Load the cleaned migration table from CSV and preview its first rows
migration_dataset = pd.read_csv('./data/cleaned/migration_cleaned.csv')
migration_dataset.head()

In [None]:
# Show the column labels of the loaded migration table
migration_dataset.columns

In [None]:
# Index the rows by (Current state, Previous residence) so that later
# groupby(level=0) calls aggregate per current state.
migration_dataset = migration_dataset.set_index(['Current state', 'Previous residence'])
# Preview only the first rows, like the other cells, instead of displaying the whole table
migration_dataset.head()

In [None]:
# n(n - 1) per row, where n is that row's 'Persons' count
migration_dataset['n(n-1)'] = migration_dataset['Persons'].mul(
    migration_dataset['Persons'].sub(1)
)
migration_dataset.head()

In [None]:
# Sum n(n - 1) over the first index level (the current state);
# the result keeps the state as its index and has the single column 'sum_n(n-1)'
computations = (
    migration_dataset['n(n-1)']
    .groupby(level=0)
    .sum()
    .to_frame(name="sum_n(n-1)")
)
computations.head()

In [None]:
# N: total persons per current state (aligned to `computations` by index label)
computations['N'] = migration_dataset.groupby(level=0)['Persons'].sum()
computations.head()

In [None]:
# Computation of N(N-1)
computations['N(-1)'] = computations['N'] * (computations['N'] - 1)
computations.head()

In [None]:
# Calculation of simpson's index for migration
computations['SDI'] = 1 - (computations['sum_n(n-1)'] / computations['N(-1)'])
computations.head()

In [None]:
# Keep an independent copy of the migration results; the name `computations`
# is re-bound when the next section builds its own table
migration_sdi = computations.copy()
migration_sdi.head()

In [None]:
# Write the migration results to migration_sdi.csv; the index (current state) is written too
migration_sdi.to_csv('migration_sdi.csv')

# Mother tongue diversity 

In [None]:
# Load the cleaned mother-tongue table from CSV and preview its first rows
mtd_dataset = pd.read_csv('./data/cleaned/mother_tongue_diversity_cleaned.csv')
mtd_dataset.head()

In [None]:
# Show the column labels of the loaded mother-tongue table
mtd_dataset.columns

In [None]:
# Index the rows by (Area name, Mother tongue name) so that later
# groupby(level=0) calls aggregate per area
mtd_dataset = mtd_dataset.set_index(keys=['Area name', 'Mother tongue name'])
mtd_dataset.head()

In [None]:
# n(n - 1) per row, where n is that row's 'Persons' count
mtd_dataset['n(n-1)'] = mtd_dataset['Persons'].mul(
    mtd_dataset['Persons'].sub(1)
)
mtd_dataset.head()

In [None]:
# Sum n(n - 1) over the first index level (the area);
# the result keeps the area as its index and has the single column 'sum_n(n-1)'
computations = (
    mtd_dataset['n(n-1)']
    .groupby(level=0)
    .sum()
    .to_frame(name="sum_n(n-1)")
)
computations.head()

In [None]:
# N: total persons per area (aligned to `computations` by index label)
computations['N'] = mtd_dataset.groupby(level=0)['Persons'].sum()
computations.head()

In [None]:
# Computation of N(N-1)
computations['N(-1)'] = computations['N'] * (computations['N'] - 1)
computations.head()

In [None]:
# Calculation of simpson's index for migration
computations['SDI'] = 1 - (computations['sum_n(n-1)'] / computations['N(-1)'])
computations.head()

In [None]:
# Keep an independent copy of the mother-tongue results; the name `computations`
# is re-bound when the next section builds its own table
mtd_sdi = computations.copy()
mtd_sdi.head()

In [None]:
# Write the mother-tongue results to mother_tongue_diversity_sdi.csv; the index (area name) is written too
mtd_sdi.to_csv('mother_tongue_diversity_sdi.csv')

# Religion

In [None]:
# Load the cleaned religion table from CSV and preview its first rows
religion_dataset = pd.read_csv('./data/cleaned/religion_cleaned.csv')
religion_dataset.head()

In [None]:
# Show the column labels of the loaded religion table
religion_dataset.columns

In [None]:
# Index the rows by (State, Religion) so that later
# groupby(level=0) calls aggregate per state
religion_dataset = religion_dataset.set_index(keys=['State', 'Religion'])
religion_dataset.head()

In [None]:
# n(n - 1) per row, where n is that row's 'Persons' count
religion_dataset['n(n-1)'] = religion_dataset['Persons'].mul(
    religion_dataset['Persons'].sub(1)
)
religion_dataset.head()

In [None]:
# Sum n(n - 1) over the first index level (the state);
# the result keeps the state as its index and has the single column 'sum_n(n-1)'
computations = (
    religion_dataset['n(n-1)']
    .groupby(level=0)
    .sum()
    .to_frame(name="sum_n(n-1)")
)
computations.head()

In [None]:
# N: total persons per state (aligned to `computations` by index label)
computations['N'] = religion_dataset.groupby(level=0)['Persons'].sum()
computations.head()

In [None]:
# Computation of N(N-1)
computations['N(-1)'] = computations['N'] * (computations['N'] - 1)
computations.head()

In [None]:
# Calculation of simpson's index for migration
computations['SDI'] = 1 - (computations['sum_n(n-1)'] / computations['N(-1)'])
computations.head()

In [None]:
# Keep an independent copy of the religion results under a section-specific name
religion_sdi = computations.copy()
religion_sdi.head()

In [None]:
# Write the religion results to religion_sdi.csv; the index (state) is written too
religion_sdi.to_csv('religion_sdi.csv')

In [None]:
# Count how many rows have a missing (NaN) SDI versus a computed one
religion_sdi['SDI'].isna().value_counts()