In [1]:
# importing libraries
import pandas as pd
import math
import matplotlib.pyplot as plt

# Occupation

In [None]:
# Load the cleaned occupation table (path is relative to the notebook's
# working directory) and preview the first rows.
occupation_dataset = pd.read_csv('./data/cleaned/occupation_cleaned.csv')
occupation_dataset.head()

In [None]:
# List the column labels of the loaded occupation table
occupation_dataset.columns

In [None]:
# Number of individuals for each occupation per state.
# Only the 'Persons' count is aggregated. 'Main/Marginal' is presumably a
# category label (TODO confirm); feeding it to sum() either drops it or
# concatenates its strings depending on the pandas version, so it is left out.
# NOTE: this rebinds occupation_dataset, so the cell cannot be re-run without
# re-running the load cell first.
occupation_dataset = (
    occupation_dataset
    .groupby(['State', 'Occupation classification'])[['Persons']]
    .sum()
)
occupation_dataset.head()

In [None]:
# Give the head-count column the short label 'n'
occupation_dataset = occupation_dataset.rename(columns={'Persons': 'n'})

In [None]:
# Inspect the aggregated occupation counts after the rename
occupation_dataset

In [None]:
# Head count per state: sum 'n' over the first index level
total_persons_pers_state = occupation_dataset.groupby(level=0)[['n']].sum()
total_persons_pers_state.head()

In [None]:
# Occupation proportion: each occupation's share of its state's head count.
# transform('sum') broadcasts the per-state total back onto every row, so the
# division is vectorised instead of being evaluated row by row via apply().
occupation_dataset['p'] = (
    occupation_dataset['n']
    / occupation_dataset.groupby(level=0)['n'].transform('sum')
)
occupation_dataset.head()

In [None]:
# Natural log of each proportion. math.log raises on 0, so a category with
# p == 0 is mapped to 0; its p * ln(p) term is then 0. This is the same guard
# the migration section uses.
occupation_dataset['ln_p'] = occupation_dataset['p'].apply(
    lambda value: math.log(value) if value != 0 else 0.0
)
occupation_dataset.head()

In [None]:
# Per-row entropy term p * ln(p)
occupation_dataset['p_ln_p'] = occupation_dataset['p'].mul(occupation_dataset['ln_p'])
occupation_dataset.head()

In [None]:
# Shannon index per state: negate the per-state sum of the p * ln(p) terms,
# then turn the state index into an ordinary column.
shannon_index = (
    occupation_dataset['p_ln_p']
    .groupby(level=0)
    .sum()
    .mul(-1)
    .to_frame()
    .reset_index()
)
# Separate frame with the state column lower-cased for export
occupation_si = shannon_index.rename(columns={'State': 'state'})
occupation_si.head()

In [None]:
# Write the per-state occupation index to the working directory;
# index=False keeps the positional row labels out of the file.
occupation_si.to_csv('occupation_si.csv', index=False)

# Migrations

In [None]:
# Load the cleaned migration table (path is relative to the notebook's
# working directory) and preview the first rows.
migration_dataset = pd.read_csv('./data/cleaned/migration_cleaned.csv')
migration_dataset.head()

In [None]:
# List the column labels of the loaded migration table
migration_dataset.columns

In [None]:
# Index rows by (current state, previous residence) so later cells can
# group on the first level.
migration_dataset = migration_dataset.set_index(
    keys=['Current state', 'Previous residence']
)
migration_dataset.head()

In [None]:
# Give the head-count column the short label 'n'
migration_dataset = migration_dataset.rename(columns={'Persons': 'n'})
migration_dataset.head()

In [None]:
# Head count per current state: sum 'n' over the first index level
total_persons_pers_state = migration_dataset.groupby(level=0)[['n']].sum()
total_persons_pers_state.head()

In [None]:
# Migration proportion: each previous residence's share of the current
# state's head count. transform('sum') broadcasts the per-state total back
# onto every row, so the division is vectorised instead of being evaluated
# row by row via apply().
migration_dataset['p'] = (
    migration_dataset['n']
    / migration_dataset.groupby(level=0)['n'].transform('sum')
)
migration_dataset.head()

In [None]:
# Natural log of each proportion; a zero proportion is mapped to 0 instead
# of being passed to math.log, which would raise.
migration_dataset['ln_p'] = migration_dataset['p'].apply(
    lambda share: 0 if share == 0 else math.log(share)
)
migration_dataset.head()

In [None]:
# Per-row entropy term p * ln(p)
migration_dataset['p_ln_p'] = migration_dataset['p'].mul(migration_dataset['ln_p'])
migration_dataset.head()

In [None]:
# Shannon index per current state: negate the per-state sum of the
# p * ln(p) terms, then turn the state index into an ordinary column.
shannon_index = (
    migration_dataset['p_ln_p']
    .groupby(level=0)
    .sum()
    .mul(-1)
    .to_frame()
    .reset_index()
)
# Separate frame with the state column renamed to 'state' for export
migration_si = shannon_index.rename(columns={'Current state': 'state'})
migration_si.head()

In [None]:
# Write the per-state migration index to the working directory;
# index=False keeps the positional row labels out of the file.
migration_si.to_csv('migration_si.csv', index=False)

# Mother tongue diversity 

In [None]:
# Load the cleaned mother-tongue table (path is relative to the notebook's
# working directory) and preview the first rows.
mtd_dataset = pd.read_csv('./data/cleaned/mother_tongue_diversity_cleaned.csv')
mtd_dataset.head()

In [None]:
# List the column labels of the loaded mother-tongue table
mtd_dataset.columns

In [None]:
# Index rows by (area name, mother tongue) so later cells can group on the
# first level.
mtd_dataset = mtd_dataset.set_index(keys=['Area name', 'Mother tongue name'])
mtd_dataset.head()

In [None]:
# Give the head-count column the short label 'n'
mtd_dataset = mtd_dataset.rename(columns={'Persons': 'n'})
mtd_dataset.head()

In [None]:
# Head count per area: sum 'n' over the first index level ('Area name')
total_persons_pers_state = mtd_dataset.groupby(level=0)[['n']].sum()
total_persons_pers_state.head()

In [None]:
# Mother tongue proportion: each language's share of its area's head count.
# transform('sum') broadcasts the per-area total back onto every row, so the
# division is vectorised instead of being evaluated row by row via apply().
mtd_dataset['p'] = (
    mtd_dataset['n']
    / mtd_dataset.groupby(level=0)['n'].transform('sum')
)
mtd_dataset.head()

In [None]:
# Natural log of each proportion. math.log raises on 0, so a language with
# p == 0 is mapped to 0; its p * ln(p) term is then 0. This is the same guard
# the migration section uses.
mtd_dataset['ln_p'] = mtd_dataset['p'].apply(
    lambda value: math.log(value) if value != 0 else 0.0
)
mtd_dataset.head()

In [None]:
# Per-row entropy term p * ln(p)
mtd_dataset['p_ln_p'] = mtd_dataset['p'].mul(mtd_dataset['ln_p'])
mtd_dataset.head()

In [None]:
# Shannon index per area: negate the per-area sum of the p * ln(p) terms,
# then turn the area index into an ordinary column.
shannon_index = pd.DataFrame(mtd_dataset['p_ln_p'].groupby(level=0).sum() * -1)
shannon_index.reset_index(inplace=True)
mtd_si = shannon_index.copy()
# The first index level was built from 'Area name', so that is the label to
# rename. Renaming 'State' matched nothing and left the exported file without
# the 'state' column used by the other *_si.csv outputs.
mtd_si.rename({'Area name': 'state'}, inplace=True, axis=1)
mtd_si.head()

In [None]:
# Write the per-area mother-tongue index to the working directory;
# index=False keeps the positional row labels out of the file.
mtd_si.to_csv('mother_tongue_diversity_si.csv', index=False)

# Religion

In [2]:
# Load the cleaned religion table (one row per state/religion pair with a
# 'Persons' count) and preview the first rows.
religion_dataset = pd.read_csv('./data/cleaned/religion_cleaned.csv')
religion_dataset.head()

Unnamed: 0,State,Religion,Persons
0,JAMMU & KASHMIR,Hindu,3566674.0
1,JAMMU & KASHMIR,Islam/Muslim,8567485.0
2,JAMMU & KASHMIR,Christian,35631.0
3,JAMMU & KASHMIR,Sikh,234848.0
4,JAMMU & KASHMIR,Buddhist,112584.0


In [3]:
# List the column labels of the loaded religion table
religion_dataset.columns

Index(['State', 'Religion', 'Persons'], dtype='object')

In [4]:
# Index rows by (state, religion) so later cells can group on the first level.
religion_dataset = religion_dataset.set_index(keys=['State', 'Religion'])
religion_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Persons
State,Religion,Unnamed: 2_level_1
JAMMU & KASHMIR,Hindu,3566674.0
JAMMU & KASHMIR,Islam/Muslim,8567485.0
JAMMU & KASHMIR,Christian,35631.0
JAMMU & KASHMIR,Sikh,234848.0
JAMMU & KASHMIR,Buddhist,112584.0


In [5]:
# Give the head-count column the short label 'n'
religion_dataset = religion_dataset.rename(columns={'Persons': 'n'})
religion_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n
State,Religion,Unnamed: 2_level_1
JAMMU & KASHMIR,Hindu,3566674.0
JAMMU & KASHMIR,Islam/Muslim,8567485.0
JAMMU & KASHMIR,Christian,35631.0
JAMMU & KASHMIR,Sikh,234848.0
JAMMU & KASHMIR,Buddhist,112584.0


In [6]:
# Head count per state: sum 'n' over the first index level
total_persons_pers_state = religion_dataset.groupby(level=0)[['n']].sum()
total_persons_pers_state.head()

Unnamed: 0_level_0,n
State,Unnamed: 1_level_1
ANDAMAN & NICOBAR ISLANDS,379348.0
ANDHRA PRADESH,84167130.0
ARUNACHAL PRADESH,1014526.0
ASSAM,31127585.0
BIHAR,103833888.0


In [7]:
# Religion proportion: each religion's share of its state's head count.
# transform('sum') broadcasts the per-state total back onto every row, so the
# division is vectorised instead of being evaluated row by row via apply().
religion_dataset['p'] = (
    religion_dataset['n']
    / religion_dataset.groupby(level=0)['n'].transform('sum')
)
religion_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
State,Religion,Unnamed: 2_level_1,Unnamed: 3_level_1
JAMMU & KASHMIR,Hindu,3566674.0,0.284885
JAMMU & KASHMIR,Islam/Muslim,8567485.0,0.68432
JAMMU & KASHMIR,Christian,35631.0,0.002846
JAMMU & KASHMIR,Sikh,234848.0,0.018758
JAMMU & KASHMIR,Buddhist,112584.0,0.008993


In [8]:
# Natural log of each proportion. math.log raises on 0, so a religion with
# p == 0 is mapped to 0; its p * ln(p) term is then 0. This is the same guard
# the migration section uses.
religion_dataset['ln_p'] = religion_dataset['p'].apply(
    lambda value: math.log(value) if value != 0 else 0.0
)
religion_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p,ln_p
State,Religion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAMMU & KASHMIR,Hindu,3566674.0,0.284885,-1.255671
JAMMU & KASHMIR,Islam/Muslim,8567485.0,0.68432,-0.37933
JAMMU & KASHMIR,Christian,35631.0,0.002846,-5.861844
JAMMU & KASHMIR,Sikh,234848.0,0.018758,-3.976121
JAMMU & KASHMIR,Buddhist,112584.0,0.008993,-4.71136


In [9]:
# Per-row entropy term p * ln(p)
religion_dataset['p_ln_p'] = religion_dataset['p'].mul(religion_dataset['ln_p'])
religion_dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p,ln_p,p_ln_p
State,Religion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JAMMU & KASHMIR,Hindu,3566674.0,0.284885,-1.255671,-0.357721
JAMMU & KASHMIR,Islam/Muslim,8567485.0,0.68432,-0.37933,-0.259583
JAMMU & KASHMIR,Christian,35631.0,0.002846,-5.861844,-0.016683
JAMMU & KASHMIR,Sikh,234848.0,0.018758,-3.976121,-0.074585
JAMMU & KASHMIR,Buddhist,112584.0,0.008993,-4.71136,-0.042367


In [10]:
# Shannon index per state: negate the per-state sum of the p * ln(p) terms,
# then turn the state index into an ordinary column.
shannon_index = (
    religion_dataset['p_ln_p']
    .groupby(level=0)
    .sum()
    .mul(-1)
    .to_frame()
    .reset_index()
)
# Separate frame with the state column lower-cased for export
religion_si = shannon_index.rename(columns={'State': 'state'})
religion_si.head()

Unnamed: 0,state,p_ln_p
0,ANDAMAN & NICOBAR ISLANDS,0.817929
1,ANDHRA PRADESH,0.399206
2,ARUNACHAL PRADESH,1.146343
3,ASSAM,0.810371
4,BIHAR,0.469847


In [11]:
# Write the per-state religion index to the working directory;
# index=False keeps the positional row labels out of the file.
religion_si.to_csv('religion_si.csv', index=False)