In [1]:
# importing libraries
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Read the four per-state SDI tables (migration, mother tongue, occupation, religion)
migration_dataset, mtd_dataset, occupation_dataset, religion_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/SDI/migration_sdi.csv",
        "./data/SDI/mother_tongue_diversity_sdi.csv",
        "./data/SDI/occupation_sdi.csv",
        "./data/SDI/religion_sdi.csv",
    )
)

In [3]:
# Trim every table down to its state name and its own SDI column
migration_dataset = migration_dataset.loc[:, ['state', 'migration_SDI']]
mtd_dataset = mtd_dataset.loc[:, ['state', 'mtd_SDI']]
occupation_dataset = occupation_dataset.loc[:, ['state', 'occupation_SDI']]
religion_dataset = religion_dataset.loc[:, ['state', 'religion_SDI']]

In [4]:
# checking the shape of each dataset
# (rows, columns) of the migration table after trimming to state + SDI
migration_dataset.shape

(35, 2)

In [5]:
# (rows, columns) of the mother-tongue-diversity table
mtd_dataset.shape

(35, 2)

In [6]:
# (rows, columns) of the occupation table
occupation_dataset.shape

(35, 2)

In [7]:
# (rows, columns) of the religion table
religion_dataset.shape

(35, 2)

In [8]:
# Combine the four per-state index tables into a single frame.
#
# The tables are joined on the 'state' column rather than copied across by
# row position: each CSV is loaded independently, so nothing guarantees the
# states appear in the same order in every file, and a positional copy would
# silently attach an index value to the wrong state.
#
# - how='left' keeps exactly the states (and row order) of the occupation
#   table; a state missing from another table shows up as NaN in that column.
# - validate='one_to_one' raises if a state name is duplicated in any table,
#   instead of silently multiplying rows.
sdi_table = (
    occupation_dataset[['state', 'occupation_SDI']]
    .merge(religion_dataset[['state', 'religion_SDI']],
           on='state', how='left', validate='one_to_one')
    .merge(mtd_dataset[['state', 'mtd_SDI']],
           on='state', how='left', validate='one_to_one')
    .merge(migration_dataset[['state', 'migration_SDI']],
           on='state', how='left', validate='one_to_one')
)
sdi_table.head()

Unnamed: 0,state,occupation_SDI,religion_SDI,mtd_SDI,migration_SDI
0,ANDAMAN & NICOBAR ISLANDS,0.94612,0.461707,0.825498,0.843011
1,ANDHRA PRADESH,0.945958,0.200286,0.29216,0.844103
2,ARUNACHAL PRADESH,0.929474,0.64626,0.849312,0.606677
3,ASSAM,0.929643,0.501187,0.670387,0.873683
4,BIHAR,0.906204,0.284158,0.375566,0.791525


In [None]:
# cultural diversity of a state = row-wise mean of its four diversity indices
sdi_columns = ['occupation_SDI', 'migration_SDI', 'mtd_SDI', 'religion_SDI']
sdi_table['cultural_diversity'] = sdi_table[sdi_columns].mean(axis='columns')
sdi_table.head()

In [None]:
# write the combined table to CSV, leaving out the row index
sdi_table.to_csv(path_or_buf='cultural_diversity_sdi.csv', index=False)

## Clustering states based on diversity indices

In [None]:
# PCA object
# Two components are kept; they become the 'x' and 'y' columns that the
# scatter plots and the K-Means clustering further down operate on.
pca = PCA(n_components=2)

In [None]:
# Transformation: project the four diversity indices onto the two principal
# components. The raw (unscaled) index columns are passed to PCA.
#
# Disabled alternative, kept for reference: standardize the columns first.
# components = pca.fit_transform(
#     StandardScaler().fit_transform(
#         sdi_table[['occupation_SDI', 'migration_SDI', 'mtd_SDI', 'religion_SDI']]
#     )
# )
# data = pd.DataFrame(components, columns=['x', 'y'])

index_values = sdi_table[['occupation_SDI', 'migration_SDI', 'mtd_SDI', 'religion_SDI']]
components = pca.fit_transform(index_values)
# one row per state, columns 'x' and 'y' holding the two component scores
data = pd.DataFrame(data=components, columns=['x', 'y'])

In [None]:
# scatter of the rows of `data` in the 2-D PCA space
pca_fig, pca_axes = plt.subplots(figsize=(16, 8))
sns.scatterplot(x='x', y='y', data=data, ax=pca_axes)

In [None]:
# Finding the optimal cluster count (elbow method):
# fit K-Means for k = 1..10 on the PCA components and record each fit's
# inertia (within-cluster sum of squared distances).
cluster_counts = range(1, 11)
inertias = []
for k in cluster_counts:
    # random_state and n_init are pinned so the curve is identical on every
    # run and does not depend on the scikit-learn version's n_init default.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(components)
    inertias.append(model.inertia_)

plt.figure(figsize=(16, 8))
# Plot against the real k values: plt.plot(inertias) alone would number the
# x-axis 0..9, making the elbow read one cluster too low.
plt.plot(cluster_counts, inertias, marker='o')
plt.xticks(cluster_counts)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")

In [None]:
# Clustering based on the above result: K-Means with 4 clusters on the PCA
# components. random_state and n_init are pinned so that the cluster labels
# (and therefore the colors and classes used in the plots below) are the
# same on every run.
cluster = KMeans(n_clusters=4, n_init=10, random_state=42)
# fit_transform returns each sample's distance to every cluster center
output = cluster.fit_transform(components)
# cluster label of each row, aligned with the rows of `data`
data['class'] = cluster.labels_

In [None]:
# Visualizing clusters: the PCA scatter again, colored by K-Means class
plt.figure(figsize=(16, 8))
# the palette lists one color per class (4 clusters)
sns.scatterplot(data=data, x='x', y='y', hue='class', palette=['red', 'green', 'black', 'yellow'])
plt.title("Clustering of states based on their index")

In [None]:
# Visualizing classes on a bar chart:
# attach the cluster label of each row to a copy of the SDI table
temp_data = sdi_table.assign(**{'class': data['class']})
temp_data.head()

In [None]:
# Bar chart of cultural diversity per state, colored by cluster class
plt.figure(figsize=(16, 8))
# rotate the x tick labels by 90 degrees
# NOTE(review): rotation is set before the bars are drawn; keep this order
# unless the rendered figure is re-checked.
plt.xticks(rotation=90)
sns.barplot(data=temp_data, x='state', y='cultural_diversity', hue='class')

# Future work - Cultural diversity as a PCA of other diversity indices (dropped for now)