In [3]:
# importing required libraries
import pandas as pd
import numpy as np

# Occupation

In [None]:
# Load the raw occupation table from the local data directory and preview the first rows
occupation_dataset = pd.read_csv("./data/occupation.csv")
occupation_dataset.head()

In [None]:
# (rows, columns) of the occupation data as loaded, before any filtering
occupation_dataset.shape

In [None]:
# Keep only the rows whose 'Total' column holds the literal value 'Total',
# renumber the index from 0 and preview the result
occupation_dataset = (
    occupation_dataset
    .loc[occupation_dataset['Total'].eq('Total')]
    .reset_index(drop=True)
)
occupation_dataset.head()

In [None]:
# (rows, columns) remaining after keeping only the 'Total' rows
occupation_dataset.shape

In [None]:
# The filter on 'Total' == 'Total' leaves this column constant: print its unique
# values to confirm, then delete the column from the frame in place
print(occupation_dataset['Total'].unique())

del occupation_dataset['Total']

occupation_dataset.head()

In [None]:
# Distinct values of the State column, inspected before the prefix is stripped
occupation_dataset.State.unique()

In [None]:
# Removing the 'STATE - ' prefix (and surrounding whitespace) from the State column.
# The value is split once on ' - ' and the last piece is kept, so values that carry no
# separator (including missing values) pass through unchanged; re-running this cell is
# therefore harmless instead of raising IndexError.
occupation_dataset['State'] = (
    occupation_dataset['State'].str.strip().str.split(' - ', n=1).str[-1]
)
occupation_dataset.head()

In [None]:
# Distinct State values after cleaning; the .str.strip() in the prefix-removal step
# also removed leading/trailing spaces
occupation_dataset.State.unique()

In [None]:
# Drop the rows labelled 'TOTAL' in 'Occupation classification' and renumber the index
occupation_dataset = (
    occupation_dataset
    .loc[occupation_dataset['Occupation classification'].ne('TOTAL')]
    .reset_index(drop=True)
)
occupation_dataset.head()

In [None]:
# Sanity check: should display False once the 'TOTAL' rows have been filtered out
'TOTAL' in occupation_dataset['Occupation classification'].unique()

In [None]:
# Write the cleaned table to the working directory; index=False leaves the row index out of the file
occupation_dataset.to_csv("occupation_cleaned.csv", index=False)

# Mother tongue diversity

In [None]:
# Load the raw mother tongue diversity table (mtd -> mother tongue diversity) and preview it
mtd_dataset = pd.read_csv("./data/mother tongue diversity.csv")
mtd_dataset.head()

In [None]:
# (rows, columns) of the mother tongue data as loaded, before any filtering
mtd_dataset.shape

In [None]:
# Selecting only the main languages: they are the names written entirely in capitals.
# Names containing "OTHERS" are dropped as well, and so is any non-string entry
# (e.g. a missing value), which previously made the filter raise a TypeError.
languages = mtd_dataset['Mother tongue name'].values

is_main_language = [
    isinstance(language, str) and language.isupper() and "OTHERS" not in language
    for language in languages
]

mtd_dataset = mtd_dataset[is_main_language].reset_index(drop=True)
mtd_dataset.head()

In [None]:
# Removing the leading number from the language name.
# Only a numeric prefix followed by whitespace is stripped, so everything after the number
# is kept (taking just the second space-separated token would cut multi-word names short)
# and names without a prefix are left untouched, which also makes the cell safe to re-run.
mtd_dataset['Mother tongue name'] = (
    mtd_dataset['Mother tongue name']
    .str.strip()
    .str.replace(r'^\d+\s+', '', regex=True)
)

mtd_dataset.head()

In [None]:
# (rows, columns) after keeping only the main languages
mtd_dataset.shape

In [None]:
# Distinct language names left after cleaning
mtd_dataset['Mother tongue name'].unique()

In [None]:
# Write the cleaned table to the working directory without the row index
mtd_dataset.to_csv("mother_tongue_diversity_cleaned.csv", index=False)

# Migration

In [None]:
# Load the raw migration table from the local data directory and preview the first rows
migration_dataset = pd.read_csv('./data/migration.csv')
migration_dataset.head()

In [None]:
# preview of the first rows (same preview as in the load cell; the shape is shown in a later cell)
migration_dataset.head()

In [None]:
# Column names of the migration table
migration_dataset.columns

In [None]:
# (rows, columns) of the migration data as loaded, before any filtering
migration_dataset.shape

In [None]:
# Keep a record only when neither 'Previous residence (total)' nor
# 'Current state (total)' is one of 'Rural' / 'Urban'
migration_dataset = migration_dataset[
    ~migration_dataset['Previous residence (total)'].isin(('Rural', 'Urban'))
    & ~migration_dataset['Current state (total)'].isin(('Rural', 'Urban'))
]

migration_dataset.head()

In [None]:
# (rows, columns) after dropping the Rural/Urban records
migration_dataset.shape

In [None]:
# Values left in 'Previous residence (total)' after dropping the Rural/Urban records
migration_dataset['Previous residence (total)'].unique()

In [None]:
# Values left in 'Current state (total)' after dropping the Rural/Urban records
migration_dataset['Current state (total)'].unique()

In [None]:
# Distinct 'Previous residence' values, inspected to choose the categories removed next
migration_dataset['Previous residence'].unique()

In [None]:
# 'Previous residence' labels whose rows are dropped from the table
categories_to_be_removed = [
    'Total',
    'Last residence within India',
    'Within the state of enumeration but outside the place of enumeration',
    'Elsewhere in the district of enumeration',
    'In other districts of the state of enumeration',
    'States in India beyond the state of enumeration',
]

# Keep every row whose 'Previous residence' is not one of the labels above
migration_dataset = migration_dataset.loc[
    lambda frame: ~frame['Previous residence'].isin(categories_to_be_removed)
]
migration_dataset.head()

In [None]:
# (rows, columns) after removing the first set of 'Previous residence' categories
migration_dataset.shape

In [None]:
# Distinct 'Previous residence' values remaining after the removal above
migration_dataset['Previous residence'].unique()

In [None]:
# Moving country names from Previous residence (total) to Previous residence
def replace_with_country_name(row):
    """Return `row` with 'Previous residence' taken from 'Previous residence (total)'.

    'Total' is the normal value of 'Previous residence (total)'. Any other value is
    written into 'Previous residence' on a copy of the row, and that copy is returned;
    the row passed in is never modified. Rows whose 'Previous residence (total)' is
    'Total' are returned as they are.
    """
    total_label = row['Previous residence (total)']

    # Nothing to move for the regular rows
    if total_label == 'Total':
        return row

    updated_row = row.copy()
    updated_row['Previous residence'] = total_label
    return updated_row

# Apply the replacement row by row (axis=1 hands each row to the function) and preview the result
migration_dataset = migration_dataset.apply(replace_with_country_name, axis=1)
migration_dataset.head()

In [None]:
# Distinct 'Previous residence' values after copying over the non-'Total' labels
migration_dataset['Previous residence'].unique()

In [None]:
# Second set of 'Previous residence' labels whose rows are dropped from the table
categories2_to_be_removed = [
    'Last residence outside India',
    'Countries in Asia beyond India',
    'Countries in Africa',
    'Countries in the Americas',
    'Countries in Oceania',
    'Unclassifiable',
    'Elsewhere',
]

# Keep every row whose 'Previous residence' is not one of the labels above
migration_dataset = migration_dataset.loc[
    lambda frame: ~frame['Previous residence'].isin(categories2_to_be_removed)
]
migration_dataset.head()

In [None]:
# (rows, columns) after removing the second set of 'Previous residence' categories
migration_dataset.shape

In [None]:
# Distinct 'Previous residence' values remaining after both removals
migration_dataset['Previous residence'].unique()

In [None]:
# Removing the 'State - ' prefix (and surrounding whitespace) from 'Current state'.
# The value is split once on ' - ' and the last piece is kept, so values that carry no
# separator (including missing values) pass through unchanged; re-running this cell is
# therefore harmless instead of raising IndexError.
migration_dataset['Current state'] = (
    migration_dataset['Current state'].str.strip().str.split(' - ', n=1).str[-1]
)
migration_dataset.head()

In [None]:
# Removing the trailing ' (...' part from 'Current state': only the text before the
# first ' (' is kept.
# The pattern is written as a raw string because '\(' inside a normal string literal is
# an invalid escape sequence that Python warns about; the pattern itself is unchanged.
migration_dataset['Current state'] = (
    migration_dataset['Current state'].str.strip().str.split(r' \(').str[0]
)
migration_dataset.head()

In [None]:
# Distinct 'Current state' values after stripping the prefix and the ' (' suffix
migration_dataset['Current state'].unique()

In [None]:
# Removing the commas from Persons and converting the column to integer.
# NOTE(review): this uses the .str accessor, so it presumably fails if the cell is run
# again once the column is already numeric - confirm before re-running
migration_dataset['Persons'] = migration_dataset['Persons'].str.replace(",", "").astype("int")
migration_dataset.head()

In [None]:
# Removing the two '(total)' helper columns in place; deleting a column that is
# already gone raises KeyError, so this cell can only run once per load
del migration_dataset['Previous residence (total)']
del migration_dataset['Current state (total)']
migration_dataset.head()

In [None]:
# Write the cleaned table to the working directory without the row index
migration_dataset.to_csv("migration_cleaned.csv", index=False)

# Multilingualism

In [None]:
# Load the raw multilingualism table from the local data directory and preview the first rows
multi_lingual_dataset = pd.read_csv(
    "./data/multilingualism.csv",
)
multi_lingual_dataset.head()

In [None]:
# (rows, columns) of the multilingualism data as loaded
multi_lingual_dataset.shape

In [None]:
# Replacing every missing value in these columns with the closest value above it,
# then storing each filled value as stripped text.
# ffill works column by column, so a gap at the very top of a column stays missing
# rather than borrowing the last value of the previously processed column (or raising
# NameError for the first column); na_action='ignore' keeps such gaps as missing values.
for col in ['Mother tongue', 'mPersons', 'Second language', 'sPersons']:
    filled = multi_lingual_dataset[col].ffill()
    multi_lingual_dataset[col] = filled.map(
        lambda value: str(value).strip(), na_action='ignore'
    )

multi_lingual_dataset.head()

In [None]:
# Use ('Mother tongue', 'Second language') as a two-level index, then display the frame
multi_lingual_dataset = multi_lingual_dataset.set_index(['Mother tongue', 'Second language'])
multi_lingual_dataset

In [None]:
# Remove every row labelled 'OTHERS' on index level 0 ('Mother tongue')
# or on index level 1 ('Second language')
multi_lingual_dataset = (
    multi_lingual_dataset
    .drop('OTHERS', level=0)
    .drop('OTHERS', level=1)
)
multi_lingual_dataset

In [None]:
# Also drop the rows whose 'Third language' is 'OTHERS'.
# NOTE(review): unlike the other language columns, 'Third language' is not stripped
# earlier in this notebook - confirm the raw values carry no surrounding spaces
multi_lingual_dataset = multi_lingual_dataset[multi_lingual_dataset['Third language'] != 'OTHERS']
multi_lingual_dataset

In [None]:
# Count of missing (True) versus present (False) values in 'Third language'
multi_lingual_dataset['Third language'].isna().value_counts()

In [None]:
# Drop, in place, every row that has a missing value in any column
multi_lingual_dataset.dropna(inplace=True)

In [None]:
# Same count again: after dropna only False (no missing values) should remain
multi_lingual_dataset['Third language'].isna().value_counts()

In [None]:
# Save the cleaned table; the index is written as well, so the
# ('Mother tongue', 'Second language') levels end up in the file
multi_lingual_dataset.to_csv("multilingualism_cleaned.csv")

In [None]:
# Shell escape that runs `open .` on the working directory.
# NOTE(review): `open` is presumably the macOS command, so this cell likely fails on other
# platforms; it is not needed for the cleaning itself - consider removing it
!open .

# Religion

In [None]:
# Load the religion table and preview the first rows.
# NOTE(review): the 'Religion - 2011' section below reads this same path and its recorded
# output shows the columns State / Religion / Total / Persons, whereas this section uses
# 'Religious Community' and 'Religion Code' - confirm which file this section expects
religion_dataset = pd.read_csv('./data/religion.csv')
religion_dataset.head()

In [None]:
# (rows, columns) of the religion data as loaded, before any filtering
religion_dataset.shape

In [None]:
# Dropping the rows whose 'Religious Community' is 'All Religious Communities'
religion_dataset = religion_dataset[religion_dataset['Religious Community'] != 'All Religious Communities']
religion_dataset.head()

In [None]:
# (rows, columns) after dropping the 'All Religious Communities' rows
religion_dataset.shape

In [None]:
# Keep only the rows whose 'Total' column holds the literal value 'Total'
religion_dataset = religion_dataset[religion_dataset['Total'] == 'Total']
religion_dataset.head()

In [None]:
# (rows, columns) after keeping only the 'Total' rows
religion_dataset.shape

In [None]:
# Distinct religion codes, sorted in place for inspection.
# NOTE(review): the original note says the codes of the major religions are single
# digits - confirm against the displayed values
religion_codes = religion_dataset['Religion Code'].unique()
religion_codes.sort()
religion_codes

In [None]:
# Selecting only major religions: keep the rows whose code is 1 to 6 (range(1, 7) excludes 7)
religion_dataset = religion_dataset[religion_dataset['Religion Code'].isin(range(1, 7))]
religion_dataset.head()

In [None]:
# (rows, columns) after keeping only the religion codes 1 to 6
religion_dataset.shape

In [None]:
# Removing the Total and Religion Code columns in place (both were only needed for the filters above)
del religion_dataset['Total']
del religion_dataset['Religion Code']
religion_dataset.head()

In [None]:
# Changing DELHI to NCT OF DELHI in the State column.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# Series returned by religion_dataset['State'] is chained assignment, which can act on a
# temporary object and leave the frame unchanged (always so under pandas Copy-on-Write).
religion_dataset['State'] = religion_dataset['State'].replace("DELHI", "NCT OF DELHI")

In [None]:
# Add a lowercase 'state' column holding the same values as 'State'; both columns are kept.
# NOTE(review): presumably for a consumer that expects the lowercase name - confirm it is needed
religion_dataset['state'] = religion_dataset['State']

In [None]:
# Distinct religious communities left after the filters above
religion_dataset['Religious Community'].unique()

# Religion - 2011

In [4]:
# Load the religion table from the local data directory and preview the first rows
religion = pd.read_csv('./data/religion.csv')
religion.head()

Unnamed: 0,State,Religion,Total,Persons
0,,,,
1,State - JAMMU & KASHMIR,All Religious Community:,Total,12541302.0
2,State - JAMMU & KASHMIR,All Religious Community:,Rural,9108060.0
3,State - JAMMU & KASHMIR,All Religious Community:,Urban,3433242.0
4,State - JAMMU & KASHMIR,Religion:Hindu,Total,3566674.0


In [5]:
# (rows, columns) of the religion data as loaded, before any filtering
religion.shape

(2911, 4)

In [6]:
# Dropping the rows whose 'Religion' is 'All Religious Community:'
religion = religion[religion['Religion'] != 'All Religious Community:']
religion.head()

Unnamed: 0,State,Religion,Total,Persons
0,,,,
4,State - JAMMU & KASHMIR,Religion:Hindu,Total,3566674.0
5,State - JAMMU & KASHMIR,Religion:Hindu,Rural,2516370.0
6,State - JAMMU & KASHMIR,Religion:Hindu,Urban,1050304.0
7,State - JAMMU & KASHMIR,Sect:Hindu,Total,3566520.0


In [7]:
# (rows, columns) after dropping the 'All Religious Community:' rows
religion.shape

(2806, 4)

In [8]:
# Keep only the rows whose 'Total' column holds the literal value 'Total'
# (the Rural and Urban rows, and rows with a missing 'Total', are dropped)
religion = religion[religion['Total'] == 'Total']
religion.head()

Unnamed: 0,State,Religion,Total,Persons
4,State - JAMMU & KASHMIR,Religion:Hindu,Total,3566674.0
7,State - JAMMU & KASHMIR,Sect:Hindu,Total,3566520.0
10,State - JAMMU & KASHMIR,Sect:Bairagi,Total,1.0
13,State - JAMMU & KASHMIR,Sect:Balmiki / Walmiki / Valmiki,Total,41.0
16,State - JAMMU & KASHMIR,Sect:Kabir Panthi,Total,104.0


In [9]:
# Filtering all main religions: keep only the rows labelled 'Religion:<name>'.
# na=False treats a missing label as "no match" instead of breaking the boolean mask,
# and .copy() makes the filtered frame independent before its columns are reassigned.
religion = religion[religion['Religion'].str.contains('Religion:', regex=False, na=False)].copy()

# Strip the 'Religion:' and 'State - ' prefixes. Splitting once and keeping the last piece
# leaves values without the prefix unchanged, so re-running the cell is harmless.
religion['Religion'] = religion['Religion'].str.split('Religion:', n=1).str[-1]
religion['State'] = religion['State'].str.strip().str.split('State - ', n=1).str[-1]
religion.head()

Unnamed: 0,State,Religion,Total,Persons
4,JAMMU & KASHMIR,Hindu,Total,3566674.0
22,JAMMU & KASHMIR,Islam/Muslim,Total,8567485.0
34,JAMMU & KASHMIR,Christian,Total,35631.0
43,JAMMU & KASHMIR,Sikh,Total,234848.0
49,JAMMU & KASHMIR,Buddhist,Total,112584.0


In [10]:
# 'Total' holds only the value 'Total' after the earlier filter, so remove the column in place
del religion['Total']

In [11]:
# Preview of the cleaned table: State, Religion and Persons remain
religion.head()

Unnamed: 0,State,Religion,Persons
4,JAMMU & KASHMIR,Hindu,3566674.0
22,JAMMU & KASHMIR,Islam/Muslim,8567485.0
34,JAMMU & KASHMIR,Christian,35631.0
43,JAMMU & KASHMIR,Sikh,234848.0
49,JAMMU & KASHMIR,Buddhist,112584.0


In [14]:
# Write the cleaned table to the working directory without the row index
religion.to_csv('religion_cleaned.csv', index=False)