# Libraries

In [1]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv

# CSV File

In [2]:
#reading csv file
with open('../Data/BASIC_HABITATION_INFORMATION_AS_ON_1_APR_12.csv',"r") as f:
    reader = csv.reader(f,delimiter = ",")
    print(reader)
    data = list(reader)
    row_count = len(data)

column_name = data.pop(0)
column_name.insert(17,"temp")
df=pd.DataFrame(data,columns=column_name)
df.drop(columns=['temp'], axis = 1, inplace = True)

<_csv.reader object at 0x000001E6D4C22B38>


# Data Preprocessing

In [3]:
#noise removal(removal of data objects with out of format values(time format) for village names)
patterndel = " AM$| PM$|^[0-9]$"
name_remove =' LADE"$'
filter1 = df['Village Name'].str.contains(patterndel)
df = df[~filter1]
filter2 = df['SC Current Population'].str.contains(name_remove)
df = df[~filter2]

In [4]:
#dimensionality reduction
df.drop(['Year', 'Status', 'SC Concentrated', 'ST Concentrated'], axis = 1, inplace = True)

In [5]:
#converting all attributes to float value
df['SC Current Population']=df['SC Current Population'].astype(str).astype(float)
df['ST Current Population']=df['ST Current Population'].astype(str).astype(float)
df['GENERAL Current Population']=df['GENERAL Current Population'].astype(str).astype(float)
df['SC Covered Population']=df['SC Covered Population'].astype(str).astype(float)
df['ST Covered Population']=df['ST Covered Population'].astype(str).astype(float)
df['GENERAL Covered Population']=df['GENERAL Covered Population'].astype(str).astype(float)

In [6]:
#data cleaning
df.loc[df['GENERAL Current Population'] < 0, 'GENERAL Current Population'] = 0
df.loc[df['GENERAL Covered Population'] < 0, 'GENERAL Covered Population'] = 0
df.loc[df['SC Current Population'] < 0, 'SC Current Population'] = 0
df.loc[df['SC Covered Population'] < 0, 'SC Covered Population'] = 0
df.loc[df['ST Current Population'] < 0, 'ST Current Population'] = 0
df.loc[df['ST Covered Population'] < 0, 'ST Covered Population'] = 0
df.loc[df['ST Current Population'] < df['ST Covered Population'], 'ST Current Population'] = df['ST Covered Population']
df.loc[df['SC Current Population'] < df['SC Covered Population'], 'SC Current Population'] = df['SC Covered Population']
df.loc[df['GENERAL Current Population'] < df['GENERAL Covered Population'], 'GENERAL Current Population'] = df['GENERAL Covered Population']

In [7]:
#Feature Construction(Minority Covered Population, Minority Current Population, Total Current Population and Total Covered Population)
df['Minority Covered Population'] = df['SC Covered Population'] + df['ST Covered Population']
df['Minority Current Population'] = df['SC Current Population'] + df['ST Current Population']
df.drop(['SC Covered Population', 'SC Current Population', 'ST Covered Population', 'ST Current Population'],axis = 1, inplace = True)

add = pd.Series([df['GENERAL Current Population'], df['Minority Current Population']])
df['Total Current Population'] = add.sum()

add = pd.Series([df['GENERAL Covered Population'], df['Minority Covered Population']])
df['Total Covered Population'] = add.sum()

In [8]:
#clubbing values grouped by State Name and District Names
df2 = df.groupby(['State Name'],as_index=False).agg({'GENERAL Current Population': 'sum', 'GENERAL Covered Population': 'sum', 'Minority Current Population': 'sum', 'Minority Covered Population': 'sum', 'Total Current Population': 'sum', 'Total Covered Population': 'sum'})
df3 = df.groupby(['State Name', 'District Name'],as_index=False).agg({'GENERAL Current Population': 'sum', 'GENERAL Covered Population': 'sum', 'Minority Current Population': 'sum', 'Minority Covered Population': 'sum', 'Total Current Population': 'sum', 'Total Covered Population': 'sum'})

In [9]:
#manually filling values for missing states and UTs
telangana = pd.DataFrame(df2[df2['State Name'] == 'ANDHRA PRADESH'])
telangana['State Name'] = 'telangana'
delhi = pd.DataFrame(df2[df2['State Name'] == 'HARYANA'])
delhi['State Name'] = 'delhi ncr'

In [10]:
#concatenating missing values
frames = [df2, telangana, delhi]
df2 = pd.concat(frames)

In [11]:
#calculating ratios of required values for statewise visualisations
df2['gcg'] = df2['GENERAL Covered Population']/df2['GENERAL Current Population']
df2['mcm'] = df2['Minority Covered Population']/df2['Minority Current Population']
df2['tct'] = df2['Total Covered Population']/df2['Total Current Population']

#calculating ratios of required values for districtwise visualisations
df3['gcg'] = df3['GENERAL Covered Population']/df3['GENERAL Current Population']
df3['mcm'] = df3['Minority Covered Population']/df3['Minority Current Population']
df3['tct'] = df3['Total Covered Population']/df3['Total Current Population']

# Exporting data for Visualisations

In [12]:
#exporting dataframes for visualisations
df2.to_csv(r'../Data/States.csv', index = False)
df3.to_csv(r'../Data/Districts.csv', index = False)

# Data Analysis 

# Clustering

In [13]:
#creating dataframes for analysis
mi_df = df3[['State Name', 'District Name', 'Minority Covered Population', 'Minority Current Population', 'Total Covered Population', 'Total Current Population']]
ge_df = df3[['State Name', 'District Name', 'GENERAL Covered Population', 'GENERAL Current Population', 'Total Covered Population', 'Total Current Population']]
to_df = df3[['State Name', 'District Name','Total Covered Population', 'Total Current Population']]

mi_df['mi_conc'] = mi_df['Minority Current Population']/mi_df['Total Current Population']
mi_df['mi_cove'] = mi_df['Minority Covered Population']/mi_df['Minority Current Population']
ge_df['ge_conc'] = 1 - mi_df['mi_conc']
ge_df['ge_cove'] = ge_df['GENERAL Covered Population']/ge_df['GENERAL Current Population']
to_df['to_cove'] = to_df['Total Covered Population']/to_df['Total Current Population']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-d

In [14]:
#Applying k-means for minority population
points = mi_df[['mi_cove', 'State Name','District Name']]

centroids = np.asarray([0, 0.4, 0.7])
new_cen = np.asarray([0, 0, 0])

col_names =  ['mi_cove', 'State Name', 'District Name']
mi_c1 = pd.DataFrame(columns = col_names)
mi_c2 = pd.DataFrame(columns = col_names)
mi_c3 = pd.DataFrame(columns = col_names)
for i in range(points['mi_cove'].count()):
    temp = centroids
    temp = abs(temp - points.iloc[i]['mi_cove'])
    val = min(temp)
    if val == abs(centroids[0] - points.iloc[i]['mi_cove']):
        mi_c1 = mi_c1.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[1] - points.iloc[i]['mi_cove']):
        mi_c2 = mi_c2.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[2] - points.iloc[i]['mi_cove']):
        mi_c3 = mi_c3.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
v1 = (sum(np.asarray(mi_c1['mi_cove']))) / len(np.asarray(mi_c1['mi_cove']))
v2 = (sum(np.asarray(mi_c2['mi_cove']))) / len(np.asarray(mi_c2['mi_cove']))
v3 = (sum(np.asarray(mi_c3['mi_cove']))) / len(np.asarray(mi_c3['mi_cove']))
new_cen = np.asarray([v1,v2,v3])

In [15]:
var =0
while not (new_cen == centroids).all():
    centroids = new_cen
    col_names =  ['mi_cove', 'State Name', 'District Name']
    mi_c1 = pd.DataFrame(columns = col_names)
    mi_c2 = pd.DataFrame(columns = col_names)
    mi_c3 = pd.DataFrame(columns = col_names)
    for i in range(points['mi_cove'].count()):
        temp = centroids
        temp = abs(temp - points.iloc[i]['mi_cove'])
        val = min(temp)
        if val == abs(centroids[0] - points.iloc[i]['mi_cove']):
            mi_c1 = mi_c1.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[1] - points.iloc[i]['mi_cove']):
            mi_c2 = mi_c2.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[2] - points.iloc[i]['mi_cove']):
            mi_c3 = mi_c3.append({'mi_cove': points.iloc[i]['mi_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    v1 = (sum(np.asarray(mi_c1['mi_cove']))) / len(np.asarray(mi_c1['mi_cove']))
    v2 = (sum(np.asarray(mi_c2['mi_cove']))) / len(np.asarray(mi_c2['mi_cove']))
    v3 = (sum(np.asarray(mi_c3['mi_cove']))) / len(np.asarray(mi_c3['mi_cove']))
    new_cen = np.asarray([v1,v2,v3])
    var += 1
    
new_cen

array([0.40818009, 0.74441645, 0.9459526 ])

In [16]:
#Applying k-means for genenral population
points = ge_df[['ge_cove', 'State Name','District Name']]

centroids = np.asarray([0, 0.4, 0.7])
new_cen = np.asarray([0, 0, 0])

col_names =  ['ge_cove','State Name', 'District Name']
ge_c1 = pd.DataFrame(columns = col_names)
ge_c2 = pd.DataFrame(columns = col_names)
ge_c3 = pd.DataFrame(columns = col_names)
for i in range(points['ge_cove'].count()):
    temp = centroids
    temp = abs(temp - points.iloc[i]['ge_cove'])
    val = min(temp)
    if val == abs(centroids[0] - points.iloc[i]['ge_cove']):
        ge_c1 = ge_c1.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[1] - points.iloc[i]['ge_cove']):
        ge_c2 = ge_c2.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[2] - points.iloc[i]['ge_cove']):
        ge_c3 = ge_c3.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
v1 = (sum(np.asarray(ge_c1['ge_cove']))) / len(np.asarray(ge_c1['ge_cove']))
v2 = (sum(np.asarray(ge_c2['ge_cove']))) / len(np.asarray(ge_c2['ge_cove']))
v3 = (sum(np.asarray(ge_c3['ge_cove']))) / len(np.asarray(ge_c3['ge_cove']))
new_cen = np.asarray([v1,v2,v3])

In [17]:
var =0
while not (new_cen == centroids).all():
    centroids = new_cen
    col_names =  ['ge_cove', 'State Name', 'District Name']
    ge_c1 = pd.DataFrame(columns = col_names)
    ge_c2 = pd.DataFrame(columns = col_names)
    ge_c3 = pd.DataFrame(columns = col_names)
    for i in range(points['ge_cove'].count()):
        temp = centroids
        temp = abs(temp - points.iloc[i]['ge_cove'])
        val = min(temp)
        if val == abs(centroids[0] - points.iloc[i]['ge_cove']):
            ge_c1 = ge_c1.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[1] - points.iloc[i]['ge_cove']):
            ge_c2 = ge_c2.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[2] - points.iloc[i]['ge_cove']):
            ge_c3 = ge_c3.append({'ge_cove': points.iloc[i]['ge_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    v1 = (sum(np.asarray(ge_c1['ge_cove']))) / len(np.asarray(ge_c1['ge_cove']))
    v2 = (sum(np.asarray(ge_c2['ge_cove']))) / len(np.asarray(ge_c2['ge_cove']))
    v3 = (sum(np.asarray(ge_c3['ge_cove']))) / len(np.asarray(ge_c3['ge_cove']))
    new_cen = np.asarray([v1,v2,v3])
    var += 1
    
new_cen

array([0.29885478, 0.72728295, 0.94431376])

In [18]:
#Applying k-means for total population
points = to_df[['to_cove', 'State Name','District Name']]

centroids = np.asarray([0, 0.4, 0.7])
new_cen = np.asarray([0, 0, 0])

col_names =  ['to_cove','State Name', 'District Name']
to_c1 = pd.DataFrame(columns = col_names)
to_c2 = pd.DataFrame(columns = col_names)
to_c3 = pd.DataFrame(columns = col_names)
for i in range(points['to_cove'].count()):
    temp = centroids
    temp = abs(temp - points.iloc[i]['to_cove'])
    val = min(temp)
    if val == abs(centroids[0] - points.iloc[i]['to_cove']):
        to_c1 = to_c1.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[1] - points.iloc[i]['to_cove']):
        to_c2 = to_c2.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    elif val == abs(centroids[2] - points.iloc[i]['to_cove']):
        to_c3 = to_c3.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
v1 = (sum(np.asarray(to_c1['to_cove']))) / len(np.asarray(to_c1['to_cove']))
v2 = (sum(np.asarray(to_c2['to_cove']))) / len(np.asarray(to_c2['to_cove']))
v3 = (sum(np.asarray(to_c3['to_cove']))) / len(np.asarray(to_c3['to_cove']))
new_cen = np.asarray([v1,v2,v3])

In [19]:
var =0
while not (new_cen == centroids).all():
    centroids = new_cen
    col_names =  ['to_cove', 'State Name', 'District Name']
    to_c1 = pd.DataFrame(columns = col_names)
    to_c2 = pd.DataFrame(columns = col_names)
    to_c3 = pd.DataFrame(columns = col_names)
    for i in range(points['to_cove'].count()):
        temp = centroids
        temp = abs(temp - points.iloc[i]['to_cove'])
        val = min(temp)
        if val == abs(centroids[0] - points.iloc[i]['to_cove']):
            to_c1 = to_c1.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[1] - points.iloc[i]['to_cove']):
            to_c2 = to_c2.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
        elif val == abs(centroids[2] - points.iloc[i]['to_cove']):
            to_c3 = to_c3.append({'to_cove': points.iloc[i]['to_cove'], 'State Name': points.iloc[i]['State Name'], 'District Name':points.iloc[i]['District Name']}, ignore_index = True)
    v1 = (sum(np.asarray(to_c1['to_cove']))) / len(np.asarray(to_c1['to_cove']))
    v2 = (sum(np.asarray(to_c2['to_cove']))) / len(np.asarray(to_c2['to_cove']))
    v3 = (sum(np.asarray(to_c3['to_cove']))) / len(np.asarray(to_c3['to_cove']))
    new_cen = np.asarray([v1,v2,v3])
    var += 1
    
new_cen

array([0.10957125, 0.71043734, 0.93846793])

# Answering the Questions

In [20]:
min_con = df3[(df3['GENERAL Current Population']<df3['Minority Current Population'])]
gen_con = df3[(df3['GENERAL Current Population']>df3['Minority Current Population'])]

# minority concentrated --> area less covered

In [21]:
#RELATION BETWEEN AREAS BEING MINORITY CONCENTRATED AND THE AREAS NOT COVERED PROPERLY i.e. lower two clusters
ans1 = pd.merge(min_con,to_c1,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans2 = pd.merge(min_con,to_c2,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count() + ans2['District Name'].count())/min_con['District Name'].count()
ans
#ans is the support of rule (minority concentrated --> area less covered)

0.4566929133858268

# general concentrated --> minority less covered

In [22]:
#RELATION BETWEEN GENERAL POPULATION CONCENTRATED AREAS AND MINORITIES NOT HAVING ACCESS TO CLEAN DRINKING WATER
ans1 = pd.merge(gen_con,mi_c1,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans2 = pd.merge(gen_con,mi_c2,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count() + ans2['District Name'].count())/gen_con['District Name'].count()
ans
#ans is the support of rule (general concentrated --> minority less covered)

0.28627450980392155

# general concentrated --> area being covered

In [23]:
#RELATION BETWEEN GENERAL POPULATION CONCENTRATED AREAS AND AREA BEING IN THE TOP COVERED CLUSTER
ans1 = pd.merge(gen_con,to_c3,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = ans1['District Name'].count()/gen_con['District Name'].count()
ans
#ans is the support of rule (general concentrated --> area being covered)

0.7490196078431373

# minority concentrated --> general less covered

In [24]:
#RELATION BETWEEN MINORITY POPULATION CONCENTRATED AREAS AND GENERAL NOT HAVING CLEAN ACCES TO DRINKING WATER
ans1 = pd.merge(min_con,ge_c1,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans2 = pd.merge(min_con,ge_c2,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count() + ans2['District Name'].count())/min_con['District Name'].count()
ans
#ans is the support of rule (minority concentrated --> general less covered)

0.3937007874015748

# minority less covered --> total less covered

In [25]:
ans1 = pd.merge(mi_c1,to_c1,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count())/mi_c1['District Name'].count()
ans
#ans is the support of rule (minority less covered --> total less covered)

0.20833333333333334

# general less covered --> total less covered

In [26]:
ans1 = pd.merge(ge_c1,to_c1,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count())/ge_c1['District Name'].count()
ans
#ans is the support of rule (general less covered --> total less covered)

0.26666666666666666

# minority covered --> general covered

In [29]:
ans1 = pd.merge(mi_c3,ge_c3,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count())/mi_c3['District Name'].count()
ans
#ans is the support of rule (minority covered --> general covered)

0.9410377358490566

# general covered --> minority covered

In [30]:
ans1 = pd.merge(ge_c3,mi_c3,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count())/ge_c3['District Name'].count()
ans
#ans is the support of rule (general covered --> minority covered)

0.9236111111111112

# minority less covered --> general covered

In [32]:
#RELATION BETWEEN GENERAL POPULATION CONCENTRATED AREAS AND MINORITIES NOT HAVING ACCESS TO CLEAN DRINKING WATER
ans1 = pd.merge(mi_c1,ge_c3,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans2 = pd.merge(mi_c2,ge_c3,how ='inner',left_on=['State Name','District Name'],right_on=['State Name','District Name'])
ans = (ans1['District Name'].count() + ans2['District Name'].count())/(mi_c1['District Name'].count()+mi_c2['District Name'].count())
ans
#ans is the support of rule (general concentrated --> minority less covered)

0.15492957746478872