## import modules

In [1]:
import pandas as pd
from scipy import stats
import numpy as np

## census data processing

In [2]:
# Population data: load only the columns needed for this analysis.
cols=['State', 'Level', 'Name', 'TRU', 'No_HH', 'TOT_P', 'TOT_M', 'TOT_F']
census_raw=pd.read_excel('datasets/census.xlsx',engine='openpyxl',usecols=cols)

# Keep the first three rows of the sheet separately.
# NOTE(review): presumably these are the all-India Total/Rural/Urban rows -- confirm against the sheet.
censusIndia=census_raw.iloc[0:3,:]

# Keep only STATE-level rows, then add the three leading rows back.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
census_states=census_raw.loc[census_raw.Level=='STATE']
census=(
    pd.concat([census_states,censusIndia],ignore_index=True)
    .sort_values(by=['State'],axis=0)
    .reset_index(drop=True)
)

## read census language dataset[C-18]

In [3]:
# Load the C-18 sheet: skip the first 6 rows and read without a header row,
# so columns are addressed by integer position further down.
c18 = pd.read_excel(
    'datasets/C-18.xlsx',
    engine='openpyxl',
    skiprows=6,
    header=None,
)

## calculate p-values for three parts

**`what is being done in code? Overall description is:`**
- first we store state-names corresponding to state code so that we can see which states have significant difference in ratios
- next, we loop over each state, using its state code as the identifier, to collect the relevant information and store it in lists
    - to get urban and rural total pop I have used census data of 2011
    - to get particular {part} language population I have used `C-18` file
    - to get exactly two and only one language population I have used same concept as used in Q1[described in getRatio func comments]
    - urban percent = 100*(urban population of the particular {part: 3+, exactly-2, only-1} in the state)/(total urban population of that state)
    - similarly for rural part
- note: every item of each list is a dict containing the relevant info
- finally, I simply convert each list into a pandas DataFrame and save it to a CSV file

In [4]:
# Distinct values of C-18 column 2 (the state names), kept in order of first appearance.
# dict.fromkeys preserves insertion order, so this de-duplicates without reordering.
STATE_NAMES = list(dict.fromkeys(c18.iloc[:, 2].values))

In [5]:
def _census_population(state_code, area):
    """Return TOT_P from `census` for the given state code and TRU value ('Urban' or 'Rural')."""
    rows = census[(census['State'] == state_code) & (census['TRU'] == area)]
    return rows['TOT_P'].values[0]


def _c18_language_counts(state_code, area):
    """Return (exactly-two, three-or-more) language speaker counts for one state and area.

    Uses the first C-18 row whose column 0 equals `state_code`, whose column 4
    is 'Total' and whose column 3 equals `area` ('Urban' or 'Rural').
    NOTE(review): columns 5 and 8 are assumed to hold the "2 or more" and
    "3 or more" language speaker counts respectively -- confirm against the sheet.
    """
    rows = c18[(c18.iloc[:, 0] == state_code) & (c18.iloc[:, 4] == 'Total') & (c18.iloc[:, 3] == area)]
    three_plus = rows.iloc[0, 8]
    exactly_two = rows.iloc[0, 5] - three_plus
    return exactly_two, three_plus


def _percentage_item(state_code, urban_count, rural_count, urban_pop, rural_pop, p_value):
    """Build one output record: the group's share of urban/rural population (%, 2 dp) and the p-value."""
    return {
        'state-code': state_code,
        'urban-percentage': round(100 * urban_count / urban_pop, 2),
        'rural-percentage': round(100 * rural_count / rural_pop, 2),
        'p-value': p_value,
    }


tri_list = []
bi_list = []
uni_list = []
for i, state in enumerate(STATE_NAMES):
    # The position i in STATE_NAMES is used as the state code.
    # NOTE(review): this relies on codes being 0, 1, 2, ... in order of first
    # appearance in C-18 -- confirm. `state` (the name) is not used below.
    urban_pop = _census_population(i, 'Urban')
    rural_pop = _census_population(i, 'Rural')

    # three-or-more and exactly-two language speakers, from C-18
    bi_urban, tri_urban = _c18_language_counts(i, 'Urban')
    bi_rural, tri_rural = _c18_language_counts(i, 'Rural')

    # only-one language speakers: whatever remains of the census population
    uni_urban = urban_pop - bi_urban - tri_urban
    uni_rural = rural_pop - bi_rural - tri_rural

    # One-sample t-test of the three urban:rural ratios against the overall
    # urban:rural population ratio.
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html
    # popmean is passed as a scalar so that .pvalue is a scalar on every SciPy
    # version; with a one-element list, recent versions squeeze it to a scalar
    # and indexing the result with [0] raises IndexError.
    ratios = [tri_urban / tri_rural, bi_urban / bi_rural, uni_urban / uni_rural]
    p_value = float(stats.ttest_1samp(ratios, urban_pop / rural_pop).pvalue)

    # The same per-state p-value is stored with each of the three records.
    tri_list.append(_percentage_item(i, tri_urban, tri_rural, urban_pop, rural_pop, p_value))
    bi_list.append(_percentage_item(i, bi_urban, bi_rural, urban_pop, rural_pop, p_value))
    uni_list.append(_percentage_item(i, uni_urban, uni_rural, urban_pop, rural_pop, p_value))

In [6]:
# One DataFrame per group, built from the per-state record lists.
tri_df, bi_df, uni_df = (
    pd.DataFrame(records) for records in (tri_list, bi_list, uni_list)
)

In [7]:
# Write each group's table to its own CSV (no index column), in the order tri, bi, uni.
csv_targets = (
    (tri_df, 'outputs/geography-india-c.csv'),
    (bi_df, 'outputs/geography-india-b.csv'),
    (uni_df, 'outputs/geography-india-a.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

## how am I obtaining p-value?

- Here `the null hypothesis` states that the means of the two populations are the same (meaning the ratios are not significantly different between urban and rural).
- `The alternative hypothesis` states that the means of the two populations are not the same (meaning the ratios are significantly different between urban and rural).
- So a test is needed to decide between them!
- I am doing a simple t-test; in particular, I am using Welch's t-test, which is used when the samples have unequal variances, rather than Student's t-test, which is used when the sample variances are equal.
- The basic point is that we have continuous features (here `ratios`) and their variances are unequal.
    - Here's the reason why:
        - Let X = vector containing the trilingual (3+) ratio, the bilingual (exactly 2) ratio and the monolingual (only one) ratio
        - Y = Vector containing ratio of male:female population or urban:rural population
        Y will contain 1 value repeated thrice.
        - Since all the values of Y are same, Var[Y]=0
        - But the 3 values in X will be different and hence Var[X] won't be zero
        - Thus, Var[X] != Var[Y]
- To perform Welch's test we can use the `scipy.stats` module; in particular, we can use either of the following options [I have used the first -- both give the same answer]
    - use `ttest_1samp` function with `popmean` being background ratio of urban to rural
    - or use `ttest_ind` func with `equal_var` being set to `False` with `X` and `Y` vectors

In [8]:
# Display the table built from tri_list (the 3+ languages case).
tri_df

Unnamed: 0,state-code,urban-percentage,rural-percentage,p-value
0,0,13.42,4.24,0.332407
1,1,24.39,13.82,0.568997
2,2,9.65,4.55,0.279581
3,3,34.61,24.39,0.523689
4,4,30.94,15.02,0.592084
5,5,3.84,1.0,0.263868
6,6,6.87,3.41,0.270791
7,7,8.26,1.31,0.39907
8,8,2.33,1.18,0.236601
9,9,2.96,0.82,0.217139


## observations
- For all three case:
    - for no state or UT is the ratio significantly different at the 0.05 level, since the p-value is greater than 0.05 for every state/UT. Even at the 0.1 level, there is no state/UT for which the ratio is significantly different
    - so for every state/UT we fail to reject the null hypothesis, i.e. there is no evidence that the ratios differ between urban and rural