## import modules

In [1]:
import pandas as pd
from scipy import stats
import numpy as np

## census data processing

In [2]:
# Population data: load only the columns needed for the gender analysis.
cols=['State', 'Level', 'Name', 'TRU', 'No_HH', 'TOT_P', 'TOT_M', 'TOT_F']
census_raw=pd.read_excel('datasets/census.xlsx',engine='openpyxl',usecols=cols)

# First row of the sheet, kept under its original name as a Series.
# NOTE(review): presumably this is the all-India total row -- confirm against the workbook.
censusIndia=census_raw.iloc[0,:]

# Keep only the state-level "Total" rows (Level == 'STATE' and TRU == 'Total').
state_totals=census_raw.loc[(census_raw.Level=='STATE') & (census_raw.TRU=='Total')]

# DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
# iloc[[0]] selects the first row as a one-row DataFrame, which keeps each
# column's dtype when it is concatenated back onto the state rows.
census=(
    pd.concat([state_totals,census_raw.iloc[[0]]],ignore_index=True)
    .sort_values(by=['State'],axis=0)
    .reset_index(drop=True)
)

## male-to-female

In [4]:
# Read the C-18 workbook without a header row, skipping the first six rows,
# so that columns are labelled by integer position.
c18 = pd.read_excel(
    'datasets/C-18.xlsx',
    engine='openpyxl',
    skiprows=6,
    header=None,
)

In [5]:
# Distinct values of the third c18 column, in order of first appearance.
# dict.fromkeys keeps insertion order and drops repeats.
STATE_NAMES = list(dict.fromkeys(c18.iloc[:, 2].values))

In [18]:
def _percentage_row(state_code, male_count, female_count, male_pop, female_pop, p_value):
    """Build one output record for a single language-count group.

    Parameters
    ----------
    state_code : int
        Code identifying the state the record belongs to.
    male_count, female_count : number
        Number of males / females in the group.
    male_pop, female_pop : number
        Total male / female population of the state (the denominators).
    p_value : float
        p-value of the state's ratio test, stored unchanged.

    Returns
    -------
    dict
        Keys 'state-code', 'male-percentage', 'female-percentage' and
        'p-value'; percentages are rounded to two decimals.
    """
    return {
        'state-code':state_code,
        'male-percentage':round(100*male_count/male_pop,2),
        'female-percentage':round(100*female_count/female_pop,2),
        'p-value':p_value
    }


tri_list=[]
bi_list=[]
uni_list=[]
for i,state in enumerate(STATE_NAMES):

    # The position of the state in STATE_NAMES is used as its state code.
    # NOTE(review): this assumes the codes in census['State'] and in c18
    # column 0 are the integers 0..N-1 in order of first appearance -- confirm.
    census_row=census[(census['State']==i) & (census['TRU']=='Total')]
    male_pop=census_row['TOT_M'].values[0]
    female_pop=census_row['TOT_F'].values[0]

    # Rows of c18 for this state where columns 3 and 4 are both 'Total';
    # filtered once and reused for every count read below.
    c18_total=c18[(c18.iloc[:,0]==i) & (c18.iloc[:,4]=='Total') & (c18.iloc[:,3]=='Total')]

    # tri
    # NOTE(review): columns 9/10 presumably hold the male/female counts of
    # people speaking three or more languages -- confirm against the sheet.
    tri_male=c18_total.iloc[0,9]
    tri_female=c18_total.iloc[0,10]

    # bi
    # Columns 6/7 minus the tri counts; presumably columns 6/7 count people
    # speaking two or more languages -- confirm.
    bi_male=c18_total.iloc[0,6] - tri_male
    bi_female=c18_total.iloc[0,7] - tri_female

    # uni: the remainder of the census population.
    uni_male=male_pop-bi_male-tri_male
    uni_female=female_pop-bi_female-tri_female

    # One-sample t-test of the three group male-to-female ratios against the
    # state's overall male-to-female ratio. The population mean is passed as a
    # scalar so that .pvalue is a scalar as well; passing a one-element list
    # and indexing .pvalue[0] depends on how the installed SciPy version
    # shapes the result.
    group_ratios=[tri_male/tri_female,bi_male/bi_female,uni_male/uni_female]
    p_value=float(stats.ttest_1samp(group_ratios,male_pop/female_pop).pvalue)

    # Alternative: Welch's unequal-variance test via
    # stats.ttest_ind([male_pop/female_pop]*3, group_ratios, equal_var=False)
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html

    # The same p-value is stored in all three records of a state.
    tri_list.append(_percentage_row(i,tri_male,tri_female,male_pop,female_pop,p_value))
    bi_list.append(_percentage_row(i,bi_male,bi_female,male_pop,female_pop,p_value))
    uni_list.append(_percentage_row(i,uni_male,uni_female,male_pop,female_pop,p_value))

In [None]:
# Turn each list of per-state records into its own DataFrame.
tri_df, bi_df, uni_df = (
    pd.DataFrame(records) for records in (tri_list, bi_list, uni_list)
)

In [None]:
# Write each table to its CSV file (tri -> c, bi -> b, uni -> a), without the index column.
for frame, out_path in (
    (tri_df, 'outputs/gender-india-c.csv'),
    (bi_df, 'outputs/gender-india-b.csv'),
    (uni_df, 'outputs/gender-india-a.csv'),
):
    frame.to_csv(out_path, index=False)

### independence t-test of 2-vectors

- 1 categorical feature -> 1 sample proportion test
- 2 categorical features -> chi-square test
- 1 continuous feature -> 1 sample t-test
- 2 continuous features + equal variance -> 2 sample Student's t-test
- `2 continuous features + unequal variance -> Welch's t-test`
- 1 categorical feature + 1 continuous feature -> ANOVA test

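Here, we are comparing a continuous feature (specifically, male-to-female ratios) with unequal variances, so Welch's t-test is the applicable two-sample test. Note that the p-values written to the output files come from the one-sample t-test (`stats.ttest_1samp`) computed in the loop above.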

#### result
- For no state or UT is the ratio significantly different.