In [40]:
import pandas as pd
import operator
import matplotlib.pyplot as plt
import matplotlib
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [41]:
# Load the survey CSV from the working directory, decoding it as Latin-1
# (ISO-8859-1) and parsing it with pandas' pure-Python engine.
contraceptive_data = pd.read_csv(
    "contraceptive_method_dataset.csv",
    encoding="ISO-8859-1",
    engine="python",
)


# DATASET
The dataset is split into two variables, X and y. X holds every attribute except `children`, while y is derived from the `children` attribute, which is converted into a binary variable.

In [42]:
# Feature matrix: every column of the survey frame except `children`, the
# column the response is built from. `drop` already returns a new DataFrame,
# so X is independent of `contraceptive_data`.
X = contraceptive_data.drop(columns=['children'])

In [43]:
# Display the feature matrix; the rendered table shows the first and last rows with the middle elided.
X

Unnamed: 0,wife_age,wife_education,husband_education,wife_religion,wife_working,husband_occupation,standard_of_living_index,media_exposure,contraceptive_method_used
0,24,2,3,1,1,2,3,0,1
1,45,1,3,1,1,3,4,0,1
2,43,2,3,1,1,3,4,0,1
3,42,3,2,1,1,3,3,0,1
4,36,3,3,1,1,3,2,0,1
...,...,...,...,...,...,...,...,...,...
1468,33,4,4,1,0,2,4,0,3
1469,33,4,4,1,1,1,4,0,3
1470,39,3,3,1,0,1,4,0,3
1471,33,3,3,1,0,2,2,0,3


In [44]:
# List the storage dtype of each feature column in X.
X.dtypes

wife_age                     int64
wife_education               int64
husband_education            int64
wife_religion                int64
wife_working                 int64
husband_occupation           int64
standard_of_living_index     int64
media_exposure               int64
contraceptive_method_used    int64
dtype: object

_Notes_
- There are 9 attributes used in variable X.


# Creating Binary Response Variable. 
The `children` attribute is split into a binary variable: a woman with 0 to 2 children is coded 0, and a woman with more than 2 children is coded 1. The reasoning is that a couple replaces itself with two offspring, so a woman with 2 or fewer children has a negative or stabilizing effect on the population, while a woman with more than 2 children has a positive effect on the population.

In [45]:
# Binarise `children`: label 0 for two or fewer children, label 1 for more
# than two. pd.cut uses right-closed intervals, so the bins are (-1, 2] and
# (2, inf).
# The upper edge is open-ended (np.inf) instead of a hard-coded 16: with a
# fixed cap, any count above it would fall outside the bins, become NaN, and
# make the astype(int) below raise.
contraceptive_data['predictor_population'] = pd.cut(
    contraceptive_data['children'],
    [-1, 2, np.inf],
    labels=[0, 1],
)
# Plain integer copy of the categorical label; this is the column the model
# formula refers to.
contraceptive_data['predictor_population_i'] = contraceptive_data['predictor_population'].astype(int)
y = contraceptive_data['predictor_population_i']
# Class counts of the binary response.
print(y.value_counts())

1    824
0    649
Name: predictor_population_i, dtype: int64


In [46]:
# Computing the Ratio
# The class shares are derived from y itself rather than from counts typed in
# by hand, so they stay correct if the data or the threshold changes.
class_share = y.value_counts(normalize=True)
print("Ratio for more than 2 children: ", class_share.loc[1])
print("Ratio for equal to less than 2 children: ", class_share.loc[0])

Ratio for more than 2 children:  0.5594025797691785
Ratio for equal to less than 2 children:  0.44059742023082143


_Note_
- In this survey, 824 women have more than 2 children, which is 55.94% of the sample, and 649 women have 2 children or fewer, which is 44.06%. The y variable is reasonably balanced because the split is nowhere near an extreme such as 90% to 10%.

# Using Statsmodel To Perform Logistic Regression
The `logit` function from the statsmodels formula API is used in this process: the model is first specified with a formula and then fitted to the data.

In [47]:
import statsmodels.formula.api as smf

In [48]:
# Show the frame's column names; the model formula below refers to columns by these names.
contraceptive_data.columns

Index(['wife_age', 'wife_education', 'husband_education', 'children',
       'wife_religion', 'wife_working', 'husband_occupation',
       'standard_of_living_index', 'media_exposure',
       'contraceptive_method_used', 'predictor_population',
       'predictor_population_i'],
      dtype='object')

In [49]:
# Formula for the logistic regression: the binary response on the left of `~`,
# the nine survey attributes as additive terms on the right. Adjacent string
# literals are concatenated into a single formula string.
logit_formula = (
    'predictor_population_i ~ wife_age + wife_education + husband_education + wife_religion +'
    'wife_working + husband_occupation + standard_of_living_index + media_exposure +'
    'contraceptive_method_used'
)
# Specify (but do not yet fit) the model against the survey frame.
model = smf.logit(logit_formula, data=contraceptive_data)


In [50]:
# Fit the specified logit model, then print its text summary table.
results = model.fit()
fit_summary = results.summary()
print(fit_summary)

Optimization terminated successfully.
         Current function value: 0.519119
         Iterations 6
                             Logit Regression Results                             
Dep. Variable:     predictor_population_i   No. Observations:                 1473
Model:                              Logit   Df Residuals:                     1463
Method:                               MLE   Df Model:                            9
Date:                    Sat, 28 Nov 2020   Pseudo R-squ.:                  0.2433
Time:                            19:15:11   Log-Likelihood:                -764.66
converged:                           True   LL-Null:                       -1010.6
Covariance Type:                nonrobust   LLR p-value:                3.201e-100
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                    -6.4243      0.66

_Note_
- Based on this summary, the intercept and the coefficients for wife's age, wife's religion, wife working, and contraceptive method used are statistically significant because their p-values are less than 0.05. The intercept is -6.4243. The coefficients are on the log-odds scale, so each one must be exponentiated to get the multiplicative change in the odds. For a one-unit increase in wife's age, the log-odds increase by 0.1661, so the odds are multiplied by $e^{0.1661} \approx 1.18$. For a one-unit increase in wife's religion, the log-odds increase by 0.3840, so the odds are multiplied by $e^{0.3840} \approx 1.47$. For a one-unit increase in wife working, the log-odds increase by 0.4585, so the odds are multiplied by $e^{0.4585} \approx 1.58$. For a one-unit increase in contraceptive method used, the log-odds increase by 0.7788, so the odds are multiplied by $e^{0.7788} \approx 2.18$.

- The p-values for wife's education, husband's education, husband's occupation, and standard of living index are greater than 0.05, so these coefficients are not statistically significant: their estimated effects cannot be distinguished from random chance.


# Odds Ratios
Odds ratios are needed to interpret a logistic model. First, the fitted coefficients have to be exponentiated; the resulting values are interpreted as odds ratios. An odds ratio is closest in meaning to "how many times higher the odds" of the outcome become for a one-unit increase in the predictor.

In [51]:
# Exponentiate the fitted coefficients (log-odds scale) to obtain odds ratios.
log_odds = results.params
odds_ratios = np.exp(log_odds)
print(odds_ratios)

Intercept                    0.001622
wife_age                     1.180671
wife_education               0.858258
husband_education            0.910276
wife_religion                1.468147
wife_working                 1.581756
husband_occupation           1.065431
standard_of_living_index     0.938274
media_exposure               1.454062
contraceptive_method_used    2.178952
dtype: float64


_Note_
- For every one-unit increase in wife working, the odds of having more than 2 children are multiplied by 1.58 (a 58% increase). For every one-unit increase in wife's education, the odds of having more than 2 children are multiplied by 0.86 (about a 14% decrease), although this coefficient was not statistically significant in the summary above.
