# Spiroplasma vs Fly Sampling Locations

These are the analyses and results regarding the spiroplasma assays and any putative biased sex distribution etc.

__<font color="red">ATTENTION:</font> Click here to skip directly to the [Results](#Final-Results)__:
- [Methods Overview](#Overview-of-what-was-done:)
- [Sex Bias Results](#Sex-Bias-Results:)
    - [Conclusions](#Sex-Bias-Conclusions:)
- [Geographical Bias Results](#Geographical-Bias-Results:)
    - [Conclusions](#Geographical-Bias-Conclusions:)
    
 ----

In [1]:
%matplotlib inline

from StringIO import StringIO  # got moved to io in python3.

import requests

import pandas as pd
# Use the fully-qualified option names: the short forms ("max_rows", "max_columns")
# are pattern-matched by pandas and become ambiguous (OptionError) once more than
# one registered option contains that substring.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

import patsy
import numpy as np
import scipy as sp
import statsmodels as smd
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm

from scipy.stats import fisher_exact
from scipy.stats import f_oneway

# Load spreadsheets

In [4]:
# Spreadsheet data: one row per Spiroplasma-tested fly (see the `table_all.head()` output below).
# NOTE(review): presumably this is the OpenRefine-cleaned table described in the Methods
# overview — confirm. The path is absolute and machine-specific, so this cell only runs
# where that file exists.
table_all = pd.read_csv("/home/gus/MEGAsync/zim/main/Yale/Projects/Spiroplasma/related_files/2015_11_30__Spiroplasma_tested_samples_REFINED.csv")

## Run script to load main database info into this notebook

In [3]:
# %run /home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/scripts/gff_pandas_database.py

In [4]:
# def recode_dfp(df):
#     df['Collection Year'] = pd.DatetimeIndex(df.Date).year
#     df = df.rename(columns={"Village": "Location Code", "Fly_Number": "Fly Number"})
#     return df[["Location Code","Collection Year","Fly Number","Sex"]]

In [5]:
# d = recode_dfp(dfp.copy())
# d.head()

## Recover sex data into `table_gsh` by crossref with `dfp`

In [6]:
# # recover sex data into table_gsh by joining with dfp
# table_gsh = pd.merge(left=table_gsh.copy(), right=recode_dfp(dfp.copy()), 
#                      how='left', 
#                      on=["Location Code","Collection Year","Fly Number"], 
#                      left_on=None, right_on=None, 
#                      left_index=False, right_index=False, 
#                      sort=False, suffixes=('_x', '_y'), copy=True)

In [7]:
# table_gsh.head()

In [8]:
# table_all = pd.concat([table_gsh.dropna(),
#                        table_xls[["Location Code","Collection Year","Fly Number","Sex","Spiroplasma"]].dropna()])
# table_all.head()

-----------------

# Group data by number of flies belonging to any combination of +/- vs M/F

In [5]:
table_all.head()

Unnamed: 0,Box,Location Code,General location,Numbers on Vial,Fly Number,Month,Sex,DNA made,Spiroplasma
0,RP3,AIN,NW,ARI4158,158,Jun-14,F,Yrp,False
1,RP3,AIN,NW,ARI4159,159,Jun-14,F,Yrp,True
2,RP3,AIN,NW,ARI4161,161,Jun-14,F,Yrp,True
3,RP3,AIN,NW,ARI4162,162,Jun-14,M,Yrp,False
4,RP3,AIN,NW,ARI4163,163,Jun-14,F,Yrp,True


In [6]:
# Count flies per location for every (Spiroplasma result, Sex) combination;
# combinations with no flies are filled with 0.
table_all_pivot = table_all.pivot_table(
    values="Fly Number",
    index=["Location Code"],
    columns=["Spiroplasma", "Sex"],
    aggfunc=[len],
    fill_value=0,
    margins=False,
    dropna=True,
)

# aggfunc was given as a list, which adds a top column level named 'len'; drop it
# so the columns are just (Spiroplasma, Sex).
table_all_pivot.columns = table_all_pivot.columns.droplevel()
table_all_pivot

Spiroplasma,False,False,True,True
Sex,F,M,F,M
Location Code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AIN,10,7,12,5
AMI,12,10,0,1
BOL,9,6,2,1
CHU,9,8,1,0
DUK,9,10,18,9
GAN,16,4,9,3
KIL,12,3,2,0
LEA,7,5,2,0
NGO,12,5,0,0
OCA,6,1,0,0


# Hypotheses tests of locations and multiple testing corrections

In [7]:
def do_tests(df):
    """Run Fisher's exact test per location and on the pooled counts.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per location with exactly four count columns. Each row is
        reshaped row-major into a 2x2 contingency table, so the column order
        defines the table layout (assumes the order produced by the
        Spiroplasma x Sex pivot in this notebook -- TODO confirm for other inputs).

    Returns
    -------
    pandas.DataFrame
        One row per location plus a row labelled 'all' (column-wise sum of every
        location), with columns "Odds ratio" and "pvals". A location that is
        itself labelled 'all' would be overwritten by the pooled result.
    """
    tests = {}

    for loc in df.index.values:
        # Reshape the underlying ndarray: Series.reshape was deprecated and then
        # removed from pandas, so calling it on the row itself fails.
        contingency_table = df.loc[loc, :].values.reshape((2, 2))

        odds_ratio, p_val = fisher_exact(contingency_table)
        tests[loc] = (odds_ratio, p_val)

    # Pooled table: sum the counts over all locations, then test once more.
    contingency_table_all = df.sum().values.reshape((2, 2))
    odds_ratio_all, p_val_all = fisher_exact(contingency_table_all)
    tests['all'] = (odds_ratio_all, p_val_all)

    # Build with tests as columns, then transpose so each test is a row.
    testsdf = pd.DataFrame(data=tests,
                           index=["Odds ratio", "pvals"],
                           columns=list(tests.keys()),
                           dtype=None, copy=False)
    return testsdf.T

def add_fdr(df):
    """Append Benjamini-Hochberg FDR results to ``df`` in place.

    Reads the ``pvals`` column and adds two columns: "Reject the null?"
    and "adjusted pvals". The frame is modified in place; nothing is returned.
    """
    # multipletests returns several values; only the first two are used here
    # (the reject flags and the corrected p-values).
    fdr_outcome = smd.stats.multitest.multipletests(pvals=df.pvals, alpha=0.05, method='fdr_bh')

    df["Reject the null?"] = fdr_outcome[0]
    df["adjusted pvals"] = fdr_outcome[1]

In [8]:
# Fisher's exact test for every location (plus the pooled 'all' table), then
# add the FDR-corrected columns in place.
results = do_tests(table_all_pivot)
add_fdr(results)

# Display ordered by location label. DataFrame.sort() is deprecated (and removed
# in later pandas); sort_index() is the equivalent for sorting by the index.
results.sort_index()

  app.launch_new_instance()


Unnamed: 0,Odds ratio,pvals,Reject the null?,adjusted pvals
AIN,0.595238,0.720682,False,1
AMI,inf,0.478261,False,1
BOL,0.75,1.0,False,1
CHU,0.0,1.0,False,1
DUK,0.45,0.233335,False,1
GAN,1.333333,1.0,False,1
KIL,0.0,1.0,False,1
LEA,0.0,0.505495,False,1
NGO,,1.0,False,1
OCA,,1.0,False,1


# Is there a significant difference of Spiroplasma prevalence between populations?

- will NOT use one way ANOVA bc the data is categorical
- will use logit regression probably
- may run both the `scipy` and `statsmodels` versions out of curiosity and to double-check results

## To make the results easier to interpret, I am going to pull in the GPS coords of the locations

- this will let me run the comparison vs location name (categorical) and location coords (numerical) 

In [10]:
# Village GPS table: one row per location code with its Latitude/Longitude.
location_gps = pd.read_csv(
    '/home/gus/src/repos/git/field_data/locations/gps/villages/uganda_villages_gps.csv',
    sep=',',
)

In [13]:
location_gps.head()

Unnamed: 0,Location,Latitude,Longitude
0,ABO,2.466775,32.56499
1,ACA,2.27008,32.52053
2,AG,2.413985,32.59915
3,AIN,3.304225,31.11941
4,AKA,2.37258,32.67495


In [14]:
# Left-join the per-fly table to the village GPS table so every fly keeps its row
# and gains Latitude/Longitude; the redundant key column "Location" is then dropped.
table_all_gps = (
    table_all
    .merge(location_gps, how='left', left_on="Location Code", right_on="Location")
    .drop(labels=["Location"], axis=1)
)
table_all_gps.head()

Unnamed: 0,Box,Location Code,General location,Numbers on Vial,Fly Number,Month,Sex,DNA made,Spiroplasma,Latitude,Longitude
0,RP3,AIN,NW,ARI4158,158,Jun-14,F,Yrp,False,3.304225,31.11941
1,RP3,AIN,NW,ARI4159,159,Jun-14,F,Yrp,True,3.304225,31.11941
2,RP3,AIN,NW,ARI4161,161,Jun-14,F,Yrp,True,3.304225,31.11941
3,RP3,AIN,NW,ARI4162,162,Jun-14,M,Yrp,False,3.304225,31.11941
4,RP3,AIN,NW,ARI4163,163,Jun-14,F,Yrp,True,3.304225,31.11941


In [15]:
# The logit formula below needs a numeric response, so recode the boolean
# Spiroplasma result: True -> 1, False -> 0.
table_all_gps["Spiroplasma"] = table_all_gps["Spiroplasma"].map({True:1,False:0})
table_all_gps.head()

Unnamed: 0,Box,Location Code,General location,Numbers on Vial,Fly Number,Month,Sex,DNA made,Spiroplasma,Latitude,Longitude
0,RP3,AIN,NW,ARI4158,158,Jun-14,F,Yrp,0,3.304225,31.11941
1,RP3,AIN,NW,ARI4159,159,Jun-14,F,Yrp,1,3.304225,31.11941
2,RP3,AIN,NW,ARI4161,161,Jun-14,F,Yrp,1,3.304225,31.11941
3,RP3,AIN,NW,ARI4162,162,Jun-14,M,Yrp,0,3.304225,31.11941
4,RP3,AIN,NW,ARI4163,163,Jun-14,F,Yrp,1,3.304225,31.11941


## Run the logistic regression model

In [16]:
# Logistic regression of infection status (0/1) on the sampling site's coordinates.
logit_gps = logit('Spiroplasma ~ Longitude + Latitude', data=table_all_gps).fit()
logit_gps.summary()

Optimization terminated successfully.
         Current function value: 0.470589
         Iterations 7


0,1,2,3
Dep. Variable:,Spiroplasma,No. Observations:,437.0
Model:,Logit,Df Residuals:,434.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 30 Nov 2015",Pseudo R-squ.:,0.1466
Time:,10:19:55,Log-Likelihood:,-205.65
converged:,True,LL-Null:,-240.96
,,LLR p-value:,4.611e-16

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,39.5031,7.603,5.196,0.000,24.601 54.405
Longitude,-1.3439,0.227,-5.932,0.000,-1.788 -0.900
Latitude,0.6648,0.365,1.823,0.068,-0.050 1.380


In [17]:
logit_gps.get_margeff().summary()

0,1
Dep. Variable:,Spiroplasma
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[95.0% Conf. Int.]
Longitude,-0.2093,0.03,-6.904,0.0,-0.269 -0.150
Latitude,0.1035,0.057,1.825,0.068,-0.008 0.215


-------------------

# Final Results

### Overview of what was done:

#### Sex Bias
1. Tables were cleaned using [OpenRefine](http://openrefine.org/) to standardize things like
    - "positive"
    - "Positive"
    - "yes"
    - "Yes"

2. Tables were read into this notebook and the table without Sex information was cross-referenced with the original database entries to recover any sex information available.
3. Tables were further cleaned to remove any columns that were not needed to unambiguously identify each fly or represent the Sex/Spiroplasma data.
    - Flies that had missing data or needed to be re-run were removed.
4. Data were grouped by the number of flies belonging to any combination of __Spiroplasma results__ (pos/neg) vs __Sex__ (M/F).
    - This table represents contingency table information for each location.
5. The contingency tables for each location as well as the summed contingency table for all data combined, were used to calculate [Fisher's exact test of independence](http://www.biostathandbook.com/fishers.html) to yield odds ratios and initial p-values for all sub-sets of data examined.
6. Multiple testing correction (Benjamini-Hochberg) was applied, and the adjusted p-values, along with recommendations on whether to reject the null hypothesis, were appended to the table.

#### Geographical Location Bias

1. Started with the same data as used above but brought in the representative latitude and longitude coordinates of each location to use as the independent variables.
2. Logistic regression was run with the model: `infection ~ lat + long`:
    - _independent variables:_ `lat`, `long`
    - _dependent variables:_ `infection`

---------------------

# Sex Bias Results:

## Contingency Data:

In [18]:
table_all_pivot

Spiroplasma,False,False,True,True
Sex,F,M,F,M
Location Code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AIN,10,7,12,5
AMI,12,10,0,1
BOL,9,6,2,1
CHU,9,8,1,0
DUK,9,10,18,9
GAN,16,4,9,3
KIL,12,3,2,0
LEA,7,5,2,0
NGO,12,5,0,0
OCA,6,1,0,0


## Significance tests:

In [19]:
# DataFrame.sort() is deprecated (and removed in later pandas); sort_index()
# gives the same ordering by location label.
results.sort_index()

  if __name__ == '__main__':


Unnamed: 0,Odds ratio,pvals,Reject the null?,adjusted pvals
AIN,0.595238,0.720682,False,1
AMI,inf,0.478261,False,1
BOL,0.75,1.0,False,1
CHU,0.0,1.0,False,1
DUK,0.45,0.233335,False,1
GAN,1.333333,1.0,False,1
KIL,0.0,1.0,False,1
LEA,0.0,0.505495,False,1
NGO,,1.0,False,1
OCA,,1.0,False,1


# Sex Bias Conclusions:

There is <b><font color="red">no sex bias detected</font></b> for probability of being infected with Spiroplasma based on these data, either at the location level or overall.

 ----

# Geographical Bias Results:

In [20]:
logit_gps.summary()

0,1,2,3
Dep. Variable:,Spiroplasma,No. Observations:,437.0
Model:,Logit,Df Residuals:,434.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 30 Nov 2015",Pseudo R-squ.:,0.1466
Time:,10:20:00,Log-Likelihood:,-205.65
converged:,True,LL-Null:,-240.96
,,LLR p-value:,4.611e-16

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,39.5031,7.603,5.196,0.000,24.601 54.405
Longitude,-1.3439,0.227,-5.932,0.000,-1.788 -0.900
Latitude,0.6648,0.365,1.823,0.068,-0.050 1.380


In [21]:
logit_gps.get_margeff().summary()

0,1
Dep. Variable:,Spiroplasma
Method:,dydx
At:,overall

Unnamed: 0,dy/dx,std err,z,P>|z|,[95.0% Conf. Int.]
Longitude,-0.2093,0.03,-6.904,0.0,-0.269 -0.150
Latitude,0.1035,0.057,1.825,0.068,-0.008 0.215


# Geographical Bias Conclusions:

There <b><font color="green">is geographical bias detected</font></b> for probability of being infected with Spiroplasma based on these data.

- The overall significance of the analysis is $p_{_{LLR}} = 4.611\times10^{-16}$ which is easily significant and 10 orders of magnitude more significant than the last run.
- Particularly, the marginal effects value for __Longitude__ (-0.2093, $p < 10^{-4}$) suggests that as __Longitude__ increases, the probability of infection __DE__-creases. _(the farther east you go, the FEWER infections you should expect)_
- The effect of __Latitude__ is predicted to be a less pronounced IN-crease of infection probability as __Latitude__ increases. _(the farther north you go, the MORE infections you should expect)_  <font color="coral"> _While this prediction is __not__ significant in the current data, its p-value (0.068) is now approaching the standard alpha cut-off of 0.05 and is an order of magnitude more significant than before.  If the basic pattern continues, I expect the north/south relationship to become significant with more data.  __However__, the east/west relationship still seems to be stronger in both effect size and significance._</font> 

## Comparison with results from last time:

- The marginal effect of __Longitude__ rebounded back to around -0.2 or nearly the same as the first time:
    - `[original]` -0.2139 ($p < 10^{-4}$)
    - `[last time]` -0.1670 ($p < 10^{-4}$)
    - `[this time]` -0.2093 ($p < 10^{-4}$)

- The marginal effect of __Latitude__ :
    - `[original]` 0.0701 ($p = 0.505 $)
    - `[last time]` -0.0171 ($p = 0.728 $)
    - `[this time]` 0.1035 ($p = 0.068 $)

- The Latitude effect returned to the same direction as the original analysis and the p-value also became an order of magnitude more significant.  It is now approaching an alpha cut-off of the standard 0.05 but does not quite reach it. 

----

# Infection rate per location

In [22]:
table_all

Unnamed: 0,Box,Location Code,General location,Numbers on Vial,Fly Number,Month,Sex,DNA made,Spiroplasma
0,RP3,AIN,NW,ARI4158,158,Jun-14,F,Yrp,False
1,RP3,AIN,NW,ARI4159,159,Jun-14,F,Yrp,True
2,RP3,AIN,NW,ARI4161,161,Jun-14,F,Yrp,True
3,RP3,AIN,NW,ARI4162,162,Jun-14,M,Yrp,False
4,RP3,AIN,NW,ARI4163,163,Jun-14,F,Yrp,True
5,RP3,AIN,NW,ARI4164,164,Jun-14,F,Yrp,True
6,RP3,AIN,NW,ARI4165,165,Jun-14,F,Yrp,False
7,RP3,AIN,NW,ARI4166,166,Jun-14,F,Yrp,False
8,RP3,AIN,NW,ARI4167,167,Jun-14,M,Yrp,False
9,RP3,AIN,NW,ARI4169,169,Jun-14,M,Yrp,False


In [23]:
infection_rate_by_location = pd.crosstab(index=table_all['Location Code'], columns=table_all['Spiroplasma'])

In [24]:
infection_rate_by_location.head()

Spiroplasma,False,True
Location Code,Unnamed: 1_level_1,Unnamed: 2_level_1
AIN,17,17
AJ,12,3
AMI,22,1
BOL,15,3
CHU,17,1


In [25]:
# Turn the "Location Code" index back into a column, then left-join the GPS table
# so every location keeps its row (unmatched locations get NaN coordinates).
# Note: this reassigns the same name, so re-running the cell merges a second time.
infection_rate_by_location = (
    infection_rate_by_location
    .reset_index()
    .merge(right=location_gps, how='left', left_on="Location Code", right_on="Location")
)


In [26]:
infection_rate_by_location

Spiroplasma,Location Code,False,True,Location,Latitude,Longitude
0,AIN,17,17,AIN,3.304225,31.11941
1,AJ,12,3,,,
2,AMI,22,1,AMI,1.92483,33.156138
3,BOL,15,3,BOL,3.2934,32.78246
4,CHU,17,1,CHU,2.606845,32.93758
5,DUK,19,27,DUK,3.2668,31.134205
6,GAN,20,12,GAN,3.252455,31.121625
7,GOR,16,7,GOR,3.26606,32.208055
8,KIL,16,2,KIL,2.746835,32.951615
9,LEA,12,2,LEA,3.59254,31.60702


In [27]:
infection_rate_by_location_path = "/home/gus/MEGAsync/zim/main/Yale/Projects/Spiroplasma/related_files/2015_11_04__spiroplasma_rate_by_location.csv"

# Export the per-location negative/positive counts with coordinates.
# Take the site name from "Location Code" (from the fly table, always populated)
# rather than "Location" (from the GPS table, NaN for sites with no GPS match such
# as AJ), otherwise those rows are written without a name. It is renamed to
# "Location" so the CSV header is unchanged.
(infection_rate_by_location[["Location Code", False, True, "Latitude", "Longitude"]]
     .rename(columns={"Location Code": "Location"})
     .to_csv(path_or_buf=infection_rate_by_location_path,
             sep=',',
             index=False))