In [118]:
%matplotlib inline

from StringIO import StringIO  # got moved to io in python3.

import requests

import seaborn
import pandas as pd
pd.set_option("max_rows", 100)
pd.set_option("max_columns", 100)

import numpy as np
import scipy as sp
import statsmodels as smd

from scipy.stats import fisher_exact

from spartan.utils import spandas as spd

In [42]:
# Global seaborn figure settings.

# The import cell binds the library only under the name `seaborn`; bind the
# `sns` alias used below so this cell does not raise NameError on a fresh kernel.
import seaborn as sns

# size
sns.set_context("poster")

# Grid style
sns.set_style("whitegrid")

# colors
sns.set_palette(sns.hls_palette(n_colors=5, h=0.59, l=0.4, s=0.75))

# Helper functions

In [2]:
def gsheet_to_dataframe(key):
    """Download a Google spreadsheet as CSV and parse it into a DataFrame.

    Parameters
    ----------
    key : str
        Spreadsheet key substituted into the CSV-export URL.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the downloaded content.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    url = 'https://docs.google.com/spreadsheet/ccc?key={key}&output=csv'.format(key=key)
    response = requests.get(url)

    # Fail loudly on an HTTP error instead of handing an error page to the
    # CSV parser.
    response.raise_for_status()

    # NOTE: `response.content` is a byte string; it is wrapped with the
    # Python 2 `StringIO` class that this notebook imports.
    return pd.read_csv(StringIO(response.content))


# Load spreadsheets

In [3]:
# Load the OpenRefine-processed Google spreadsheet export and preview it.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# the file exists at this location.
gsh_csv_path = "/home/gus/Documents/YalePostDoc/project_stuff/G_pallidipes_kenya/Spiroplasma/MF_Spiro_gsh.csv"
table_gsh = pd.read_csv(gsh_csv_path)
table_gsh.head()

Unnamed: 0,Location Code,Collection Year,Fly Number,Spiroplasma
0,CHU,2014,4,True
1,CHU,2014,6,False
2,CHU,2014,7,False
3,CHU,2014,10,False
4,CHU,2014,31,False


In [4]:
# Load the OpenRefine-processed Excel export and preview it.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# the file exists at this location.
xls_csv_path = "/home/gus/Documents/YalePostDoc/project_stuff/G_pallidipes_kenya/Spiroplasma/MF_Spiro_xls.csv"
table_xls = pd.read_csv(xls_csv_path)
table_xls.head()

Unnamed: 0,Box,Location Code,Fly Number,Collection Year,Sex,Wolbachia,Spiroplasma
0,RP4,DUK,16,2014,F,False,
1,RP 1,GAN,123,2014,F,False,
2,RP2,GAN,251,2014,F,False,
3,RP2,GAN,261,2014,F,False,
4,RP 1,ORB,71,2014,F,True,


In [5]:
# Run the script that deposits a combined dataframe of the Gff file data into
# the notebook namespace; the following cells use it as `dfp`.
# NOTE(review): absolute, machine-specific script path.
%run /home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/scripts/gff_pandas_database.py

Main dataframe is dfp: and is only Gff.


In [6]:
# Preview the first rows of the dataframe loaded by the script above.
dfp.head()

Unnamed: 0,Comment,Date,Dead,Fly_Number,Hunger_stage,Kept_in,Sex,Species,Teneral,Trap_No,Village,Wing_fray,infection_state,midgut,prob,sal_gland,workbook,worksheet
0,,2014-07-22,False,1,2,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
1,,2014-07-22,False,2,3,EtOH,F,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
2,,2014-07-22,False,3,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
3,,2014-07-22,False,4,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya
4,,2014-07-22,False,5,3,EtOH,M,Gff,False,1,UWA,2,False,False,False,False,2014_spring_summer_from_rob.xlsx,Dissection Data-Nwoya


In [7]:
# Check that the calendar year can be extracted from the Date column.
pd.DatetimeIndex(dfp.Date.head()).year

array([2014, 2014, 2014, 2014, 2014], dtype=int32)

In [8]:
def recode_dfp(df):
    """Recode a ``dfp``-style frame to the spreadsheet column conventions.

    The input frame is left unmodified; all changes are made on a renamed copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Date', 'Village', 'Fly_Number' and 'Sex'.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns 'Location Code',
        'Collection Year', 'Fly Number' and 'Sex', where 'Collection Year'
        is the calendar year of 'Date'.
    """
    # `rename` returns a new frame, so adding the year column below does not
    # touch the caller's dataframe.
    recoded = df.rename(columns={"Village": "Location Code", "Fly_Number": "Fly Number"})
    recoded["Collection Year"] = pd.DatetimeIndex(df["Date"]).year
    return recoded[["Location Code", "Collection Year", "Fly Number", "Sex"]]

In [9]:
# Apply recode_dfp to a copy of dfp and preview the recoded columns.
d = recode_dfp(dfp.copy())
d.head()

Unnamed: 0,Location Code,Collection Year,Fly Number,Sex
0,UWA,2014,1,F
1,UWA,2014,2,F
2,UWA,2014,3,M
3,UWA,2014,4,M
4,UWA,2014,5,M


In [10]:
# Recover sex data into table_gsh by left-joining with the recoded dfp on the
# columns that identify a fly.
# Any `Sex` column left behind by an earlier run of this cell is dropped first,
# so re-running the cell does not produce `Sex_x`/`Sex_y` duplicates.
table_gsh = pd.merge(left=table_gsh.drop("Sex", axis=1, errors="ignore"),
                     right=recode_dfp(dfp.copy()),
                     how='left',
                     on=["Location Code", "Collection Year", "Fly Number"],
                     sort=False)

In [11]:
# Preview the merged table; it now carries a Sex column.
table_gsh.head()

Unnamed: 0,Location Code,Collection Year,Fly Number,Spiroplasma,Sex
0,CHU,2014,4,True,F
1,CHU,2014,6,False,F
2,CHU,2014,7,False,M
3,CHU,2014,10,False,M
4,CHU,2014,31,False,M


In [12]:
# Stack the two sources into one table, keeping only rows without missing values.
# ignore_index=True gives the combined table fresh, unique row labels; without
# it the row labels of both inputs are kept and repeat.
table_all = pd.concat([table_gsh.dropna(),
                       table_xls[["Location Code", "Collection Year", "Fly Number", "Sex", "Spiroplasma"]].dropna()],
                      ignore_index=True)
table_all.head()

Unnamed: 0,Collection Year,Fly Number,Location Code,Sex,Spiroplasma
0,2014,4,CHU,F,True
1,2014,6,CHU,F,False
2,2014,7,CHU,M,False
3,2014,10,CHU,M,False
4,2014,31,CHU,M,False


In [13]:
# NOTE(review): repeats the preview already shown by the previous cell.
table_all.head()

Unnamed: 0,Collection Year,Fly Number,Location Code,Sex,Spiroplasma
0,2014,4,CHU,F,True
1,2014,6,CHU,F,False
2,2014,7,CHU,M,False
3,2014,10,CHU,M,False
4,2014,31,CHU,M,False


In [91]:
# Count flies per location, split by Spiroplasma status and sex.
table_all_pivot = table_all.pivot_table(
    values="Fly Number",
    index=["Location Code"],
    columns=["Spiroplasma", "Sex"],
    aggfunc=[len],
    fill_value=0,
    margins=False,
    dropna=True,
)

# Passing aggfunc as a list adds a top 'len' column level; strip it.
table_all_pivot.columns = table_all_pivot.columns.droplevel(0)
table_all_pivot

Spiroplasma,False,False,True,True
Sex,F,M,F,M
Location Code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AMI,10,10,0,1
BOL,6,6,3,1
CHU,7,8,3,0
DUK,3,6,6,4
GAN,11,3,8,3
KIL,12,3,2,0
NGO,12,5,0,0
OCU,6,5,1,1
ORB,22,6,19,7
TUM,9,7,0,0


In [100]:
# Column totals: fly counts per (Spiroplasma, Sex) combination over all locations.
table_all_pivot.sum()

Spiroplasma  Sex
False        F      98
             M      59
True         F      42
             M      17
dtype: int64

In [108]:
# Pooled contingency table: rows are sex, columns are Spiroplasma status.
total_mf_pn = pd.crosstab(index=table_all["Sex"], columns=table_all["Spiroplasma"])
total_mf_pn

Spiroplasma,False,True
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,98,42
M,59,17


In [109]:
# Fisher's exact test on the pooled sex-by-Spiroplasma table;
# the result is (odds ratio, p-value).
fisher_exact(total_mf_pn)

(0.67231638418079098, 0.2647962624658704)

# Dump sex info for Michelle

In [116]:
# table_gsh.to_clipboard()

# Hypothesis tests for single locations and FDR correction

In [123]:
# Group the Google-sheet table by location and list the location codes present.
x = table_gsh.groupby('Location Code')
x.groups.keys()

['AMI', 'AJ', 'NGO', 'BOL', 'CHU', 'KIL', 'OCU', 'TUM']

In [135]:
def cnt_tbl_agg(x):
    """Return the sum of the items in ``x``.

    Parameters
    ----------
    x : iterable of numbers

    Returns
    -------
    The total as computed by the built-in ``sum`` (0 for an empty iterable).
    """
    # The total has to be returned explicitly; a bare `sum(x)` statement
    # computes it and throws it away.
    return sum(x)

def do_tests(df):
    """Run Fisher's exact test of Sex vs. Spiroplasma per location and pooled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Location Code', 'Sex' and 'Spiroplasma'.

    Returns
    -------
    pandas.DataFrame
        One row per location code (in sorted group order) followed by a row
        labelled 'all' for the pooled data, with the columns 'odds_ratio'
        and 'p_val'.

    Raises
    ------
    ValueError
        Raised by ``fisher_exact`` when the data as a whole do not contain
        exactly two sexes and two Spiroplasma states.
    """
    # The levels seen anywhere in the data define the table layout, so a
    # location that lacks one sex or one infection state still yields a full
    # table (padded with zero counts) instead of a shape fisher_exact rejects.
    sex_levels = sorted(df["Sex"].dropna().unique())
    spiro_levels = sorted(df["Spiroplasma"].dropna().unique())

    def _fisher(subset):
        # Contingency table of the subset, forced to the common layout.
        table = pd.crosstab(subset["Sex"], subset["Spiroplasma"])
        table = table.reindex(index=sex_levels, columns=spiro_levels, fill_value=0)
        odds_ratio, p_val = fisher_exact(table)
        return odds_ratio, p_val

    labels = []
    rows = []
    for loc, locdf in df.groupby('Location Code'):
        labels.append(loc)
        rows.append(_fisher(locdf))

    labels.append('all')
    rows.append(_fisher(df))

    # Build the frame from one (odds_ratio, p_val) tuple per row; passing a
    # {label: tuple} dict as `data` would treat the labels as columns.
    return pd.DataFrame(rows, index=labels, columns=["odds_ratio", "p_val"])
    

In [136]:
# pdb

In [137]:
# Run the per-location and pooled Fisher tests on the combined table.
r = do_tests(table_all)

AssertionError: arrays and names must have the same length

> [1;32m/home/gus/anaconda2/envs/stack2/lib/python2.7/site-packages/pandas/tools/pivot.py[0m(407)[0;36m_get_names[1;34m()[0m
[1;32m    406 [1;33m        [1;32mif[0m [0mlen[0m[1;33m([0m[0mnames[0m[1;33m)[0m [1;33m!=[0m [0mlen[0m[1;33m([0m[0marrs[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[0m
[0m[1;32m--> 407 [1;33m            [1;32mraise[0m [0mAssertionError[0m[1;33m([0m[1;34m'arrays and names must have the same length'[0m[1;33m)[0m[1;33m[0m[0m
[0m[1;32m    408 [1;33m        [1;32mif[0m [1;32mnot[0m [0misinstance[0m[1;33m([0m[0mnames[0m[1;33m,[0m [0mlist[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[0m
[0m
ipdb> u
> [1;32m/home/gus/anaconda2/envs/stack2/lib/python2.7/site-packages/pandas/tools/pivot.py[0m(376)[0;36mcrosstab[1;34m()[0m
[1;32m    375 [1;33m[1;33m[0m[0m
[0m[1;32m--> 376 [1;33m    [0mrownames[0m [1;33m=[0m [0m_get_names[0m[1;33m([0m[0mindex[0m[1;33m,[0m [0mrownames[0m[1;33m,[0m [0mprefix[0m[1;33m

In [None]:
# NOTE(review): bare reference to pd.crosstab in a cell with no execution
# count; looks like leftover debugging. Confirm and remove.
pd.crosstab