## Requires a Python version that works with outlier_utils

[Resource](https://www.alfredo.motta.name/create-isolated-jupyter-ipython-kernels-with-pyenv-and-virtualenv/) for running multiple versions of Python with Jupyter. The Colab version is 3.6.9, but that version is not available in pyenv, so this notebook uses 3.6.8.

In [1]:
from platform import python_version

# Record which interpreter the kernel is running, so a version mismatch is visible up front.
kernel_python_version = python_version()
print(kernel_python_version)

3.6.8


## Update `path_j` for the current user and install packages

In [2]:
# Google Drive authorization AND googlesheets4 authorization -- you must create a project under the Google API
# console and then download its credentials.
# See the authentication section in the following article:
# https://medium.com/@bretcameron/how-to-use-the-google-drive-api-with-javascript-57a6cc9e5262
# Note: open a service account and extract the .json credentials in the API console, approve the Google Drive API,
# then share the sheet of interest with the service account email address.
import os

# Path to the service-account JSON key file.
# Set the GSHEETS_CREDENTIALS_JSON environment variable to point at your own key file;
# when it is not set, the original hard-coded location (lauren's) is used as the default.
path_j = os.environ.get(
    'GSHEETS_CREDENTIALS_JSON',
    '/Users/owner/Desktop/Berkeley_Work/Projects/2017_San_Diego/June 2018_Data_Analysis/access-via-jupiter-notebook-d18b3ab0993b.json',  # lauren's
)


# The following installs need to be run in the activated virtual environment:
# !pip install gspread
# !pip install oauth2client
# !pip install outliers
# !pip install plotnine
# !pip install scikit-misc
# !pip install outlier_utils
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn  # provides the `sklearn` module; the `sklearn` name on PyPI is a deprecated alias

In [3]:
import sys
import os
from os import path
from datetime import date
import math
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from plotnine import *
from scipy import stats
from scipy.stats import linregress
import gspread
from mizani.formatters import scientific_format

%matplotlib inline

# Set up Google Drive access and Sheets manipulation.
# Requires path_j (path to the service-account JSON key file) to be defined already.
# NOTE(review): gspread is already imported earlier in this cell; this second import is redundant.
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# The single OAuth scope requested for the service account.
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name(path_j, scope)

# Authorize the credentials; `gc` is the gspread client handed to the read_* helpers in the data-loading cell.
gc = gspread.authorize(credentials)
# Earlier direct-open calls, kept for reference. They point at the same spreadsheet IDs as
# samples_url / qpcr_url defined further down.
# routine= gc.open_by_url('https://docs.google.com/spreadsheets/d/163xXZnJjh0LdryxrDhuYxgzHB_IY7Jaew3DpYzoNs9Q/edit#gid=1255279898')
# qpcr=gc.open_by_url('https://docs.google.com/spreadsheets/d/1TzZcB2p55BuG5FbXAA44EFoe8BdRapGyDaVR4hx9NXU/edit#gid=1689469809')
# NOTE(review): skmisc and outliers are not referenced by name in the cells visible here --
# presumably they are needed by plotnine / the custom modules; confirm before removing.
import skmisc
import outliers

# Read in the project's custom modules.
# NOTE(review): star imports pull every public name of each module into the notebook namespace and can
# shadow names defined above (later modules win), so the order of these lines matters. The helpers used
# in later cells (read_sample_data, process_qpcr_raw, ...) presumably come from these modules.
from read_gsheets import * 
from reprocess_qpcr import *
from calculations import *
from qa_qc import *

# Base URLs of the two Google Sheets: the sampling table and the qPCR results.
samples_url = 'https://docs.google.com/spreadsheets/d/163xXZnJjh0LdryxrDhuYxgzHB_IY7Jaew3DpYzoNs9Q'
qpcr_url = 'https://docs.google.com/spreadsheets/d/1TzZcB2p55BuG5FbXAA44EFoe8BdRapGyDaVR4hx9NXU'

# Worksheet (tab) names inside those spreadsheets.
# rna_tab and facility_lookup are passed to read_sample_data; the two qpcr_* tabs to read_qpcr_data.
# NOTE(review): ww_tab and facility_lookup hold the same tab name, and ww_tab is not used in the cells visible here.
rna_tab = 'sample_inventory'
ww_tab='site_lookup'
facility_lookup='site_lookup'
qpcr_results_tab = 'QuantStudio_raw_data'
qpcr_plates_tab = 'Plate_info'

In [4]:
# Read the sample inventory and the raw qPCR results from Google Sheets, then process the raw qPCR data.
# process_qpcr_raw returns three frames: the processed results, the standard-curve table and the
# raw data with outliers flagged ('grubbs_only' is presumably the outlier-detection mode -- confirm in reprocess_qpcr).
sample_data = read_sample_data(gc, samples_url, rna_tab, facility_lookup)
qpcr_raw = read_qpcr_data(gc, qpcr_url, qpcr_results_tab, qpcr_plates_tab)
qpcr_processed, std_curve_df, raw_outliers_flagged_df = process_qpcr_raw(qpcr_raw, 'grubbs_only')

# Left-merge with the sample data, then remove NTCs and Xeno from the averaged data.
# .copy() detaches the filtered frame from its parent so that the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
qpcr_averaged = qpcr_processed.merge(sample_data, how='left', left_on='Sample', right_on='sample_id')
qpcr_averaged = qpcr_averaged[(qpcr_averaged.Sample != 'NTC') &
                              (qpcr_averaged.Target != 'Xeno')].copy()

# Calculations
qpcr_averaged['gc_per_L'] = calculate_gc_per_l(qpcr_averaged) # get gc/L
qpcr_normd = normalize_to_pmmov(qpcr_averaged)
#qpcr_normd = get_GFP_recovery(qpcr_normd) # this converts dates to numeric somehow

## Filter out research batches: drop every row whose batch contains "B" or "V".
# na=True makes a missing batch count as a match, so rows without a batch are dropped as well.
qpcr_normd = qpcr_normd[
    ~(qpcr_normd['batch'].str.contains("B", na=True)
      | qpcr_normd['batch'].str.contains("V", na=True))
].copy()

# Make the 'PBS_result' column and remove extraction controls from the main dataset
qpcr_normd = get_extraction_control(qpcr_normd)

In [5]:
# Run the Xeno inhibition test on the outlier-flagged raw data and attach the sample metadata.
xeno_inhib_full, xeno_control = xeno_inhibition_test(raw_outliers_flagged_df)
xeno_inhib = xeno_inhib_full.merge(
    sample_data,
    how='left',
    left_on='Sample',
    right_on='sample_id',
)

# Samples with at least one dCt above 1 are listed as inhibited; those with at least one dCt at or below 1 as not inhibited.
inhibited = xeno_inhib.loc[xeno_inhib.dCt > 1, 'Sample'].unique()
not_inhibited = xeno_inhib.loc[xeno_inhib.dCt <= 1, 'Sample'].unique()

# Label every row: "No" is checked first, so a sample present in both lists ends up as "No";
# samples in neither list stay 'unknown'.
qpcr_normd["is_inhibited"] = np.select(
    [qpcr_normd.Sample.isin(not_inhibited), qpcr_normd.Sample.isin(inhibited)],
    ["No", "Yes"],
    default='unknown',
)

In [7]:
# Preview the first rows of the processed table, including the new is_inhibited column.
qpcr_normd.head()

Unnamed: 0,plate_id,Sample,Sample_plate,Target,Task,inhibition_testing,template_volume,Q_init_mean,Q_init_std,Q_init_CoV,...,interceptor_population_served,interceptor_general_flow_MGD,gc_per_ul_input,gc_per_L,pmmov_mean,mean_normalized_to_pmmov,log10mean_normalized_to_log10pmmov,log10_mean_normalized_to_pmmov,PBS_result,is_inhibited
0,30,A_INF_072820_2,A_INF_072820_2+30,N1,Unknown,N,5.0,6.441708,2.855597,0.721681,...,,,0.478186,2390.929359,31864.842448,7.5e-05,0.084064,-4.124745,,unknown
1,30,B_SD2_072920_2,B_SD2_072920_2+30,N1,Unknown,N,5.0,18.91941,1.16259,0.126887,...,,,3.189679,15948.39586,56231.193638,0.000284,0.253205,-3.54726,,unknown
2,30,B_SQ_072820_2,B_SQ_072820_2+30,N1,Unknown,N,5.0,555.9695,1.097319,0.076174,...,,,100.93739,504686.951656,101672.813963,0.004964,0.539827,-2.304183,,unknown
3,30,B_SR_072820_2,B_SR_072820_2+30,N1,Unknown,N,5.0,619.2611,1.174815,0.125642,...,,,110.120328,550601.642174,194489.938518,0.002831,0.518225,-2.54806,,unknown
4,30,C_ADA_072820_1,C_ADA_072820_1+30,N1,Unknown,N,5.0,,,,...,,,,,260.008655,,,,,unknown
