In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import chisquare
import numpy

#Reading normalised data
data = pd.read_csv('normalized_ferret_data.csv')
data.sort_values("Gene", inplace = True) 

#Getting rid of duplicated genes

data.drop_duplicates(subset ="Gene", 
                     keep = False, inplace = True)

#Computing the raw average gene expression values for each item

nctrl_1_avr = (data['Series10_FerretNW_Ctl_d1_1'] + data['Series10_FerretNW_Ctl_d1_2'])/2
ncov_1_avr = (data['Series10_FerretNW_SARS.CoV.2_d1_1'] + data['Series10_FerretNW_SARS.CoV.2_d1_2'])/2

nctrl_3_avr = (data['Series11_FerretNW_Ctl_d3_1'] + data['Series11_FerretNW_Ctl_d3_2'])/2
ncov_3_avr = (data['Series11_FerretNW_SARS.CoV.2_d3_1'] + data['Series11_FerretNW_SARS.CoV.2_d3_2'])/2

tctrl_3_avr = (data['Series14_FerretTrachea_Ctl_d3_1'] + data['Series14_FerretTrachea_Ctl_d3_2'] + data['Series14_FerretTrachea_Ctl_d3_3'] + data['Series14_FerretTrachea_Ctl_d3_4'])/4
tcov_3_avr = (data['Series14_FerretTrachea_SARS.CoV.2_d3_1'] + data['Series14_FerretTrachea_SARS.CoV.2_d3_2'] + data['Series14_FerretTrachea_SARS.CoV.2_d3_3'] + data['Series14_FerretTrachea_SARS.CoV.2_d3_4'])/4
tiav_3_avr = (data['Series14_FerretTrachea_IAV_d3_1'] + data['Series14_FerretTrachea_IAV_d3_2'] + data['Series14_FerretTrachea_IAV_d3_3'] + data['Series14_FerretTrachea_IAV_d3_4'] + data['Series14_FerretTrachea_IAV_d3_5'] + data['Series14_FerretTrachea_IAV_d3_6'])/6

nctrl_7_avr = (data['Series12_FerretNW_Ctl_d7_1'] + data['Series12_FerretNW_Ctl_d7_2'])/2
ncov_7_avr = (data['Series12_FerretNW_SARS.CoV.2_d7_1'] + data['Series12_FerretNW_SARS.CoV.2_d7_2'])/2
niav_7_avr = (data['Series12_FerretNW_IAV_d7_1'] + data['Series12_FerretNW_IAV_d7_2'])/2

nctrl_14_avr = (data['Series13_FerretNW_Ctl_d14_1'] + data['Series13_FerretNW_Ctl_d14_2'])/2
ncov_14_avr = (data['Series13_FerretNW_SARS.CoV.2_d14_1'] + data['Series13_FerretNW_SARS.CoV.2_d14_2'])/2

def clean_data(data,control,covid,bar):
    drops=[]
    
    for i in data.index:
        if abs(control[i]-covid[i]) <= bar or abs(control[i]-covid[i])/(covid[i]+0.1) <=0.5 or control[i]==0 or covid[i]==0:
                drops += [i];
    cleaned_ctrl = control.drop(drops, axis = 0)
    cleaned_cov = covid.drop(drops,axis=0)
    return cleaned_ctrl, cleaned_cov

day1_ctrl, day1_cov = clean_data(data, nctrl_1_avr,ncov_1_avr,1)
day3_ctrl, day3_cov = clean_data(data,nctrl_3_avr,ncov_3_avr,1)
day7_ctrl, day7_cov = clean_data(data,nctrl_7_avr,ncov_7_avr,1)
day14_ctrl, day14_cov = clean_data(data,nctrl_14_avr,ncov_14_avr,1)

#Returns a list of relevant genes sorted by chi-square in descending order for each day

def sig_genes(ctrl,cov, day):
    chisquares = []
    for i in range(len(ctrl)):
        chisquares.append(chisquare(ctrl.values[i],cov.values[i])[0])
                          
    chi_incl_comp_df = pd.DataFrame([ctrl,cov],index = ['control_day'+str(day), 'covid_day'+str(day)]).T
    chi_incl_comp_df['chi'] = chisquares
    sorted_by_chi = chi_incl_comp_df.sort_values(by = ['chi'],ascending = False)
    return sorted_by_chi

#Sorted list of genes for each day

sorted_day1 = sig_genes(day1_ctrl, day1_cov,1)
sorted_day3 = sig_genes(day3_ctrl, day3_cov,3)
sorted_day7 = sig_genes(day7_ctrl, day7_cov,7)
sorted_day14 = sig_genes(day14_ctrl, day14_cov,14)

#Writing data for each day into csv files
sorted_day1.to_csv('day1_comparison.csv')
sorted_day3.to_csv('day3_comparison.csv')
sorted_day7.to_csv('day7_comparison.csv')
sorted_day14.to_csv('day14_comparison.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
#Returns genes that have significant chi-square values in one of the 4 days

def get_genes(day1,day3,day7,day14):
    genes = []
    for i in sorted_day1.index:
        if i in sorted_day3.index[:1000] or i in sorted_day7.index[:1000] or i in sorted_day14.index[0:1000]:
            genes.append(data.loc[i])
    return genes

#Putting together an overall file with relevant genes and chi-square values

overall = pd.concat([sorted_day1,sorted_day3,sorted_day7,sorted_day14],axis=1)

#Return the set of relevant genes that have high significant values
important_genes = pd.DataFrame(get_genes(sorted_day1,sorted_day3,sorted_day7,sorted_day14))

important_genes.to_csv('important_genes.csv')
overall.to_csv('overall.csv')

In [3]:
#Computing the average across all days for control and covid group

ctrl_avr_alldays = (nctrl_1_avr+ nctrl_1_avr+ nctrl_1_avr + nctrl_1_avr)/4
cov_avr_alldays = (ncov_1_avr+ ncov_1_avr+ ncov_1_avr + ncov_1_avr)/4

#Produces list of significantly different genes in desceding order wrt chisquare values
sorted_by_chi_all = sig_genes(cov_avr_alldays, ctrl_avr_alldays,'all')

  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp


In [5]:
#Dropping the NaN and inf Values

drops = []
simp = sorted_by_chi_all
for i in sorted_by_chi_all.index:
    if ( simp['control_dayall'][i]== 0) or (simp['covid_dayall'][i] == 0):
        drops += [i];

newsorted_dayall = sorted_by_chi_all.drop(drops,axis=0)

# Look for an intersection between 1) relevant genes found with a significant difference between covid and control
# for each day and, 2) relevant genes found with significant difference between covid and control on average across 4 days.

counter = 0
genes_2 = []
for i in newsorted_dayall.index[:50]:
    if i in important_genes.index and data.loc[i].Human_Gene == data.loc[i].Human_Gene:
        counter += 1
        genes_2.append(data.loc[i])
print('counter: '+str(counter))

selected_genes = pd.DataFrame(genes_2)

counter: 14


In [6]:
#Plotting the ratio
fig_diff = px.scatter()
for i in selected_genes.index:
    fig_diff.add_trace(go.Scatter(x = [1,3,7,14], y = [abs(ncov_1_avr[i]-nctrl_1_avr[i])/nctrl_1_avr[i],
                                                  abs(ncov_3_avr[i]-nctrl_3_avr[i])/nctrl_3_avr[i],
                                                  abs(ncov_7_avr[i]-nctrl_7_avr[i])/nctrl_7_avr[i],
                                                  abs(ncov_14_avr[i]-nctrl_14_avr[i])/nctrl_14_avr[i]], 
                                                  name = (selected_genes.Gene[i])[-5:]))
fig_diff.update_layout(
    title={
        'x':0.5,
        'text': " Ratio of normalised RNA expression of Covid-normal with time evolution ",
        },
    xaxis_title="Days",
    yaxis_title="Ratio",
    font=dict(
        size=13,
        color="#7f7f7f"
    )
)

fig_diff.update_xaxes(tickvals=[1, 3, 7, 14])
fig_diff.write_html('diff_evolve.html')

#Plotting Selected Covid group time evolution
fig_cov = px.scatter()
for i in selected_genes.index:
    fig_cov.add_trace(go.Scatter(x = [1,3,7,14], y = [ncov_1_avr[i],
                                                  ncov_3_avr[i],
                                                  ncov_7_avr[i],
                                                  ncov_14_avr[i]], 
                                                  name = selected_genes.Gene[i][-5:]))

fig_cov.update_layout(
    title={
        'x':0.5,
        'text': " Normalised RNA expression of Covid group ferrets nasal cell with time evolution ",
        },
    xaxis_title="Days",
    yaxis_title="Gene Count Normalised Value",
    font=dict(
        size=13,
        color="#7f7f7f"
    )
)

fig_cov.update_xaxes(tickvals=[1, 3, 7, 14])
fig_cov.write_html('cov_evolve.html')

#Plotting Selected Control group time evolution

fig_ctrl = px.scatter()
for i in selected_genes.index:
    fig_ctrl.add_trace(go.Scatter(x = [1,3,7,14], y = [nctrl_1_avr[i],
                                                  nctrl_3_avr[i],
                                                  nctrl_7_avr[i],
                                                  nctrl_14_avr[i]], 
                                                  name = (selected_genes.Gene[i])[-5:]))
    
fig_ctrl.update_layout(
    title={
        'x':0.5,
        'text': " Normalised RNA Expression of Control group with time evolution ",
        },
    xaxis_title="Days",
    yaxis_title="Gene Count Normalised Value",
    font=dict(
        size=13,
        color="#7f7f7f"
    )
)

fig_ctrl.update_xaxes(tickvals=[1, 3, 7, 14])
fig_ctrl.write_html('ctrl_evolve.html')

In [7]:
#Getting a list of RNA names from a list of indices:

def get_genenames(sorted_data, num):
    x_genenames = []
    for i in sorted_data.index[:num]:
        x_genenames.append(data.loc[i].Gene)
    return x_genenames

# Chi-square Plot for 10 genes with highest chi-square values between 
# control and covid group averaing across the days

x_genenames = get_genenames(newsorted_dayall,10)
fig_chi = px.bar(x = x_genenames, y = newsorted_dayall['chi'][:10] )
fig_chi.update_layout(
    title={
            'x':0.5,
            'text': " highest 10 Chi-squared values for relevant RNAs on average ",
            },
        xaxis_title="RNA Name",
        yaxis_title="Chi-squared value",
        font=dict(
            size=13,
            color="#7f7f7f"
    ))
fig_chi.write_html('chisquare_alldays.html')

In [8]:
# A combination of bar plots of chi-square sgnificance for each day (4 subplots)
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig_4plots = make_subplots(rows=2, cols=2, subplot_titles=("Day 1", "Day 3", "Day 7", "Day 14"))

genenames_1 = get_genenames(sorted_day1,10)
genenames_3 = get_genenames(sorted_day3,10)
genenames_7 = get_genenames(sorted_day7,10)
genenames_14 = get_genenames(sorted_day14,10)

fig_4plots.add_trace(
    go.Bar(x=genenames_1, y=sorted_day1['chi'][:10]),
    row=1, col=1
)

fig_4plots.add_trace(
    go.Bar(x=genenames_3, y=sorted_day3['chi'][:10]),
    row=1, col=2
)

fig_4plots.add_trace(
    go.Bar(x=genenames_7, y=sorted_day7['chi'][:10]),
    row=2, col=1
)

fig_4plots.add_trace(
    go.Bar(x=genenames_14, y=sorted_day14['chi'][:10]),
    row=2, col=2
)

fig_4plots.update_layout(height=1100, width=1000, 
                         title_text="highest 10 Chi-squared values for relevant RNAs each day", 
                         title_x = 0.5,
                         showlegend = False
                        )

# Update xaxis properties
fig_4plots.update_xaxes(title_text="RNA Names", row=1, col=1)
fig_4plots.update_xaxes(title_text="RNA Names", row=1, col=2)
fig_4plots.update_xaxes(title_text="RNA Names", row=2, col=1)
fig_4plots.update_xaxes(title_text="RNA Names", row=2, col=2)

# Update yaxis properties
fig_4plots.update_yaxes(title_text="Chi-square Value", row=1, col=1)
fig_4plots.update_yaxes(title_text="Chi-square Value", row=2, col=1)

fig_4plots.write_html('1_plot_each_day.html')

In [9]:
genenames_all_long = get_genenames(newsorted_dayall,len(newsorted_dayall))

In [10]:
newsorted_dayall['Gene'] = genenames_all_long
newsorted_dayall['index'] = range(1,len(newsorted_dayall)+1)
newsorted_dayall =newsorted_dayall[['index','Gene','control_dayall','covid_dayall','chi']]