In [1]:
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import files
from scipy.stats import ttest_ind

In [4]:
# List the worksheets contained in the workbook. The context manager closes the
# underlying file handle as soon as the sheet names have been read.
with pd.ExcelFile('/content/A549_A16_treatment_660126.xlsx') as file:
    sheet_names = file.sheet_names
print(sheet_names)

['Area - ions', 'Area - peptides', 'Area - proteins', 'MLR normalization', 't-test', 'sig.', 'Score', 'FDR', 'Observed RT']


In [5]:
# Load the 't-test' worksheet of the workbook into a DataFrame.
df = pd.read_excel(
    io="/content/A549_A16_treatment_660126.xlsx",
    sheet_name='t-test',
)

In [6]:
# Round-trip the sheet through CSV: write it out without the index, then load
# that CSV back so `df` holds exactly what was saved to disk.
df.to_csv(
    "A549_A16_treatment_660126_t-test.csv",
    header=True,
    index=None,
)
df = pd.read_csv(filepath_or_buffer="A549_A16_treatment_660126_t-test.csv")

In [7]:
# Preview the first row to check the column layout.
df.head(n=1)

Unnamed: 0,Peak Name,m/z,Ret. Time,Group,Use,t-value,p-value,Log (p),Mean 1,Mean 2,Median 1,Median 2,Sigma 1,Sigma 2,Delta,Fold Change,Log (Fold Change)
0,sp|P23284|PPIB_HUMAN,,,Peptidyl-prolyl cis-trans isomerase B OS=Homo ...,True,27.595672,1e-05,-4.982059,114063.030113,5440.307229,116040.817729,6737.091935,4717.824953,4921.756404,108622.722885,20.9662846821696,1.32152147839156


In [8]:
# Show the (rows, columns) dimensions of df.
df.shape

(739, 17)

In [9]:
# Keep only the rows whose 'Fold Change' entry parses as a number; values that
# cannot be parsed are coerced to NaN and dropped by the mask.
df1 = df.loc[
    pd.to_numeric(df['Fold Change'], errors='coerce').notna()
]

In [10]:
# Show the (rows, columns) dimensions of df1 after the 'Fold Change' filter.
df1.shape

(690, 17)

In [None]:
# Build a fresh frame (assign returns a copy) with 'Fold Change' as numeric and
# the two volcano-plot axes appended. Entries are evaluated in order, so the log2
# column is computed from the already-converted 'Fold Change'.
df1 = df1.assign(**{
    'Fold Change': lambda frame: pd.to_numeric(frame['Fold Change']),
    'log2(fold change)': lambda frame: frame['Fold Change'].apply(np.log2),
    '"-log10(p-value)"': lambda frame: -frame['p-value'].apply(np.log10),
})

In [None]:
# Preview the first row to confirm the new log-scale columns are present.
df1.head(n=1)

In [None]:
# 'Peak Name' looks like 'sp|P23284|PPIB_HUMAN': take the third '|'-separated
# field, then the part before the first '_' as the short protein name.
entry_name = df1['Peak Name'].str.split('|').str.get(2)
df1['Protein'] = entry_name.str.split('_').str.get(0)

In [None]:
# Preview the first row to confirm the 'Protein' column was added.
df1.head(n=1)

In [None]:
# Volcano plot for the t-test sheet: log2 fold change on the x axis against
# -log10(p-value) on the y axis.
p_value_cutoff = 0.05

# Split the rows on the p-value cutoff so that each legend entry matches the
# points drawn under it. Rows with a missing p-value compare as False and
# therefore land in the non-significant set.
is_significant = df1['p-value'] < p_value_cutoff
significant_peaks = df1[is_significant]
non_significant_peaks = df1[~is_significant]

# set up plot
fig, ax = plt.subplots()
ax.scatter(
    non_significant_peaks['log2(fold change)'],
    non_significant_peaks['"-log10(p-value)"'],
    color='blue',
    label='Non-significant',
)
ax.scatter(
    significant_peaks['log2(fold change)'],
    significant_peaks['"-log10(p-value)"'],
    color='red',
    label='Significant',
)

# Guide lines: the p-value cutoff, and log2 fold change of +1 / -1.
ax.axhline(y=-np.log10(p_value_cutoff), color='gray', linestyle='--')
ax.axvline(x=1, color='gray', linestyle='--')
ax.axvline(x=-1, color='gray', linestyle='--')
ax.set_xlabel('log2(fold change)')
ax.set_ylabel('-log10(p-value)')
ax.set_title('Volcano plot')
ax.legend()

# add text labels for significant peaks
for idx, row in significant_peaks.iterrows():
    ax.text(row['log2(fold change)'], row['"-log10(p-value)"'], row['Protein'], fontsize=6)

# Write the figure to disk.
fig.savefig('volcano_plot.png', dpi=300, bbox_inches='tight')

In [None]:
# Export the rows of df1 with p-value below 0.05, smallest p-value first.
df1_sp = df1.loc[df1['p-value'].lt(0.05)].sort_values('p-value')
df1_sp.to_csv(
    'significant_peaks_remove_zero.csv',
    header=True,
    index=False,
)

In [None]:
# Same export for the unfiltered table df: rows with p-value below 0.05,
# ordered by ascending p-value.
below_cutoff = df['p-value'] < 0.05
df_sp = df.loc[below_cutoff].sort_values('p-value')
df_sp.to_csv(
    'significant_peaks.csv',
    header=True,
    index=False,
)

In [None]:
# Load the 'MLR normalization' worksheet. The path uses the underscore spelling
# of the workbook name, matching the file whose sheet list and 't-test' sheet
# are read earlier in this notebook (the previous spelling had a space).
df_MLR = pd.read_excel(
    "/content/A549_A16_treatment_660126.xlsx",
    sheet_name='MLR normalization',
)

In [None]:
# Round-trip the MLR sheet through CSV: save it without the index, then reload
# the saved file into `df_MLR`.
df_MLR.to_csv(
    "A549_A16_treatment_660126_MLR.csv",
    header=True,
    index=None,
)
df_MLR = pd.read_csv(filepath_or_buffer="A549_A16_treatment_660126_MLR.csv")

In [None]:
# Preview the first five rows (head's default row count).
df_MLR.head()

In [None]:
# Discard rows that have no value in the second column of the sheet.
second_column = df_MLR.columns[1]
df_MLR = df_MLR.dropna(subset=[second_column])

In [None]:
# Preview the first four rows after dropping the incomplete ones.
df_MLR.head(n=4)

In [None]:
# Row-wise mean of the three SWATH replicate columns of each condition.
treatment_replicates = [
    'A549_A16treatment_SWATH_1',
    'A549_A16treatment_SWATH_2',
    'A549_A16treatment_SWATH_3',
]
nontreatment_replicates = [
    'A549_nontreatment_SWATH_1',
    'A549_nontreatment_SWATH_2',
    'A549_nontreatment_SWATH_3',
]
df_MLR['A16treatment_mean'] = df_MLR[treatment_replicates].mean(axis='columns')
df_MLR['nontreatment_mean'] = df_MLR[nontreatment_replicates].mean(axis='columns')

In [None]:
# Two-sample t-test per row (axis=1): the three treatment replicates against
# the three non-treatment replicates, using ttest_ind's default settings.
# ttest_ind is already imported in the notebook's import cell, so it is not
# re-imported here.
treatment_values = df_MLR[[
    'A549_A16treatment_SWATH_1',
    'A549_A16treatment_SWATH_2',
    'A549_A16treatment_SWATH_3',
]]
nontreatment_values = df_MLR[[
    'A549_nontreatment_SWATH_1',
    'A549_nontreatment_SWATH_2',
    'A549_nontreatment_SWATH_3',
]]
# The result unpacks into the test statistic and the p-value, one entry per row.
df_MLR['t_test'], df_MLR['p_value'] = ttest_ind(treatment_values, nontreatment_values, axis=1)

In [None]:
# Preview the first four rows with the mean, t_test and p_value columns added.
df_MLR.head(n=4)

In [None]:
# Show the (rows, columns) dimensions of df_MLR.
df_MLR.shape

In [None]:
# Keep only the rows whose non-treatment mean is not zero.
df_MLRrm = df_MLR.loc[df_MLR['nontreatment_mean'].ne(0)]

In [None]:
# Show the (rows, columns) dimensions of df_MLRrm after removing zero-mean rows.
df_MLRrm.shape

In [None]:
# Work on an independent copy so the new columns are not written into a slice
# of df_MLR.
df_MLRrm = df_MLRrm.copy()

# Fold change = treatment mean / non-treatment mean. Both operands come from
# the filtered frame, so the division does not depend on index alignment with
# the unfiltered df_MLR.
df_MLRrm['FC_ACP/contrl'] = df_MLRrm['A16treatment_mean'] / df_MLRrm['nontreatment_mean']

# Log-scale axes for the volcano plot.
df_MLRrm['log2(FC_ACP/contrl)'] = df_MLRrm['FC_ACP/contrl'].apply(np.log2)
df_MLRrm['"-log10(p_value)"'] = -1 * df_MLRrm['p_value'].apply(np.log10)

# Short protein name: third '|'-separated field of 'Sample Name', up to the
# first '_' (presumably the same 'sp|ACCESSION|NAME_SPECIES' layout as the
# t-test sheet's 'Peak Name' - confirm against the data).
df_MLRrm['Protein'] = df_MLRrm['Sample Name'].str.split('|').str[2].str.split('_').str[0]

In [None]:
# Preview the first four rows with the fold-change and protein columns added.
df_MLRrm.head(n=4)

In [None]:
# List the column labels of df_MLRrm.
df_MLRrm.columns

In [None]:
# Volcano plot for the manually computed statistics: log2 fold change on the
# x axis against -log10(p_value) on the y axis.
p_value_cutoff = 0.05

# Split the rows on the p-value cutoff so that each legend entry matches the
# points drawn under it. Rows with a missing p_value compare as False and
# therefore land in the non-significant set.
is_significant = df_MLRrm['p_value'] < p_value_cutoff
significant_peaks = df_MLRrm[is_significant]
non_significant_peaks = df_MLRrm[~is_significant]

# set up plot
fig, ax = plt.subplots()
ax.scatter(
    non_significant_peaks['log2(FC_ACP/contrl)'],
    non_significant_peaks['"-log10(p_value)"'],
    color='blue',
    label='Non-significant',
)
ax.scatter(
    significant_peaks['log2(FC_ACP/contrl)'],
    significant_peaks['"-log10(p_value)"'],
    color='red',
    label='Significant',
)

# Guide lines: the p-value cutoff, and log2 fold change of +1 / -1.
ax.axhline(y=-np.log10(p_value_cutoff), color='gray', linestyle='--')
ax.axvline(x=1, color='gray', linestyle='--')
ax.axvline(x=-1, color='gray', linestyle='--')
ax.set_xlabel('log2(fold change)')
ax.set_ylabel('-log10(p-value)')
ax.set_title('Volcano plot')
ax.legend()

# add text labels for significant peaks
for idx, row in significant_peaks.iterrows():
    ax.text(row['log2(FC_ACP/contrl)'], row['"-log10(p_value)"'], row['Protein'], fontsize=6)

# Write the figure to disk.
fig.savefig('volcano_plot_manual.png', dpi=300, bbox_inches='tight')

In [None]:
# Export the rows of df_MLRrm with p_value below 0.05, smallest p_value first.
df_MLRrm_sp = df_MLRrm.loc[df_MLRrm['p_value'].lt(0.05)].sort_values('p_value')
df_MLRrm_sp.to_csv(
    'significant_peaks_remove_zero_MLR.csv',
    header=True,
    index=False,
)

In [None]:
# Same export for the unfiltered table df_MLR: rows with p_value below 0.05,
# ordered by ascending p_value.
below_cutoff_MLR = df_MLR['p_value'] < 0.05
df_MLR_sp = df_MLR.loc[below_cutoff_MLR].sort_values('p_value')
df_MLR_sp.to_csv(
    'significant_peaks_MLR.csv',
    header=True,
    index=False,
)

In [None]:
# Bundle every CSV table and PNG figure in the working directory into a single
# archive and hand it to the browser for download.
zip_file_name = 'outputs.zip'

# Collect the output files. Only regular files are kept, so a directory whose
# name happens to end in .csv/.png is not picked up.
file_list = os.listdir()
output_file_paths = [
    os.path.join(os.getcwd(), file_name)
    for file_name in file_list
    if file_name.endswith(('.csv', '.png')) and os.path.isfile(file_name)
]

if output_file_paths:
    # Build the archive with the standard library instead of interpolating the
    # paths into a shell command: names containing spaces or shell
    # metacharacters are handled safely. Mode 'w' starts a fresh archive, so
    # entries from an earlier run are not carried over.
    with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for output_path in output_file_paths:
            # Store each file under its bare name rather than its full path.
            archive.write(output_path, arcname=os.path.basename(output_path))

    # Download the zip file
    files.download(zip_file_name)
else:
    print('No output files found')
