1. All the search results were combined, and then the micropeptide repeats were combined to integrate as many expressions as possible

In [1]:
import pandas as pd


df = pd.read_csv('../02p_exp/combine_all.csv', sep=',', header=None)

# search duplicates
duplicates = df[df.duplicated(subset=0, keep=False)]

# Merge duplicates
merged_duplicates = duplicates.groupby(0).apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).drop_duplicates()

# update df
df = pd.concat([df[~df[0].isin(duplicates[0])], merged_duplicates])

# check duplicates
duplicates_after_merge = df[df.duplicated(subset=0, keep=False)]

# if there are still duplicates, keep the first one
df = df.drop_duplicates(subset=0, keep='first')

df.to_csv('../02p_exp/sample_raw_uni.csv', sep=',', header=None, index=None)

  df = pd.read_csv('../02p_exp/combine_all.csv', sep=',', header=None)
  merged_duplicates = duplicates.groupby(0).apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')).drop_duplicates()


2. Assign the sample name to each column

In [3]:
import csv

sample_names = []
with open('../../source/PDC_study_experimental.csv', 'r') as file:
    reader = csv.reader(file, delimiter=',')
    next(reader)  # Skip header
    for row in reader:
        values = row[3:13]
        for value in values:
            split_values = value.split('\n')
            first_split = split_values[0]
            last_split = split_values[-1].split(' ')[-1]
            extracted_value = first_split + '_' + last_split
            sample_names.append(extracted_value)

# Replace column names
with open('../02p_exp/sample_raw_uni.csv', 'r') as file:
    reader = csv.reader(file, delimiter=',')
    rows = list(reader)
rows[0][1:] = sample_names

with open('../02p_exp/sample_raw_case.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerows(rows)

3. Tumor Normal was grouped and sorted and orderedd, and mat missing values were filled

In [None]:
import pandas as pd

df = pd.read_csv('~/test.txt',sep="\t")

tumor_columns = [col for col in df.columns if 'Tumor' in col]
normal_columns = [col for col in df.columns if 'Normal' in col]

# order： normal_columns + tumor_columns
new_columns =  normal_columns + tumor_columns
new_df = df[new_columns]

new_df.to_csv('/Users/suxinwan/Desktop/test-0325.csv', index=False)

# print the number of columns containing 'Tumor' and 'Normal'
print("Tumor number：", len(tumor_columns))
print("Normal number：", len(normal_columns))

4. Differential expression analysis

In [None]:
import pandas as pd
from scipy import stats
from statsmodels.stats import multitest
import numpy as np

data = pd.read_csv('~/test.csv',index_col=0)
sample_columns = data.filter(regex='Tumor|Normal').columns

result_data = pd.DataFrame(columns=['orf_id', 'T-Statistic', 'P-Value', 'Log2FC', 'FDR'])

# store logFC
logfc_values = pd.Series(index=data.index)
 
# t.test 
for _, row in data.iterrows():

    normal_data = row[sample_columns].values[:11]
    tumor_data = row[sample_columns].values[12:]

    t_statistic, p_value = stats.ttest_ind(tumor_data, normal_data)

    # calculate logFC
    tumor_mean = np.mean(tumor_data)
    normal_mean = np.mean(normal_data)
    fc = np.log2(tumor_mean / (normal_mean+0.0001))

    logfc_values[row.name] = fc

    # add result
    result_data = pd.concat([result_data, pd.DataFrame({'orf_id': [row.name], 'T-Statistic': [t_statistic], 'P-Value': [p_value], 'Log2FC': [fc]})], ignore_index=True)

# calculate FDR
p_values = result_data['P-Value']
fdr = multitest.multipletests(p_values, method='fdr_bh')[1]

# add result to result_data
result_data['FDR'] = fdr
result_data.to_csv("~/de_sample.csv", index=False)

In [3]:
import pandas as pd
from pyensembl import EnsemblRelease

# load ensembl
ensembl = EnsemblRelease(110)

df = pd.read_csv('../02p_exp/de_sample.csv')

# select p < 0.05
significant_rows = df[df['P-Value'] < 0.05]

first_column = significant_rows.iloc[:, 0]
split_values = first_column.str.split('_')

ensg_ids = []
for split_value in split_values:
    try:
        enst_id = split_value[0]  
        ensg_id = ensembl.transcript_by_id(enst_id).gene_id
    except Exception as e:
        ensg_id = 'NA'
    ensg_ids.append(ensg_id)

# add ENSG
significant_rows['ENSG_ID'] = ensg_ids

significant_rows.to_csv('../02p_exp/de_sample_with_ensg.csv', index=False)


保存成功！


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  significant_rows['ENSG_ID'] = ensg_ids


5. The expression of differential RNA

In [5]:
import pandas as pd

data = pd.read_csv("../02p_exp/sample_de_order.csv")

orf_id_data = data[data["orf_id"].notnull()]

# select tumor
selected_columns = ["orf_id"] + [col for col in data.columns if "Tumor" in col]
orf_id_tumor_data = orf_id_data[selected_columns]

subset_list = pd.read_csv("../04co_exp/ucec_de_list.txt", sep="\t")

# merge data
result_data = pd.merge(orf_id_tumor_data, subset_list, on="orf_id")
result_data.to_csv("../04co_exp/de_case_exp.csv", index=False)

# read count matrix
count_matrix = pd.read_csv("../03r_exp/case_COUNT_matrix.csv")
count_matrix['ENSG_ID'] = count_matrix['ENSG_ID'].apply(lambda x: x.split('.')[0])

intersect_ensg = subset_list['ENSG_ID']

# select matched rows
matched_rows = count_matrix[count_matrix['ENSG_ID'].isin(intersect_ensg)]
matched_rows.to_csv("../04co_exp/matched_count_matrix.csv", index=False)

In [None]:
import pandas as pd

# obtain ENSG ID values
df1 = pd.read_csv('../02p_exp/de_sample_with_ensg.csv')
ensg_ids = df1['ENSG_ID'].tolist()

df2 = pd.read_csv('../04co_exp/matched_count_matrix.csv')

new_rows = []
for ensg_id in ensg_ids:
    rows_to_copy = df2[df2['ENSG_ID'] == ensg_id]
    for index, row in rows_to_copy.iterrows():
        new_row = row.to_list()
        new_row.insert(0, ensg_id)
        new_rows.append(new_row)

new_df = pd.DataFrame(new_rows, columns=['ENSG_ID'] + df2.columns.tolist())
new_df.to_csv('../04co_exp/matched_out_matrix_updated.csv', index=False)


In [None]:
import pandas as pd
import scipy.stats as stats

protein_file = "../04co_exp/de_p_exp.csv"
protein_df = pd.read_csv(protein_file, sep=",")

# read rna expression data
rna_file = "../04co_exp/de_r_exp.csv"
rna_df = pd.read_csv(rna_file, sep=",")

# select value for calculation
genes = protein_df.columns[2:]

# store result
result_df = pd.DataFrame(columns=["Gene", "R","P"])

# caluclate correlation value
for gene in genes:
    protein_tumor = protein_df[gene]
    rna_tumor = rna_df[gene]
    rna_data = rna_df[gene]
    
    #protein_tumor_rna_tumor_correlation, tumor_pvalue = stats.spearmanr(protein_tumor, rna_tumor)
    protein_tumor_rna_tumor_correlation, tumor_pvalue = stats.pearsonr(protein_tumor, rna_tumor)   
    result_df.loc[len(result_df)] = [gene, protein_tumor_rna_tumor_correlation,tumor_pvalue]


result_file = "../04co_exp/correlation_results.csv"
result_df.to_csv(result_file, index=False)
