In [9]:
# IMPORTS
import pandas as pd
import numpy as np
import os

Parse the CRC.SW.mRNA.symbol.count.txt file, fix its formatting (transpose it so that each row is a sample), and save it as a CSV that can be read directly in the analysis.

In [10]:
## LOAD TRANSCRIPTOMICS DATA AND DO BASIC FORMATTING.
## GENERATE CSV FOR FURTHER ANALYSIS.

## Move out of the notebook folder to access datasets.
## str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from both
## ends of the path (it is not a suffix removal), which corrupts the path when
## the kernel is not started inside 'notebooks'. Step up one level explicitly.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
working_dir = os.path.join(working_dir, '')  # keep the trailing separator the paths below rely on

# Prepare all the directories and files
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
mrna_count = data_dir + 'CRC.SW.mRNA.symbol.count.txt'

# Read the tab-separated table using its first column as index, transpose it,
# and expose the former column labels as a regular 'Sample_ID' column.
# Built as one expression (no inplace mutation) so re-running the cell is safe.
mrna_count_raw = (
    pd.read_csv(mrna_count, sep="\t", index_col=0)
    .T
    .reset_index()
    .rename(columns={'index': 'Sample_ID'})
)


## Save the dataset as a file with correct formatting
correctformating = data_dir + 'CRC.SW.mRNA.symbol.count.csv'
mrna_count_raw.to_csv(correctformating, sep=',', index=False)

Read and extract a subset of traits from the key-table, in order to select what clinical traits we want to study.

In [16]:
## LOAD KEY-DATA AND SELECT CLINICAL TRAITS

## Move out of the notebook folder to access datasets.
## str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from both
## ends of the path (it is not a suffix removal), which corrupts the path when
## the kernel is not started inside 'notebooks'. Step up one level explicitly.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
working_dir = os.path.join(working_dir, '')  # keep the trailing separator the paths below rely on

# Prepare all the directories and files
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
key_table_dir = data_dir + 'Supplementary_Table_01.xlsx'


## Load the dataset; the column names are taken from the third row of the sheet (header=2)
key_table = pd.read_excel(key_table_dir, header=2)

## Clinical traits we want to study (shared by the tumor and the normal subsets)
clinical_traits = ['Gender', 'Primary Site Disease', 'Histology Subtype', 'Tumour Stage']

## Tumor samples: one row per tumor barcode
key_table_subset_tumor = (
    key_table[['RNA Tumor Sample Barcode'] + clinical_traits]
    .rename(columns={'RNA Tumor Sample Barcode': 'Sample_ID'})
)

## Normal samples: same traits, keyed by the normal barcode
key_table_subset_normal = key_table[['RNA Normal Sample Barcode'] + clinical_traits]
# Drop the rows that are tumor-only, i.e. have no normal barcode
has_normal_sample = key_table_subset_normal['RNA Normal Sample Barcode'] != 'Not_Applicable'
key_table_subset_normal = (
    key_table_subset_normal[has_normal_sample]
    .rename(columns={'RNA Normal Sample Barcode': 'Sample_ID'})
)

# Stack tumor rows on top of normal rows. The original row labels are kept, so
# labels repeat between the two halves; the label is not written to the CSV below.
key_table_subset = pd.concat([key_table_subset_tumor, key_table_subset_normal])


## Save the dataset as a file with correct formatting
correctformating = data_dir + 'Sample_Info_Selection.csv'
key_table_subset.to_csv(correctformating, sep=',', index=False, encoding='utf-8')

key_table_subset

Unnamed: 0,Sample_ID,Gender,Primary Site Disease,Histology Subtype,Tumour Stage
0,CRC.SW.U0001.T,Female,Rectum,Adenocarcinoma,Stage II
1,CRC.SW.U0002.T,Female,Colon,Adenocarcinoma,Stage III
2,CRC.SW.U0004.T,Female,Colon,Adenocarcinoma,Stage II
3,CRC.SW.U0005.T,Female,Colon,Adenocarcinoma,Stage III
4,CRC.SW.U0006.T,Male,Colon,Adenocarcinoma,Stage III
...,...,...,...,...,...
935,CRC.SW.UM043.N,Male,Colon,Adenocarcinoma,Stage I
936,CRC.SW.UM044.N,Male,Colon,Adenocarcinoma,Stage II
937,CRC.SW.UM045.N,Male,Colon,Adenocarcinoma,Stage III
941,CRC.SW.UM049.N,Male,Colon,Adenocarcinoma,Stage II


CHECK RANGE OF VALUES PER VARIABLE TO ASSIGN COLORS IN WGCNA

In [None]:
## Collect the distinct values of a trait so colors can be assigned in the WGCNA analysis
sample_info_selection = f"{data_dir}Sample_Info_Selection.csv"
sample_info_tes = pd.read_csv(sample_info_selection)

# Distinct entries of the 'Tumour Stage' column (building a set drops the duplicates)
unique_values_set = {stage for stage in sample_info_tes['Tumour Stage']}

# Show the distinct values
print(unique_values_set)

ANALYSIS OF THE UNEQUAL NUMBER OF SAMPLES IN DATASET AND EXCEL
To decide whether all samples (tumor and normal) must be included, or whether they are the same

In [12]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

## Move out of the notebook folder to access datasets.
## str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from both
## ends of the path (it is not a suffix removal), which corrupts the path when
## the kernel is not started inside 'notebooks'. Step up one level explicitly.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
working_dir = os.path.join(working_dir, '')  # keep the trailing separator the paths below rely on

# Prepare all the files for loading
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
mrna_count = data_dir + 'CRC.SW.mRNA.symbol.count.csv'
sample_info_selection = data_dir + 'Sample_Info_Selection.csv'

In [13]:
# Load the CSV at the path held in `mrna_count` (assigned in an earlier cell) and display it.
# NOTE(review): presumably this is the reformatted count table with one row per sample - confirm.
genee = pd.read_csv(mrna_count)  
genee

Unnamed: 0,Sample_ID,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,AC118549.1
0,CRC.SW.U0001.T,12,1899,43569,1,0,544,0,2168,1529,...,664,1546,335,1096,1780,6,3092,5639,5034,1922
1,CRC.SW.U0002.T,3,2448,16953,3,0,468,1,1262,1464,...,260,1130,336,1009,1862,2,1964,7174,7733,1393
2,CRC.SW.U0004.T,14,2353,12739,1,0,254,2,1914,1694,...,591,1111,459,1703,1659,0,1963,5955,5022,1665
3,CRC.SW.U0005.T,2,789,22840,7,0,330,0,1123,695,...,271,333,343,1050,1089,6,1605,4749,2827,1790
4,CRC.SW.U0006.T,2,571,5547,3,0,300,0,1070,739,...,159,374,168,495,845,6,797,5833,1954,1016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1178,CRC.SW.UM169.T,0,2017,7267,2,0,219,0,2099,2282,...,1131,1914,372,1057,1425,4,2294,6958,7091,2029
1179,CRC.SW.UM170.T,0,1883,6542,2,0,79,0,1496,2168,...,427,702,289,1008,1340,0,1468,5062,5448,1793
1180,CRC.SW.UM171.T,0,2097,1952,11,2,192,0,1122,1840,...,402,784,309,987,1162,2,1147,3962,3890,1498
1181,CRC.SW.UM172.T,0,1124,8560,8,0,341,2,1324,1353,...,814,830,440,1546,1449,6,2939,4881,3569,1797


In [14]:
# Load the CSV at the path held in `sample_info_selection` (assigned in an earlier cell) and display it.
# NOTE(review): the stored output shows an extra 'index' column, while the export cell visible in
# this notebook writes the file with index=False - the file on disk may be stale; confirm.
test = pd.read_csv(sample_info_selection)  
test

Unnamed: 0,index,Sample_ID,Gender,Primary Site Disease,Histology Subtype,Tumour Stage
0,0,CRC.SW.U0001.T,Female,Rectum,Adenocarcinoma,Stage II
1,1,CRC.SW.U0002.T,Female,Colon,Adenocarcinoma,Stage III
2,2,CRC.SW.U0004.T,Female,Colon,Adenocarcinoma,Stage II
3,3,CRC.SW.U0005.T,Female,Colon,Adenocarcinoma,Stage III
4,4,CRC.SW.U0006.T,Male,Colon,Adenocarcinoma,Stage III
...,...,...,...,...,...,...
1178,935,CRC.SW.UM043.N,Male,Colon,Adenocarcinoma,Stage I
1179,936,CRC.SW.UM044.N,Male,Colon,Adenocarcinoma,Stage II
1180,937,CRC.SW.UM045.N,Male,Colon,Adenocarcinoma,Stage III
1181,941,CRC.SW.UM049.N,Male,Colon,Adenocarcinoma,Stage II


In [None]:
## Check whether the .T and .N samples need to be included separately

def find_rows_by_value(df, column_name, value1, value2):
    """Compare the row holding ``value1`` with the row holding ``value2``.

    Each value must identify exactly one row of ``df`` in ``column_name``.
    Looking the two values up separately (instead of one ``isin`` followed by
    a length check) rejects the case where one value occurs twice and the
    other is missing, and guarantees that the first row handed to the
    comparison belongs to ``value1`` regardless of the row order in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search.
    column_name : str
        Column in which ``value1`` and ``value2`` are looked up.
    value1, value2 : scalar
        The two distinct values whose rows are compared.

    Returns
    -------
    str or bool or dict
        An explanatory message when the two rows cannot be identified
        unambiguously; otherwise the result of ``compare_rows_directly``.
    """
    column = df[column_name]
    rows_value1 = df[column == value1]
    rows_value2 = df[column == value2]

    # Exactly one row per value is required, and the values must differ,
    # otherwise there are not two distinct rows to compare.
    if value1 == value2 or len(rows_value1) != 1 or len(rows_value2) != 1:
        return "Did not find exactly two rows with the specified values"

    # Proceed to compare these two rows (value1's row first)
    return compare_rows_directly(rows_value1.iloc[0], rows_value2.iloc[0])

def compare_rows_directly(row1, row2):
    """Report the columns in which two rows hold different values.

    Two missing values (NaN/None/NA) in the same column are treated as equal;
    a plain ``!=`` would flag them because ``NaN != NaN`` is true.

    Parameters
    ----------
    row1, row2 : pandas.Series
        Rows to compare. The labels of ``row1`` drive the comparison, so
        ``row2`` must provide every label present in ``row1``.

    Returns
    -------
    bool or dict
        ``True`` when no column differs, otherwise a dict mapping each
        differing column name to the tuple ``(row1 value, row2 value)``.
    """
    # Columns whose values differ, mapped to the pair of values found
    differences = {}

    for column in row1.index:  # row1.index contains the column names
        left, right = row1[column], row2[column]
        if left != right:
            # Both missing counts as "same", not as a difference
            if pd.isna(left) and pd.isna(right):
                continue
            differences[column] = (left, right)

    # Return True if no differences, otherwise return the differences
    return True if not differences else differences



# Distinct sample IDs in the expression dataset and in the Excel-derived sample table
unique_values_dataset = set(genee['Sample_ID'])
unique_values_excel = set(test['Sample_ID'])


# Samples present in the expression dataset but absent from the sample table.
# Sorted so the printed list is the same on every run (set iteration order is arbitrary).
print(sorted(unique_values_dataset - unique_values_excel, key=str))

# Compare the healthy (.N) and tumor (.T) samples of the same patient, to decide if both are worth including
result = find_rows_by_value(genee, 'Sample_ID', 'CRC.SW.U3032.N', 'CRC.SW.U3032.T')
print(result)