In [None]:
# IMPORTS
import pandas as pd
import numpy as np
import os

Fragment of code to parse the CRC.SW.mRNA.symbol.count.txt file, correct its formatting, and save it as a csv that can be read straight in the analysis.

In [None]:
## LOAD TRANSCRIPTOMICS DATA AND DO BASIC FORMATTING.
## GENERATE CSV FOR FURTHER ANALYSIS.

## Move out of the notebook folder to access datasets.
## NOTE: str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from
## both ends of the string (it does not remove the 'notebooks' suffix), so a
## working directory such as '.../project' would be silently corrupted.
## Step up one level only when the kernel really runs inside 'notebooks'.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Joining with '' guarantees a trailing separator, so plain '+' concatenation works
working_dir = os.path.join(working_dir, '')

# Prepare all the directories and files
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
mrna_count = data_dir + 'CRC.SW.mRNA.symbol.count.txt'

# Read the tab-separated table using its first column as index, transpose it and
# turn the resulting row labels into a regular 'Sample_ID' column.
# (Presumably genes are rows and samples are columns in the raw file - confirm.)
mrna_count_raw = (
    pd.read_csv(mrna_count, sep="\t", index_col=0)
    .T
    .rename_axis(index='Sample_ID')
    .reset_index()
)


## Save the dataset as a file with correct formatting
correctformating = data_dir + 'CRC.SW.mRNA.symbol.count.csv'
mrna_count_raw.to_csv(correctformating, sep=',', index=False)

Read and extract a subset of traits from the key-table, in order to select what clinical traits we want to study.

In [None]:
## LOAD KEY-DATA AND SELECT CLINICAL TRAITS

## Move out of the notebook folder to access datasets.
## NOTE: str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from
## both ends of the string (it does not remove the 'notebooks' suffix), so a
## working directory such as '.../project' would be silently corrupted.
## Step up one level only when the kernel really runs inside 'notebooks'.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Joining with '' guarantees a trailing separator, so plain '+' concatenation works
working_dir = os.path.join(working_dir, '')

# Prepare all the directories and files
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
key_table_dir = data_dir + 'Supplementary_Table_01.xlsx'


## Load the datasets and fix formatting
# header=2: the column names are taken from the third row of the sheet
key_table = pd.read_excel(key_table_dir, header=2)

## Subset the clinical traits we want to study (shared by tumor and normal samples)
clinical_traits = ['Gender', 'Primary Site Disease', 'Histology Subtype', 'Tumour Stage']

key_table_subset_tumor = key_table[['RNA Tumor Sample Barcode'] + clinical_traits]
key_table_subset_tumor = key_table_subset_tumor.rename(columns={'RNA Tumor Sample Barcode':'Sample_ID'})


key_table_subset_normal = key_table[['RNA Normal Sample Barcode'] + clinical_traits]
# Drop the rows whose normal barcode is 'Not_Applicable' (tumor-only entries)
key_table_subset_normal = key_table_subset_normal[key_table_subset_normal['RNA Normal Sample Barcode'] != 'Not_Applicable']
key_table_subset_normal = key_table_subset_normal.rename(columns={'RNA Normal Sample Barcode':'Sample_ID'})


# Stack tumor rows followed by normal rows under the common 'Sample_ID' column
key_table_subset = pd.concat([key_table_subset_tumor, key_table_subset_normal])


## Save the dataset as a file with correct formatting
correctformating = data_dir + 'Sample_Info_Selection.csv'
key_table_subset.to_csv(correctformating, sep=',', index=False, encoding='utf-8')

key_table_subset

CHECK RANGE OF VALUES PER VARIABLE TO ASSIGN COLORS IN WGCNA

In [None]:
## Inspect which values a clinical trait takes, to assign colors in the WGCNA analysis
sample_info_selection = f"{data_dir}Sample_Info_Selection.csv"
sample_info_tes = pd.read_csv(filepath_or_buffer=sample_info_selection)

# Building a set collapses the repeated values of the 'Tumour Stage' column
unique_values_set = {*sample_info_tes['Tumour Stage']}

# Show the distinct values
print(unique_values_set)

ANALYSIS OF THE UNEQUAL NUMBER OF SAMPLES IN DATASET AND EXCEL
To decide whether all samples (tumor and normal) must be included, or whether they are the same

In [None]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

## Move out of the notebook folder to access datasets.
## NOTE: str.strip('notebooks') removes any of the characters n/o/t/e/b/k/s from
## both ends of the string (it does not remove the 'notebooks' suffix), so a
## working directory such as '.../project' would be silently corrupted.
## Step up one level only when the kernel really runs inside 'notebooks'.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Joining with '' guarantees a trailing separator, so plain '+' concatenation works
working_dir = os.path.join(working_dir, '')

# Prepare all the files for loading
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'
mrna_count = data_dir + 'CRC.SW.mRNA.symbol.count.csv'
sample_info_selection = data_dir + 'Sample_Info_Selection.csv'

In [None]:
# Load the CSV located at `mrna_count` and display the resulting frame
genee = pd.read_csv(filepath_or_buffer=mrna_count)
genee

In [None]:
# Load the CSV located at `sample_info_selection` and display the resulting frame
test = pd.read_csv(filepath_or_buffer=sample_info_selection)
test

In [None]:
## Check if all samples .T and .N need to be included separately

def find_rows_by_value(df, column_name, value1, value2):
    """Locate the rows holding ``value1`` and ``value2`` and compare them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search.
    column_name : str
        Column in which ``value1`` and ``value2`` are looked up.
    value1, value2
        Values identifying the first and the second row to compare.

    Returns
    -------
    str, bool or dict
        An explanatory message when the two values do not identify exactly
        two rows (one per value); otherwise the result of
        ``compare_rows_directly``. The first row passed to it is always the
        one matching ``value1`` and the second the one matching ``value2``,
        regardless of their order in ``df``.
    """
    not_found = "Did not find exactly two rows with the specified values"

    # Find rows with the specified values in the given column
    rows = df[df[column_name].isin([value1, value2])]

    # Check if we found exactly two rows
    if len(rows) != 2:
        return not_found

    # Pick each row by its own value, so the comparison follows the argument
    # order and two duplicated rows of a single value are not compared by mistake
    first = rows[rows[column_name] == value1]
    second = rows[rows[column_name] == value2]
    if len(first) != 1 or len(second) != 1:
        return not_found

    # Proceed to compare these two rows
    return compare_rows_directly(first.iloc[0], second.iloc[0])

def compare_rows_directly(row1, row2):
    """Compare two rows label by label.

    Parameters
    ----------
    row1, row2 : pandas.Series
        Rows to compare. Every label of ``row1`` is looked up in ``row2``.

    Returns
    -------
    bool or dict
        ``True`` when no label differs; otherwise a dictionary mapping each
        differing label to the tuple ``(value in row1, value in row2)``.
        Two missing values (NaN/None/NA) are considered equal.
    """
    # Initialize an empty dictionary to hold columns with different values
    differences = {}

    # Compare each column in the two rows
    for column in row1.index:  # row1.index contains the column names
        left, right = row1[column], row2[column]
        left_missing, right_missing = pd.isna(left), pd.isna(right)

        if left_missing or right_missing:
            # NaN != NaN is True in Python, so missing values need an explicit
            # check: they only differ when exactly one side is missing
            differ = not (left_missing and right_missing)
        else:
            differ = left != right

        if differ:
            # If values are different, add them to the differences dictionary
            differences[column] = (left, right)

    # Return True if no differences, otherwise return the differences
    return True if not differences else differences



# Collect the distinct sample identifiers found in each table
unique_values_dataset = {*genee['Sample_ID']}
unique_values_excel = {*test['Sample_ID']}


# List the samples present in the expression dataset but absent from the sample-info table
print(list(unique_values_dataset.difference(unique_values_excel)))

# Compare the healthy (.N) and cancer (.T) tissue rows of one patient,
# to decide if both are worth including
result = find_rows_by_value(
    genee,
    'Sample_ID',
    'CRC.SW.U3032.N',
    'CRC.SW.U3032.T',
)
print(result)