In [None]:
# Group 33, Florida Atlantic University
# Fold change model for feature extraction
# 11/19/24

import numpy as np
import pandas as pd

# Load the labelled miRNA table (path is relative to the notebook's working directory).
labeled_miRNA_data = pd.read_csv(
    filepath_or_buffer='../processed_data/miRNA_stage_subtype.csv',
)


In [3]:
# Preview the first five rows of the loaded table (head() defaults to n=5).
labeled_miRNA_data.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,stage,subtype
0,7314.747386,7391.483138,7334.393081,10994.201497,471.496698,318.193106,1156.241547,3272.099771,3363.611772,442.783758,...,0.0,0.0,1.847031,0,40.298863,35.429417,148.602058,12118.707689,1,2
1,9518.042994,9460.443528,9574.874468,17578.281899,785.810318,358.652676,771.986446,3871.452122,3917.224498,487.829079,...,0.0,128.562009,4.607957,0,8.60152,38.86044,111.512567,7471.802757,1,2
2,4479.97634,4387.407628,4447.955716,12394.31011,404.624244,855.241747,246.267705,1353.016896,1415.311564,416.8503,...,0.0,161.267504,1.746579,0,33.767203,31.43843,168.253822,16026.613214,1,2
3,21277.962603,21166.590502,21255.800397,15161.474118,6684.570363,503.278464,2185.922959,15012.229891,14987.262342,1107.549261,...,0.0,1.683206,10.660302,0,5.049617,95.101114,1416.978551,12750.562682,1,2
4,8002.355461,8013.396682,8033.638922,19358.942067,1276.411235,765.754731,593.005616,2630.801098,2649.43316,367.580673,...,0.0,97.990843,3.450382,0,22.77252,46.235116,455.450396,14401.203493,1,2


In [None]:
# Mean RPM of every miRNA within each stage (stage 0 == healthy/negative).
# 'subtype' is dropped by name, rather than by column position, so that a
# change in column order cannot cause the wrong column to be discarded.
rpm_avg_across_stages = (
    labeled_miRNA_data
    .drop(columns='subtype')
    .groupby('stage')
    .mean()
)

# Every stage other than 0 is a positive case. Selecting by the stage label
# (not by row position) raises a KeyError if the data has no stage-0 rows,
# instead of silently treating the lowest remaining stage as healthy.
positive_cases = rpm_avg_across_stages.drop(index=0)

# Collapse the positive stages into one row by averaging the per-stage averages.
# NOTE: this is an unweighted mean of stage means, so every stage contributes
# equally regardless of how many samples it contains.
rpm_avg_posistive_cases = positive_cases.mean().to_frame().transpose()

# One row per miRNA; column 0 = healthy (stage 0) average, column 1 = positive average.
rpm_avg_negative_posistive = pd.concat(
    [rpm_avg_across_stages.loc[[0]], rpm_avg_posistive_cases],
    ignore_index=True,
).transpose()

rpm_avg_negative_posistive.head(5)

Unnamed: 0,0,1
hsa-let-7a-1,9364.865318,9854.35425
hsa-let-7a-2,9319.533377,9839.225444
hsa-let-7a-3,9399.771735,9885.311293
hsa-let-7b,15241.225121,17199.333693
hsa-let-7c,1979.238756,1550.109942


In [83]:
def log_fold_change(miRNA, negative, posistive, pseudocount=0.0001, verbose=True):
    """Return the log2 fold change of the positive average over the negative average.

    Parameters
    ----------
    miRNA
        Label of the miRNA; only used in the warning message.
    negative : float
        Average value for the negative group (the denominator).
    posistive : float
        Average value for the positive group (the numerator).
    pseudocount : float, default 0.0001
        Small value added to both averages before the ratio is taken.
    verbose : bool, default True
        When True, print a warning for every miRNA whose negative average is
        exactly zero. Pass False to keep the notebook output short.

    Returns
    -------
    float
        ``log2((posistive + pseudocount) / (negative + pseudocount))``, or
        ``0`` when ``negative`` is exactly zero.
    """
    # miRNAs whose negative average is exactly 0 are reported as "no change"
    # (0), whatever the positive average is, rather than as a very large
    # fold change computed against the pseudocount alone.
    if negative == 0:
        if verbose:
            print(f'Warning divide by zero on : \n{miRNA}')
            print(f'Negative: {negative}')
            print(f'Posistive: {posistive}\n')
        return 0

    # The zero-denominator case was handled above; the pseudocount is what
    # keeps the result finite when the positive average is 0 (log2(0) = -inf).
    return np.log2((posistive + pseudocount) / (negative + pseudocount))


In [None]:
# Map each miRNA name to its log2 fold change. Column 0 is passed as the
# negative average and column 1 as the positive average.
rpm_log_fold_change = {}

for miRNA_name, averages in rpm_avg_negative_posistive.iterrows():
    negative_avg, positive_avg = averages[0], averages[1]
    rpm_log_fold_change[miRNA_name] = log_fold_change(miRNA_name, negative_avg, positive_avg)

hsa-mir-103b-1
Negative: 0.0
Posistive: 0.0

hsa-mir-103b-2
Negative: 0.0
Posistive: 0.0

hsa-mir-1182
Negative: 0.0
Posistive: 0.008314896875430224

hsa-mir-1183
Negative: 0.0
Posistive: 0.0

hsa-mir-1184-1
Negative: 0.0
Posistive: 0.00013165503246753249

hsa-mir-1184-2
Negative: 0.0
Posistive: 0.00042385740282685513

hsa-mir-1184-3
Negative: 0.0
Posistive: 0.00011577759740259739

hsa-mir-1193
Negative: 0.0
Posistive: 0.011515541323711624

hsa-mir-1199
Negative: 0.0
Posistive: 0.003159557647767427

hsa-mir-1200
Negative: 0.0
Posistive: 0.0

hsa-mir-1202
Negative: 0.0
Posistive: 0.0

hsa-mir-1203
Negative: 0.0
Posistive: 0.00048761980290027996

hsa-mir-1204
Negative: 0.0
Posistive: 0.0007793434990362993

hsa-mir-1205
Negative: 0.0
Posistive: 0.0

hsa-mir-1206
Negative: 0.0
Posistive: 0.00012312941696113072

hsa-mir-1207
Negative: 0.0
Posistive: 0.0007800698071451517

hsa-mir-1208
Negative: 0.0
Posistive: 0.0

hsa-mir-1233-1
Negative: 0.0
Posistive: 0.0

hsa-mir-1233-2
Negative: 0.0
Pos

In [86]:
# Turn the {miRNA: log2 fold change} mapping into a single-column frame
# (column label 0) indexed by miRNA name.
# NOTE: this rebinds the name from a dict to a DataFrame, so the cell can only
# be run once per kernel session without re-running the loop that builds the dict.
rpm_log_fold_change = pd.DataFrame(
    list(rpm_log_fold_change.values()),
    index=list(rpm_log_fold_change.keys()),
)

rpm_log_fold_change.head()

Unnamed: 0,0
hsa-let-7a-1,0.073503
hsa-let-7a-2,0.078287
hsa-let-7a-3,0.072661
hsa-let-7b,0.174374
hsa-let-7c,-0.352575


In [96]:
# Order the miRNAs by log2 fold change, largest (most up-regulated) first.
rpm_log_fold_change_sorted = rpm_log_fold_change.sort_values(
    by=[0],
    ascending=[False],
)

print("Top 10 most upregualted genes in lung cancer paitents")
rpm_log_fold_change_sorted.iloc[:10]

Top 10 most upregualted genes in lung cancer paitents


Unnamed: 0,0
hsa-mir-520a,8.776809
hsa-mir-520f,7.479981
hsa-mir-518c,7.341205
hsa-mir-516b-1,6.97444
hsa-mir-512-1,6.891411
hsa-mir-525,6.885598
hsa-mir-518a-2,6.59447
hsa-mir-518b,6.498128
hsa-mir-520c,6.482026
hsa-mir-526b,6.481238


In [95]:
# The frame is sorted in descending order, so its last ten rows are the
# lowest fold changes, ending with the lowest value.
print("Top 10 most downregulated genes in lung cancer paitents")
rpm_log_fold_change_sorted.iloc[-10:]

Top 10 most downregulated genes in lung cancer paitents


Unnamed: 0,0
hsa-mir-4439,-2.762795
hsa-mir-4510,-2.777789
hsa-mir-663b,-2.855177
hsa-mir-3937,-2.910867
hsa-mir-8085,-2.992225
hsa-mir-1282,-3.62563
hsa-mir-3907,-3.88802
hsa-mir-631,-4.13724
hsa-mir-4663,-6.025543
hsa-mir-571,-6.052735
