In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import re

In [11]:
# Path to the peptide descriptor table (relative to the notebook's working directory).
DESCRIPTORS_CSV = "datasets/for_regr_descriptors_full.csv"
df = pd.read_csv(DESCRIPTORS_CSV)

In [12]:
# Preview the loaded frame; pandas truncates the rendering of large tables.
df

Unnamed: 0,id,sequence,extra_name,cpp_category,is_cpp,cpp_type,origin,id_uptake,peptide,uptake_type,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,2721,(Acp)-KKKKKRFSFKKSFKLSGFSFKKNKK,,,True,,,938.0,2721.0,Fluorescence intensity,...,0,0,0,0,0,0,0,0,12,0
1,2724,(Acp)-RKRRQTSMTDFYHSKRRLIFS,,,True,,,969.0,2724.0,Fluorescence intensity,...,1,0,0,0,0,0,0,0,2,0
2,2630,?A-RR-[KRRRRRE],,,True,,,720.0,2630.0,Relative Mean Fluorescence intensity (%),...,0,0,0,0,0,0,0,0,1,0
3,2629,?A-RRR-[KRRRRE],,,True,,,719.0,2629.0,Relative Mean Fluorescence intensity (%),...,0,0,0,0,0,0,0,0,1,0
4,2628,?A-RRRR-[KRRRE],,,True,,,718.0,2628.0,Relative Mean Fluorescence intensity (%),...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872,2317,rRrGrKkRr,cTAT-GFP,,True,,,525.0,2317.0,Cellular uptake (%),...,0,0,0,0,0,0,0,0,2,0
873,2324,rrrrrrrrrrrrGC,r12,,True,,,621.0,2324.0,Relative fluorescence (%),...,0,0,0,0,0,0,0,0,0,0
874,3022,stearly-His6-HHHHHHHHHHHHHHHH–NH2,,,True,,,1802.0,3022.0,Relative cellular uptake (%),...,0,0,0,0,0,0,0,0,0,0
875,2714,stearyl-HHHHHHHHHHHHHHHH-RRRRRRRR-NH2,,,True,,,924.0,2714.0,Mean Fluorescence intensity,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Count missing (NaN/None) entries in every column.
missing_values = df.isna().sum()
print(missing_values)

id                    0
sequence              0
extra_name          841
cpp_category        707
is_cpp                0
                   ... 
fr_thiazole           0
fr_thiocyan           0
fr_thiophene          0
fr_unbrch_alkane      0
fr_urea               0
Length: 261, dtype: int64


In [14]:
# Summary statistics of raw_efficiency as loaded from the CSV.
df[['raw_efficiency']].describe()


Unnamed: 0,raw_efficiency
count,877.0
mean,6171.126
std,53327.07
min,0.0
25%,9.762
50%,65.0
75%,555.0
max,1037500.0


In [15]:
def convert_to_number(val):
    """
    Convert a raw efficiency entry to a float.

    Spaces are removed first, then the value is interpreted as follows:
      * missing value (NaN/None) -> NaN
      * '<number'                -> the number after '<'
      * 'mean±error'             -> the mean (first number) only
      * 'a/b'                    -> arithmetic mean of a and b
      * anything else            -> float(value)

    Anything that cannot be parsed, including malformed numbers such as
    '1.2.3±0.1', yields NaN instead of raising.

    Parameters:
      val: scalar taken from the column (str, number, None or NaN).

    Returns:
      float: the parsed value, or NaN.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).replace(' ', '')  # drop spaces, e.g. '1 000' -> '1000'

    try:
        # '<number': use the number itself.
        if text.startswith('<'):
            bound = re.findall(r'<(\d+\.?\d*)', text)
            return float(bound[0]) if bound else np.nan

        # 'mean±error': keep only the mean.
        if '±' in text:
            pairs = re.findall(r'([\d\.]+)±([\d\.]+)', text)
            return float(pairs[0][0]) if pairs else np.nan

        # 'a/b': average the two numbers.
        if '/' in text:
            pairs = re.findall(r'([\d\.]+)/([\d\.]+)', text)
            if not pairs:
                return np.nan
            first, second = pairs[0]
            return (float(first) + float(second)) / 2

        # Plain number.
        return float(text)
    except ValueError:
        # The loose [\d\.]+ patterns also match strings such as '1.2.3' or '.',
        # which float() rejects; treat them as unparseable rather than crashing.
        return np.nan

In [16]:
# Normalise raw_efficiency to floats, then re-check the summary statistics.
# NOTE(review): the describe() output below is identical to the one produced
# before the conversion, which suggests the column was already numeric here.
df['raw_efficiency'] = df['raw_efficiency'].apply(convert_to_number)
df[['raw_efficiency']].describe()

Unnamed: 0,raw_efficiency
count,877.0
mean,6171.126
std,53327.07
min,0.0
25%,9.762
50%,65.0
75%,555.0
max,1037500.0


### Статистика по клеточным линиям и по типам измерения поглощения (uptake_type):

In [17]:
# Number of rows per cell line.
cell_line_counts = df['cell_line'].value_counts()
print("cell_line counts:")
print(cell_line_counts)

cell_line counts:
cell_line
HeLa cells                                  242
CHO cells                                    50
MCF7 cells                                   50
A549 cells                                   42
NIH-3T3 cells                                38
                                           ... 
Rat basophilic leukemia RBL-2H3 cells         1
MEF cells                                     1
K562 cells                                    1
Mouse macrophage-like cell line J774 A.6      1
J3T cells                                     1
Name: count, Length: 94, dtype: int64


In [None]:
# Number of rows per uptake type.
uptake_type_counts = df['uptake_type'].value_counts()
print("uptake_type counts:")
print(uptake_type_counts)

uptake_type counts:
uptake_type
Mean Fluorescence intensity               175
Fluorescence intensity                    137
Cellular uptake                            66
% Positive cells                           37
Cellular uptake (%)                        34
                                         ... 
pmol siRNA / mg protein                     1
C/M ratio of exendin-4 (uL/mg protein)      1
Corrected total cell fluorescence           1
Uptake amount (x10^3 pfu/ug protein)        1
Relative cellular uptake (%)                1
Name: count, Length: 76, dtype: int64


In [17]:
# Box plot of raw_efficiency for every uptake type.
# The uptake types use different units (%, pmol/mg protein, fluorescence, ...),
# so boxes are only meaningful when compared within one type.
fig = px.box(
    df,
    x="uptake_type",
    y="raw_efficiency",
    title="raw_efficiency by uptake_type",
    labels={"uptake_type": "Uptake type", "raw_efficiency": "Raw efficiency"},
)

fig.show()

In [41]:
# Per-uptake-type summary of raw_efficiency, largest groups first.
grouped = (
    df.groupby('uptake_type')['raw_efficiency']
    .describe()
    .sort_values(by='count', ascending=False)
)

# Show every row for this one display only: option_context restores the previous
# 'display.max_rows' on exit, so no global pandas state leaks into later cells.
# `display` is the function IPython/Jupyter injects into the notebook namespace.
with pd.option_context('display.max_rows', None):
    display(grouped)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
uptake_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mean Fluorescence intensity,175.0,10587.063897,79415.854191,0.0,35.5,500.0,2355.5,1037500.0
Fluorescence intensity,137.0,13431.324489,80720.483403,0.0,19.0,155.0,1000.0,800000.0
Cellular uptake,66.0,1147.586212,2805.254584,0.1,0.9,3.2,650.0,15000.0
% Positive cells,37.0,50.725676,32.435945,6.0,21.0,41.5,77.0,99.0
% Absorbance relative to peptide 1 (405 nm),34.0,50.676471,25.770809,13.0,25.25,52.0,67.0,94.0
Cellular uptake (%),34.0,54.487647,31.943611,0.0,32.0,55.5,80.0,100.0
Relative Cellular uptake,28.0,419.267857,826.023535,0.0,1.65,15.45,212.5,2890.0
Relative fluorescence,27.0,23431.987407,115242.992168,0.0,31.0,175.0,2000.0,600000.0
pmol/mg protein Cellular uptake,25.0,1134.0,703.446871,150.0,500.0,1050.0,1700.0,2500.0
Relative Mean Fluorescence intensity (%),25.0,83.88,59.834717,12.0,33.0,75.0,100.0,220.0


In [42]:
# Restore the default row limit for DataFrame display.
pd.reset_option('display.max_rows')

### Проверка: совпадают ли распределения Mean Fluorescence intensity и Fluorescence intensity, чтобы их можно было объединить в одну группу

In [18]:
def filter_by_uptake_type(df, uptake_types=('Mean Fluorescence intensity', 'Fluorescence intensity')):
    """
    Keep only the rows whose 'uptake_type' is one of the requested values.

    Parameters:
      df: pd.DataFrame with an 'uptake_type' column.
      uptake_types: a label or an iterable of labels to keep. Defaults to
        'Mean Fluorescence intensity' and 'Fluorescence intensity'.

    Returns:
      pd.DataFrame with the matching rows; the original index is preserved.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(uptake_types, str):
        uptake_types = [uptake_types]
    keep = df['uptake_type'].isin(list(uptake_types))
    return df[keep]

# Subset with only the 'Mean Fluorescence intensity' and 'Fluorescence intensity' rows.
df_filtered = filter_by_uptake_type(df)

In [44]:
import pandas as pd
import scipy.stats as stats

def compare_uptake_groups(df, numeric_column='raw_efficiency',
                          group1_name='Mean Fluorescence intensity',
                          group2_name='Fluorescence intensity'):
    """
    Compare the distributions of a numeric column between two 'uptake_type' groups.

    The rows of each group are selected by 'uptake_type', the column is coerced
    to numeric (unparseable entries are dropped), and the results are printed:
      1. Shapiro-Wilk normality test for each group.
      2. If both groups look normal (p > 0.05): Levene's test, followed by
         Student's t-test (equal variances) or Welch's t-test (unequal variances).
      3. Otherwise: two-sided Mann-Whitney U test.

    If either group has fewer than 3 numeric values the comparison is skipped
    with a message, because the Shapiro-Wilk test needs at least 3 observations.

    Parameters:
      df: pd.DataFrame with an 'uptake_type' column and `numeric_column`.
      numeric_column: str, name of the column holding the values to compare.
      group1_name: str, 'uptake_type' label of the first group.
      group2_name: str, 'uptake_type' label of the second group.

    Returns:
      None. All results are printed.
    """
    def _numeric_values(label):
        # Rows of one group, coerced to numbers; non-numeric entries are dropped.
        rows = df[df['uptake_type'] == label]
        return pd.to_numeric(rows[numeric_column], errors='coerce').dropna()

    group1_values = _numeric_values(group1_name)
    group2_values = _numeric_values(group2_name)

    print(f"Размер выборки '{group1_name}': {len(group1_values)}")
    print(f"Размер выборки '{group2_name}': {len(group2_values)}")

    # Shapiro-Wilk is undefined for fewer than 3 observations; stop early
    # instead of failing or printing NaN statistics.
    min_size = 3
    if len(group1_values) < min_size or len(group2_values) < min_size:
        print(f"\nНедостаточно данных для сравнения: в каждой группе нужно не менее {min_size} значений.")
        return

    # Quoted labels padded to a common width so the two result lines align.
    quoted1 = f"'{group1_name}'"
    quoted2 = f"'{group2_name}'"
    width = max(len(quoted1), len(quoted2))

    # 1. Normality check (Shapiro-Wilk).
    stat1, p1 = stats.shapiro(group1_values)
    stat2, p2 = stats.shapiro(group2_values)
    print("\nТест Шапиро‑Уилка:")
    print(f"  {quoted1:<{width}}: W = {stat1:.3f}, p = {p1}")
    print(f"  {quoted2:<{width}}: W = {stat2:.3f}, p = {p2}")

    if p1 > 0.05 and p2 > 0.05:
        # 2. Both groups look normal: check homogeneity of variances (Levene).
        stat_levene, p_levene = stats.levene(group1_values, group2_values)
        print("\nТест Левена на однородность дисперсий:")
        print(f"  Статистика = {stat_levene:.3f}, p = {p_levene:.3f}")

        if p_levene > 0.05:
            # Equal variances: standard independent-samples t-test.
            stat_t, p_t = stats.ttest_ind(group1_values, group2_values)
            print("\nIndependent t-test (при однородных дисперсиях):")
        else:
            # Unequal variances: Welch's t-test.
            stat_t, p_t = stats.ttest_ind(group1_values, group2_values, equal_var=False)
            print("\nWelch's t-test (при неоднородных дисперсиях):")
        print(f"  t = {stat_t:.3f}, p = {p_t:.3f}")
    else:
        # 3. At least one group is not normal: non-parametric Mann-Whitney U test.
        stat_mw, p_mw = stats.mannwhitneyu(group1_values, group2_values, alternative='two-sided')
        print("\nMann-Whitney U test (для не нормально распределённых данных):")
        print(f"  U = {stat_mw:.3f}, p = {p_mw:.3f}")

# Example usage:
compare_uptake_groups(df, numeric_column='raw_efficiency')

Размер выборки 'Mean Fluorescence intensity': 175
Размер выборки 'Fluorescence intensity': 137

Тест Шапиро‑Уилка:
  'Mean Fluorescence intensity': W = 0.100, p = 4.004625608176505e-28
  'Fluorescence intensity'     : W = 0.155, p = 1.2607754104960055e-24

Mann-Whitney U test (для не нормально распределённых данных):
  U = 14207.000, p = 0.005


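Тест Манна‑Уитни показал значимое различие между группами (p = 0.005), поэтому объединять их не будем. Разделим группы, очистим каждую от выбросов и посмотрим, сколько строк остаётся.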

In [21]:
# Split the table into one frame per fluorescence-intensity uptake type.
is_mfi = df['uptake_type'].eq('Mean Fluorescence intensity')
is_fi = df['uptake_type'].eq('Fluorescence intensity')
df_MFI = df.loc[is_mfi]
df_FI = df.loc[is_fi]

In [22]:
# Preview the 'Mean Fluorescence intensity' rows.
df_MFI

Unnamed: 0,id,sequence,extra_name,cpp_category,is_cpp,cpp_type,origin,id_uptake,peptide,uptake_type,...,Positive_AA,Negative_AA,MolWt,LogP,TPSA,HBD,HBA,RotBonds,Rings,Fsp3
17,2984,?A1a-ERLRRRI-S5-LCR-S5-HHST,,,True,,,1761.0,2984.0,Mean Fluorescence intensity,...,7,1,2830.107,-25.85065,1417.51,57,48,97,2,0.657658
18,2664,AAAWFW,,,True,,,813.0,2664.0,Mean Fluorescence intensity,...,0,0,750.857,1.57270,240.40,9,7,17,5,0.300000
33,2964,AEQNPIYWARYADWLFTTPLLLLDLALLVDADEGTCG,,,True,,,1710.0,2964.0,Mean Fluorescence intensity,...,1,6,4168.745,-9.01123,1597.95,55,53,127,9,0.585492
35,2689,AGSHRRL,,,True,,,845.0,2689.0,Mean Fluorescence intensity,...,3,0,795.904,-5.51086,410.63,16,12,26,1,0.625000
43,1078,AKKRRQRRR,,,False,,,9.0,1541.0,Mean Fluorescence intensity,...,7,0,1254.520,-8.86355,700.75,28,18,48,0,0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,3097,kRkFiCl,,,True,,,1959.0,3097.0,Mean Fluorescence intensity,...,1,0,907.197,-0.90693,351.86,14,12,32,1,0.666667
869,3095,kRkIfCl,,,True,,,1957.0,3095.0,Mean Fluorescence intensity,...,1,0,907.197,-0.90693,351.86,14,12,32,1,0.666667
870,2323,likk(Aib)lkalkklni,TAM-riMitP,,True,,,611.0,2323.0,Mean Fluorescence intensity,...,0,0,1691.273,-0.81730,623.68,21,22,65,0,0.804878
875,2714,stearyl-HHHHHHHHHHHHHHHH-RRRRRRRR-NH2,,,True,,,924.0,2714.0,Mean Fluorescence intensity,...,25,0,4769.194,-27.24737,2290.00,86,69,154,19,0.465000


In [23]:
def _drop_iqr_outliers(frame, column, whisker):
    """Return the rows of `frame` whose `column` lies within [Q1 - whisker*IQR, Q3 + whisker*IQR]."""
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    spread = q3 - q1
    # between() is inclusive on both ends; rows with NaN in `column` are dropped.
    within = frame[column].between(q1 - whisker * spread, q3 + whisker * spread)
    return frame[within]


def clean_dataframes(df_MFI, df_FI, column='raw_efficiency', whisker=1.5):
    """
    Remove IQR outliers from two data frames independently.

    For each frame, the quartiles of `column` are computed on that frame alone
    and only rows inside [Q1 - whisker*IQR, Q3 + whisker*IQR] are kept.

    Parameters:
      df_MFI: pd.DataFrame, first frame to clean.
      df_FI: pd.DataFrame, second frame to clean.
      column: str, column the outlier rule is applied to.
      whisker: float, multiplier of the IQR that defines the accepted range.

    Returns:
      tuple (df_MFI_clean, df_FI_clean) of filtered frames; original indices
      are preserved.
    """
    df_MFI_clean = _drop_iqr_outliers(df_MFI, column, whisker)
    df_FI_clean = _drop_iqr_outliers(df_FI, column, whisker)
    return df_MFI_clean, df_FI_clean

# Apply the 1.5*IQR outlier filter to each group separately.
df_MFI_clean, df_FI_clean = clean_dataframes(df_MFI, df_FI)

In [25]:
# Preview the 'Mean Fluorescence intensity' rows that remain after outlier removal.
df_MFI_clean

Unnamed: 0,id,sequence,extra_name,cpp_category,is_cpp,cpp_type,origin,id_uptake,peptide,uptake_type,...,Positive_AA,Negative_AA,MolWt,LogP,TPSA,HBD,HBA,RotBonds,Rings,Fsp3
17,2984,?A1a-ERLRRRI-S5-LCR-S5-HHST,,,True,,,1761.0,2984.0,Mean Fluorescence intensity,...,7,1,2830.107,-25.85065,1417.51,57,48,97,2,0.657658
33,2964,AEQNPIYWARYADWLFTTPLLLLDLALLVDADEGTCG,,,True,,,1710.0,2964.0,Mean Fluorescence intensity,...,1,6,4168.745,-9.01123,1597.95,55,53,127,9,0.585492
43,1078,AKKRRQRRR,,,False,,,9.0,1541.0,Mean Fluorescence intensity,...,7,0,1254.520,-8.86355,700.75,28,18,48,0,0.700000
69,2982,CF-GRR-Ac5cNH2-RR-Ac5cNH2-RR-Ac5cNH2-NH2,,,True,,,1759.0,2982.0,Mean Fluorescence intensity,...,10,0,3917.406,-29.07408,1864.73,72,61,127,9,0.493333
70,2981,CF-GRR-Ac6cNH2-RR-Ac6cNH2-RR-Ac6cNH2-NH2,,,True,,,1758.0,2981.0,Mean Fluorescence intensity,...,10,0,3917.406,-29.07408,1864.73,72,61,127,9,0.493333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,3097,kRkFiCl,,,True,,,1959.0,3097.0,Mean Fluorescence intensity,...,1,0,907.197,-0.90693,351.86,14,12,32,1,0.666667
869,3095,kRkIfCl,,,True,,,1957.0,3095.0,Mean Fluorescence intensity,...,1,0,907.197,-0.90693,351.86,14,12,32,1,0.666667
870,2323,likk(Aib)lkalkklni,TAM-riMitP,,True,,,611.0,2323.0,Mean Fluorescence intensity,...,0,0,1691.273,-0.81730,623.68,21,22,65,0,0.804878
875,2714,stearyl-HHHHHHHHHHHHHHHH-RRRRRRRR-NH2,,,True,,,924.0,2714.0,Mean Fluorescence intensity,...,25,0,4769.194,-27.24737,2290.00,86,69,154,19,0.465000


In [27]:
# Preview the 'Fluorescence intensity' rows that remain after outlier removal.
df_FI_clean

Unnamed: 0,id,sequence,extra_name,cpp_category,is_cpp,cpp_type,origin,id_uptake,peptide,uptake_type,...,Positive_AA,Negative_AA,MolWt,LogP,TPSA,HBD,HBA,RotBonds,Rings,Fsp3
0,2721,(Acp)-KKKKKRFSFKKSFKLSGFSFKKNKK,,,True,,,938.0,2721.0,Fluorescence intensity,...,13,0,3106.855,-10.12793,1242.72,45,44,120,5,0.610738
1,2724,(Acp)-RKRRQTSMTDFYHSKRRLIFS,,,True,,,969.0,2724.0,Fluorescence intensity,...,8,1,2740.203,-13.31615,1220.16,47,39,95,4,0.583333
30,1076,ADVFDRGGPYLQRGVADLVPTATLLDTYSP,,,False,,,176.0,1537.0,Fluorescence intensity,...,2,4,3208.581,-12.95276,1318.32,45,44,96,6,0.618056
32,1499,AEKVDPVKLNLTLSAAAEALTGLGDK,Inv5,cpp,True,H,,172.0,1499.0,Fluorescence intensity,...,3,4,2625.020,-11.37540,1113.07,37,38,89,1,0.730435
36,3033,AGYLLGHEINLHEHELAHEL(Aib)HEHEIL-NH2,,,True,,,1833.0,3033.0,Fluorescence intensity,...,7,6,3725.111,-11.23150,1547.93,50,51,123,9,0.572289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,1462,VRLPPPVRLPPPVRLPPP,,cpp,True,,,291.0,1462.0,Fluorescence intensity,...,3,0,1997.514,-1.90819,664.61,19,22,47,9,0.781250
821,2845,VSRRRRRRGGRRRRGGGSYARVRRRGPRRGYARVRRRGPRR,,,True,,,1382.0,2845.0,Fluorescence intensity,...,22,0,5044.888,-37.46516,2652.46,110,68,176,4,0.630542
860,2594,YNNFAYSVFL,,,True,,,673.0,2594.0,Fluorescence intensity,...,0,0,1237.379,-2.39140,472.09,16,16,35,4,0.409836
861,2600,YQKQAKIMCS,,,True,,,679.0,2600.0,Fluorescence intensity,...,2,0,1199.466,-4.76000,503.90,18,19,42,1,0.647059
