## Получение результатов работы модели на данных генной экспрессии и рекомендательных систем.

Функция получения списков строк и столбцов бикластеров по матрице кластеров.

In [13]:
def get_biclusters_lists(matrix):
    """Group the cells equal to 1 of a cluster matrix into biclusters.

    The matrix is scanned row by row.  Each cell equal to 1 is handled as
    follows:

    * if its row already belongs to a bicluster, the column is added to
      that bicluster;
    * otherwise, if its column belongs to a bicluster, the row is added to
      the first (lowest-index) bicluster holding that column;
    * otherwise a new bicluster is started from this single cell.

    Args:
        matrix: 2-D sequence (list of lists or array) of numbers.

    Returns:
        A tuple ``(biclusters_rows, biclusters_cols)``: two lists of equal
        length whose k-th elements are the sorted row indices and the
        sorted column indices of the k-th bicluster.
    """
    row_sets = []
    col_sets = []
    # row -> index of the single bicluster that owns the row
    row_owner = {}
    # column -> lowest index among the biclusters containing the column
    col_owner = {}

    for i, row in enumerate(matrix):
        for j, value in enumerate(row):
            if value != 1:
                continue

            k = row_owner.get(i)
            if k is not None:
                # A row match takes priority; the column search is skipped
                # so the row is never attached to a second bicluster.
                col_sets[k].add(j)
            else:
                k = col_owner.get(j)
                if k is None:
                    k = len(row_sets)
                    row_sets.append(set())
                    col_sets.append(set())
                row_sets[k].add(i)
                col_sets[k].add(j)
                row_owner[i] = k

            # Keep "first bicluster containing this column" semantics.
            col_owner[j] = min(k, col_owner.get(j, k))

    biclusters_rows = [sorted(rows) for rows in row_sets]
    biclusters_cols = [sorted(cols) for cols in col_sets]
    return biclusters_rows, biclusters_cols

Функция вычисления значения целевой функции задачи формирования производственных ячеек.

In [14]:
def get_global_objective(clusters_matrix, matrix):
    """Compute the global grouping objective of a clustering.

    The value is ``ones_inside / (total_ones + zeros_inside)`` where
    ``ones_inside`` / ``zeros_inside`` count the cells selected by
    ``clusters_matrix`` (value 1) whose data value is / is not 1, and
    ``total_ones`` is the sum of all entries of ``matrix``.

    Args:
        clusters_matrix: 2-D indexable of cluster membership flags, read
            at every ``[row][col]`` position of ``matrix``.
        matrix: 2-D data matrix; converted with ``np.array``.

    Returns:
        The ratio described above.
    """
    data = np.array(matrix)
    n_rows, n_cols = data.shape

    ones_inside = 0
    zeros_inside = 0
    for r, c in np.ndindex(n_rows, n_cols):
        if clusters_matrix[r][c] != 1:
            continue
        if data[r][c] == 1:
            ones_inside += 1
        else:
            zeros_inside += 1

    # Row-major sum of every entry of the data matrix.
    total_ones = sum(data.flat)
    return ones_inside / (total_ones + zeros_inside)

Функция вычисления локальных целевых функций для бикластеров; тип целевой функции зависит от параметра `can_overlap`.

In [15]:
def get_overlap_objectives(biclusters_rows, biclusters_cols, matrix, can_overlap=False):
    """Compute one local objective per bicluster, processed in list order.

    For every bicluster the value
    ``ones_inside / (ones_scanned + zeros_inside)`` is produced, where

    * ``ones_inside`` / ``zeros_inside`` accumulate ``matrix[r][c]`` and
      ``1 - matrix[r][c]`` over the bicluster cells whose row and column
      were not used by an earlier bicluster;
    * ``ones_scanned`` accumulates ``matrix[r][c]`` over the bicluster's
      columns (all candidate rows) and then over the bicluster's rows
      (all candidate columns).

    Args:
        biclusters_rows: list of row-index lists, one per bicluster.
        biclusters_cols: list of column-index lists, one per bicluster.
        matrix: 2-D indexable data matrix.
        can_overlap: when False, rows / columns already used by earlier
            biclusters (and, in the row pass, the bicluster's own columns)
            are excluded from the scanned candidates; when True every row
            and column is scanned.

    Returns:
        List of objective values, one per bicluster.
    """
    objectives = []
    seen_rows = set()
    seen_cols = set()

    for idx, cluster_rows in enumerate(biclusters_rows):
        cluster_cols = biclusters_cols[idx]

        ones_inside = 0
        zeros_inside = 0
        ones_scanned = 0

        every_row = set(range(len(matrix)))
        every_col = set(range(len(matrix[0])))

        # Pass 1: walk down every column of the bicluster.
        candidate_rows = every_row if can_overlap else (every_row - seen_rows)
        for col in cluster_cols:
            for row in candidate_rows:
                cell = matrix[row][col]
                already_used = row in seen_rows or col in seen_cols
                if row in cluster_rows and not already_used:
                    ones_inside += cell
                    zeros_inside += 1 - cell
                ones_scanned += cell

        seen_cols.update(cluster_cols)

        # Pass 2: walk along every row of the bicluster.
        # NOTE(review): the bicluster's own columns were just marked as
        # used, so no cell can qualify as "inside and unused" here; this
        # pass only contributes to ones_scanned. Confirm this is intended.
        candidate_cols = every_col if can_overlap else (every_col - seen_cols)
        for row in cluster_rows:
            for col in candidate_cols:
                ones_scanned += matrix[row][col]

        seen_rows.update(cluster_rows)

        objectives.append(ones_inside / (ones_scanned + zeros_inside))

    return objectives

Получение списка датасетов.

In [16]:
import pandas as pd
import os
import numpy as np

# Folder scanned for the GEO ``.soft.gz`` archives.
# NOTE(review): the getDatasetsList call below repeats this literal rather
# than using the variable; no use of ``directory`` is visible in this chunk.
directory = "./GEOsets/"

def getDatasetsList(path):
    """Return the names of the entries of *path* that end with '.soft.gz'.

    The names are returned in the order produced by ``os.listdir``.
    """
    return list(filter(lambda name: name.endswith('.soft.gz'), os.listdir(path)))
                
# Dataset identifiers: the part of every archive name before its first dot.
datasets_files = []
GEO_files = getDatasetsList("./GEOsets/")
datasets_files.extend(archive_name.split('.')[0] for archive_name in GEO_files)

Подсчёт целевых функций всех типов по каждому файлу результатов со сформированными бикластерами.

In [17]:
import gurobipy as gp
from gurobipy import GRB, quicksum
from IPython.display import clear_output
import matplotlib.pyplot as plt 
import seaborn as sns



def _read_int_table(path, drop_zero_rows=False):
    """Read a tab-separated integer table, skipping its header line and first column.

    With ``drop_zero_rows=True`` only rows whose values sum to more than
    zero are kept.
    """
    with open(path) as table_file:
        rows = [list(map(int, line.split("\t")[1:]))
                for line in table_file.readlines()[1:]]
    if drop_zero_rows:
        rows = [row for row in rows if sum(row) > 0]
    return rows


# Results per dataset.  Every inner list holds one entry per result file, in
# the same order as the matching entry of ``datasets_types``.
datasets_objectives_1dir = []   # local objectives, can_overlap=False
datasets_objectives_2dir = []   # local objectives, can_overlap=True
datasets_objectives = []        # global objective (0 when no solution)
datasets_types = []             # file-name suffix left after the dataset id
cells_number = []               # number of biclusters (0 when no solution)

for current_file in datasets_files:
    current_files = [f for f in os.listdir('./cfp_original/') if f.startswith(current_file)]
    current_ds_types = []
    cur_objectives_1dir = []
    cur_objectives_2dir = []
    cur_objectives = []
    cur_cells_num = []
    for cur_ds_file in current_files:

        print(cur_ds_file)
        # NOTE(review): all-zero rows are dropped from the data matrix but
        # not from the solution matrix; presumably the solution file was
        # produced from the already filtered matrix - confirm.
        matrix = _read_int_table('./cfp_original/' + cur_ds_file, drop_zero_rows=True)
        clusters_matrix = _read_int_table('./cfp_solutions/' + cur_ds_file + "_gp")

        current_ds_types.append(cur_ds_file.replace(current_file, ""))

        if sum(map(sum, clusters_matrix)) == 0:
            # No solution: store placeholders in EVERY per-file list so the
            # lists stay index-aligned with current_ds_types.
            cur_objectives_1dir.append([])
            cur_objectives_2dir.append([])
            cur_objectives.append(0)
            cur_cells_num.append(0)
            continue

        biclusters_rows, biclusters_cols = get_biclusters_lists(clusters_matrix)

        objectives_1d = get_overlap_objectives(biclusters_rows, biclusters_cols, matrix)
        objectives_2d = get_overlap_objectives(biclusters_rows, biclusters_cols, matrix, True)

        cur_objectives_1dir.append(objectives_1d)
        cur_objectives_2dir.append(objectives_2d)
        cur_objectives.append(get_global_objective(clusters_matrix, matrix))
        cur_cells_num.append(len(biclusters_rows))

    datasets_objectives_1dir.append(cur_objectives_1dir)
    datasets_objectives_2dir.append(cur_objectives_2dir)
    datasets_objectives.append(cur_objectives)
    datasets_types.append(current_ds_types)
    cells_number.append(cur_cells_num)

GDS1406less10
GDS1406less20
GDS1406less30
GDS1406more10
GDS1406more20
GDS1406more30
GDS1451less10
GDS1451less20
GDS1451less30
GDS1451more10
GDS1451more20
GDS1451more30
GDS3716less10
GDS3716less20
GDS3716less30
GDS3716more10
GDS3716more20
GDS3716more30


In [18]:
# Report mean / max / min of the local objectives computed without overlap
# (the "1 direction" values) for every result file.
missing_template = "{file_name} solution not found!"
summary_template = ("{file_name} mean 1 direction objective: {obj:.6f} "
                    ", max:{max_v:.6f}, min:{min_v:.6f}")

for ds_idx, type_names in enumerate(datasets_types):
    for type_idx, type_name in enumerate(type_names):
        scores = datasets_objectives_1dir[ds_idx][type_idx]
        label = datasets_files[ds_idx] + type_name
        if len(scores) == 0:
            # An empty list marks a file for which no solution was stored.
            print(missing_template.format(file_name=label))
            continue
        print(summary_template.format(file_name=label,
                                      obj=np.mean(scores),
                                      min_v=min(scores),
                                      max_v=max(scores)))

GDS1406less10 mean 1 direction objective: 0.482018 , max:0.642608, min:0.321429
GDS1406less20 mean 1 direction objective: 0.473449 , max:0.677668, min:0.269231
GDS1406less30 mean 1 direction objective: 0.482730 , max:0.696228, min:0.269231
GDS1406more10 mean 1 direction objective: 0.616980 , max:1.000000, min:0.260870
GDS1406more20 mean 1 direction objective: 0.301280 , max:0.667279, min:0.075269
GDS1406more30 mean 1 direction objective: 0.323121 , max:0.697115, min:0.063291
GDS1451less10 mean 1 direction objective: 0.432230 , max:0.558904, min:0.305556
GDS1451less20 mean 1 direction objective: 0.382274 , max:0.700855, min:0.063694
GDS1451less30 mean 1 direction objective: 0.588344 , max:0.610022, min:0.566667
GDS1451more10 mean 1 direction objective: 0.293919 , max:0.504505, min:0.083333
GDS1451more20 mean 1 direction objective: 0.309944 , max:0.469304, min:0.210526
GDS1451more30 solution not found!
GDS3716less10 mean 1 direction objective: 0.462113 , max:0.813115, min:0.111111
GDS371

In [19]:
# Report mean / max / min of the local objectives computed with overlap
# allowed (the "2 direction" values) for every result file.
missing_template = "{file_name} solution not found!"
summary_template = ("{file_name} mean 2 direction objective: {obj:.6f} "
                    ", max:{max_v:.6f}, min:{min_v:.6f}")

for ds_idx, type_names in enumerate(datasets_types):
    for type_idx, type_name in enumerate(type_names):
        scores = datasets_objectives_2dir[ds_idx][type_idx]
        label = datasets_files[ds_idx] + type_name
        if len(scores) == 0:
            # An empty list marks a file for which no solution was stored.
            print(missing_template.format(file_name=label))
            continue
        print(summary_template.format(file_name=label,
                                      obj=np.mean(scores),
                                      min_v=min(scores),
                                      max_v=max(scores)))

GDS1406less10 mean 2 direction objective: 0.227749 , max:0.391212, min:0.064286
GDS1406less20 mean 2 direction objective: 0.228482 , max:0.403934, min:0.053030
GDS1406less30 mean 2 direction objective: 0.231744 , max:0.410457, min:0.053030
GDS1406more10 mean 2 direction objective: 0.189783 , max:0.371097, min:0.058252
GDS1406more20 mean 2 direction objective: 0.190268 , max:0.361914, min:0.070000
GDS1406more30 mean 2 direction objective: 0.195703 , max:0.358911, min:0.059524
GDS1451less10 mean 2 direction objective: 0.209317 , max:0.358524, min:0.060109
GDS1451less20 mean 2 direction objective: 0.216587 , max:0.373293, min:0.059880
GDS1451less30 mean 2 direction objective: 0.233948 , max:0.378890, min:0.089005
GDS1451more10 mean 2 direction objective: 0.179713 , max:0.335329, min:0.024096
GDS1451more20 mean 2 direction objective: 0.144993 , max:0.319406, min:0.050000
GDS1451more30 solution not found!
GDS3716less10 mean 2 direction objective: 0.235343 , max:0.448463, min:0.022222
GDS371

In [20]:
# Report the global objective and the number of cells (biclusters) for every
# result file.
for i, type_names in enumerate(datasets_types):
    for j, type_name in enumerate(type_names):
        file_name = datasets_files[i] + type_name
        # One local objective is stored per bicluster, so this list is empty
        # exactly when no solution was found and its length is the number of
        # cells.  Using it (as the two reports above do) avoids treating a
        # genuine objective of 0 as "not found" and does not depend on
        # cells_number being index-aligned with datasets_types.
        local_objectives = datasets_objectives_1dir[i][j]
        if not local_objectives:
            print("{file_name} solution not found!".format(file_name=file_name))
        else:
            print("{file_name} objective: {obj:.6f}, with {cells_n} cells".format(
                file_name=file_name,
                obj=datasets_objectives[i][j],
                cells_n=len(local_objectives)))

GDS1406less10 objective: 0.632469, with 2 cells
GDS1406less20 objective: 0.666310, with 2 cells
GDS1406less30 objective: 0.685204, with 2 cells
GDS1406more10 objective: 0.583673, with 3 cells
GDS1406more20 objective: 0.561377, with 3 cells
GDS1406more30 objective: 0.549822, with 3 cells
GDS1451less10 objective: 0.546997, with 2 cells
GDS1451less20 objective: 0.584400, with 2 cells
GDS1451less30 objective: 0.608650, with 2 cells
GDS1451more10 objective: 0.491885, with 2 cells
GDS1451more20 objective: 0.461640, with 3 cells
GDS1451more30 solution not found!
GDS3716less10 objective: 0.792994, with 2 cells
GDS3716less20 objective: 0.810241, with 2 cells
GDS3716less30 objective: 0.804035, with 2 cells
GDS3716more10 objective: 0.781145, with 2 cells
GDS3716more20 objective: 0.753846, with 2 cells
GDS3716more30 objective: 0.725490, with 2 cells
