## Препроцессинг данных рекомендательных систем и запуск алгоритма бикластеризации с разными параметрами.

In [1]:
from biclustlib.algorithms import *
from sklearn.cluster import SpectralBiclustering
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

#### Получение данных.

In [2]:
def load_jester_ratings(path="jester-data-1.xls", n_jokes=100, missing_marker=99, shift=11):
    """Load the Jester ratings spreadsheet as a non-negative integer table.

    The sheet is read without a header row, its columns are named
    1..n_jokes, and cells equal to ``missing_marker`` are treated as missing.
    Every rating is then shifted by ``shift``, rounded, and missing cells are
    replaced by 0 before casting to ``int``.

    NOTE(review): presumably raw ratings lie in [-10, 10], so the default
    shift of 11 keeps 0 free to mean "not rated" -- confirm against the
    dataset description.

    :param path: path to the Excel file with the ratings
    :param n_jokes: number of rating columns to name (1..n_jokes)
    :param missing_marker: cell value that marks a missing rating
    :param shift: constant added to every rating before rounding
    :return: DataFrame of integer ratings with a fresh 0-based row index
    """
    ratings = pd.read_excel(
        path,
        names=range(1, n_jokes + 1),
        header=None,
        na_values=missing_marker,
    ).reset_index(drop=True)
    ratings += shift
    return ratings.round().fillna(0).astype(int)


df = load_jester_ratings()

In [3]:
# Preview up to the first 40 rows of the preprocessed ratings table.
df.head(40)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,3,20,1,3,3,2,1,15,2,6,...,14,0,0,0,0,0,5,0,0,0
1,15,11,17,15,9,1,10,6,20,20,...,14,6,11,19,11,9,14,11,7,12
2,0,0,0,0,20,20,20,20,0,0,...,0,0,0,20,0,0,0,0,0,0
3,0,19,0,0,13,19,8,17,0,13,...,0,0,0,12,0,0,0,0,0,0
4,20,16,7,6,12,13,18,16,11,17,...,16,17,15,16,17,13,14,18,13,13
5,5,7,11,2,4,7,2,10,4,9,...,7,4,10,8,9,8,11,2,6,8
6,0,0,0,0,20,1,19,20,0,0,...,0,0,0,0,0,13,0,0,0,0
7,18,14,20,5,3,9,20,12,6,7,...,18,10,11,5,8,8,13,11,12,11
8,7,7,2,4,2,11,6,2,3,9,...,15,11,15,11,11,11,11,11,8,6
9,14,16,16,14,17,16,20,14,14,19,...,0,15,0,0,0,0,0,0,0,0


Создание пустой таблицы для записи проверенных параметров и полученных с ними значений целевой функции.

In [4]:
def create_empty_dataframe():
    """Create an empty results table for the parameter search.

    The table has one typed, empty column per tracked value: the algorithm
    name, the row and column pruning thresholds, the number of clusters and
    the objective value.

    :return: DataFrame with zero rows and the typed columns listed above
    """
    # Column name -> dtype; dict order defines the column order.
    column_types = {
        'algorithm': str,
        'row_prunning_threshold': float,
        'col_prunning_threshold': float,
        'number_of_clusters': int,
        'objective': float,
    }
    return pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in column_types.items()}
    )

Поиск параметров и вывод наилучших случаев с порогами.

In [5]:
def count_objective(matrix, y_train):
    """Score how well a set of biclusters covers the ones of a binary matrix.

    A cell is "covered" when its row and column both belong to at least one
    bicluster. The score is::

        ones_inside / (ones_total + zeros_inside)

    where ``ones_inside`` counts covered cells equal to 1, ``zeros_inside``
    counts covered cells not equal to 1, and ``ones_total`` is the sum of all
    matrix entries (the number of ones when the matrix is binary).

    :param matrix: 2-D array-like, expected to hold 0/1 values
    :param y_train: object with a ``biclusters`` iterable; each bicluster
        exposes ``rows`` and ``cols`` index sequences into ``matrix``
    :return: the score as a float; 0.0 when the denominator is zero
    """
    matrix = np.asarray(matrix)

    # Size the coverage mask from the argument itself, not from a global.
    covered = np.zeros(matrix.shape, dtype=bool)
    for bicluster in y_train.biclusters:
        rows = np.asarray(bicluster.rows, dtype=int)
        cols = np.asarray(bicluster.cols, dtype=int)
        if rows.size and cols.size:
            covered[np.ix_(rows, cols)] = True

    is_one = matrix == 1
    ones_inside = int(np.count_nonzero(covered & is_one))
    zeros_inside = int(np.count_nonzero(covered & ~is_one))
    ones_total = matrix.sum()

    denominator = ones_total + zeros_inside
    if denominator == 0:
        # No ones in the matrix and nothing covered: avoid a 0/0 division.
        return 0.0
    return float(ones_inside / denominator)
    

# Grid search: for every binarization threshold, try each combination of
# bicluster count and row/column pruning thresholds for Plaid on the first
# 20 rows, then report the combination with the highest objective.
for threshold in range(11, 19):

    # Binarize explicitly (rating > threshold -> 1, otherwise 0) BEFORE
    # slicing, so the sample and its NumPy matrix are binary without relying
    # on `train_cur` / `matrix` being views of `train`.
    train = (df > threshold).astype(int)
    train_cur = train.iloc[:20]
    matrix = train_cur.to_numpy()

    # Collect rows in a list; DataFrame.append was removed in pandas 2.0.
    result_rows = []

    for clusters_number in range(2, 16):
        for row_percent in range(5, 30, 5):
            for col_percent in range(5, 30, 5):
                row_prunning_threshold = row_percent * 0.01
                col_prunning_threshold = col_percent * 0.01

                model_plaid = Plaid(clusters_number,
                                    row_prunning_threshold=row_prunning_threshold,
                                    col_prunning_threshold=col_prunning_threshold)
                y_train_plaid = model_plaid.run(train_cur)

                result_rows.append({
                    'algorithm': 'Plaid',
                    'row_prunning_threshold': row_prunning_threshold,
                    'col_prunning_threshold': col_prunning_threshold,
                    'number_of_clusters': clusters_number,
                    'objective': count_objective(matrix, y_train_plaid),
                })

    result_df = pd.DataFrame(result_rows, columns=['algorithm',
                                                   'row_prunning_threshold',
                                                   'col_prunning_threshold',
                                                   'number_of_clusters',
                                                   'objective'])

    # Row with the best objective for this threshold.
    best = result_df.loc[result_df['objective'].idxmax()]
    print("with threshold {threshold} max objective: {obj:.6f} with params bicl_numbers={b_n}  col_pr={col_pr} row_pr={row_pr}".format(
        threshold=threshold,
        obj=best.objective,
        b_n=best.number_of_clusters,
        col_pr=best.col_prunning_threshold,
        row_pr=best.row_prunning_threshold))


with threshold 11 max objective: 0.438125 with params bicl_numbers=15  col_pr=0.05 row_pr=0.2
with threshold 12 max objective: 0.453913 with params bicl_numbers=15  col_pr=0.05 row_pr=0.25
with threshold 13 max objective: 0.502058 with params bicl_numbers=15  col_pr=0.05 row_pr=0.25
with threshold 14 max objective: 0.506201 with params bicl_numbers=11  col_pr=0.1 row_pr=0.15
with threshold 15 max objective: 0.520280 with params bicl_numbers=13  col_pr=0.25 row_pr=0.1
with threshold 16 max objective: 0.627306 with params bicl_numbers=13  col_pr=0.25 row_pr=0.2
with threshold 17 max objective: 0.598194 with params bicl_numbers=12  col_pr=0.05 row_pr=0.1
with threshold 18 max objective: 0.757692 with params bicl_numbers=15  col_pr=0.25 row_pr=0.25


In [12]:
# Build the binarized working sample: the first 20 rows, with ratings
# above 16 mapped to 1 and everything else to 0.
train = df.copy()

# Binarize into a new frame instead of assigning through an iloc slice,
# which is chained assignment and only works when the slice is a view.
train_cur = (train.iloc[:20] > 16).astype(int)

# Take the NumPy matrix after binarization so it reflects the 0/1 values.
matrix = train_cur.to_numpy()

In [13]:
# Display the binarized sample; it was sliced with iloc[:20], so head(200)
# shows every row it contains.
train_cur.head(200)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Запись данных в файл.

In [14]:
def write_matrix_to_textfile(a_matrix, file_to_write):
    """Dump a 2-D matrix to a tab-separated text file.

    The first line is a header: ``o`` followed by ``C1..Cn``, one label per
    column. Each following line starts with ``G<row number>`` (1-based) and
    lists that row's values converted to ``int``. If the matrix has no
    columns, the file is created (or truncated) and left empty.

    :param a_matrix: 2-D array exposing ``.shape`` and row iteration
    :param file_to_write: path of the file to create or overwrite
    """
    with open(file_to_write, 'w') as out:
        _, n_cols = a_matrix.shape
        if n_cols <= 0:
            return

        header_cells = "".join(f"\tC{number}" for number in range(1, n_cols + 1))
        out.write("o" + header_cells + "\n")

        for row_number, row in enumerate(a_matrix, start=1):
            row_cells = "".join("\t" + str(int(value)) for value in row)
            out.write(f"G{row_number}{row_cells}\n")

In [15]:
# Export the sample as a NumPy array with an integer dtype.
matrix = train_cur.to_numpy().astype(int)

In [16]:
# Save the sample as a tab-separated text file. The writer opens the path
# with open(..., 'w'), so the ./recomendation_ds directory must already exist.
write_matrix_to_textfile(train_cur.to_numpy(), "./recomendation_ds/jester_data")

Проверка процента перекрытия.

In [17]:
# Run Plaid once with 13 biclusters on the binarized sample.
# NOTE(review): the grid-search output printed earlier for threshold 16
# reports col_pr=0.25 and row_pr=0.2, while this call passes row=0.25 and
# col=0.2 -- confirm the two values are not swapped.
model_plaid = Plaid(13, row_prunning_threshold=0.25, 
                                col_prunning_threshold=0.2)
    
y_train_plaid = model_plaid.run(train_cur)

def biclusters_quality(biclusters):
    """
    Calculates the quality of the biclusters as their mean pairwise overlap.

    Every unordered pair of biclusters is compared once through the
    bicluster's own ``overlap`` method and the results are averaged.

    :param biclusters: list of biclusters, each exposing ``overlap(other)``
    :return: mean of the pairwise overlap values (a fraction, not a
        percentage); 0.0 when there are fewer than two biclusters, because
        there is no pair to compare. Assuming ``overlap`` returns values in
        [0, 1], zero indicates no overlap and one indicates full overlap.
    """
    similarity = [
        cluster.overlap(another)
        for i, cluster in enumerate(biclusters)
        for another in biclusters[i + 1:]
    ]
    if not similarity:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return np.mean(similarity)

# Report the mean pairwise overlap of the found biclusters, rounded to 3 decimals.
overlap_score = biclusters_quality(y_train_plaid.biclusters)
print(f"Biclusters overlapping: {round(overlap_score, 3)}")

Biclusters overlapping: 0.041
