Random Split 80/20

In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset.
# Replace 'all_articles_files_names_merge_final3.csv' with the correct path to your CSV file.
data = pd.read_csv('all_articles_files_names_merge_final3.csv', sep=',')

# Columns handed to Surprise, in the (user, item, rating) order it expects.
RATING_COLUMNS = ['id', 'gene', 'article_count']

# Seed for the 80/20 split so that a rerun evaluates on the same rows.
# NOTE: the cross-validation folds inside GridSearchCV are not seeded here.
SEED = 42

# Define the reader: ratings are article counts, from 1 up to the observed maximum.
reader = Reader(rating_scale=(1, data['article_count'].max()))

# Full dataset in Surprise format (kept for reference; it is NOT used for
# tuning or training, because it contains the held-out test rows).
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Random split 80/20, done on the DataFrame so that the training rows can be
# wrapped in their own Surprise Dataset (GridSearchCV needs a Dataset).
train_df = data.sample(frac=0.8, random_state=SEED)
test_df = data.drop(train_df.index)

train_dataset = Dataset.load_from_df(train_df[RATING_COLUMNS], reader=reader)
trainset = train_dataset.build_full_trainset()
testset = [
    (uid, iid, float(rating))
    for uid, iid, rating in test_df[RATING_COLUMNS].itertuples(index=False, name=None)
]

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform the grid search on the training rows only. Fitting it on the full
# dataset would let the test ratings influence the choice of k.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (selected by RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the held-out test set
predictions = model.test(testset)

# Calculate evaluation metrics (both calls print their result)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Retrieve predicted and actual ratings
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]

# Convert ratings to binary labels: a rating at or above the threshold counts
# as a positive (relevant) author-gene pair.
threshold = 3.5
predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

# Calculate precision, recall, and F1-score metrics
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Target author for recommendation
autor_id = 5177

# Number of recommendations to print; set to None to print every prediction.
TOP_N = 20

# Rows of the target author; fail early with a clear message if the id is absent
# (otherwise the name lookup below raises a bare IndexError after all the work).
linhas_autor = data[data['id'] == autor_id]
if linhas_autor.empty:
    raise ValueError(f'Author id {autor_id} not found in the dataset')

# Retrieve all genes mentioned by the target author
genes_autor_alvo = linhas_autor['gene'].unique()

# Candidate genes: every gene the target author has not mentioned.
genes_nao_mencionados = data[~data['gene'].isin(genes_autor_alvo)]['gene'].unique()

# Predict a score for each candidate gene.
previsoes = [
    (autor_id, gene, model.predict(autor_id, gene).est)
    for gene in genes_nao_mencionados
]

# Sort the predictions in descending order of predicted score
previsoes_ordenadas = sorted(previsoes, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
autor_nome = linhas_autor['author/disease'].iloc[0]

# Display only the top predictions instead of one line per candidate gene
for autor, gene, pontuacao in previsoes_ordenadas[:TOP_N]:
    print(f'ID from author: {autor}, author name: {autor_nome}, Gene: {gene}, Score: {pontuacao}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Autism-Focused Split (Single Author per Test)

In [2]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset.
# Replace 'all_articles_files_names_merge_final3.csv' with the correct path to your CSV file.
data = pd.read_csv('all_articles_files_names_merge_final3.csv', sep=',')

# Columns handed to Surprise, in the (user, item, rating) order it expects.
RATING_COLUMNS = ['id', 'gene', 'article_count']

# Define the reader: ratings are article counts, from 1 up to the observed maximum.
reader = Reader(rating_scale=(1, data['article_count'].max()))

# Full dataset in Surprise format (kept for reference; it is NOT used for
# tuning or training, because it contains the held-out test rows).
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Rows whose 'author/disease' label contains "Autistic disorder".
# na=False sends rows with a missing label to the non-autistic side instead of
# producing a NaN mask that cannot be used for indexing.
is_autistic = data['author/disease'].str.contains('Autistic disorder', na=False)
autistic_data = data[is_autistic]
non_autistic_data = data[~is_autistic]

# Hold out the first row of each "Autistic disorder" id for testing; every
# other row of that id goes to training.
is_first_row_of_id = ~autistic_data['id'].duplicated()
test_df = autistic_data[is_first_row_of_id]

# IDs used as test cases
test_author_ids = test_df['id'].tolist()

# Training rows: remaining autistic rows followed by all non-autistic rows
train_df = pd.concat([autistic_data[~is_first_row_of_id], non_autistic_data])
train_data = train_df[RATING_COLUMNS].values.tolist()

# Raw test rows as (id, gene, rating, timestamp) tuples; the timestamp slot is unused
testset_autistic = [
    (uid, iid, float(rating), None)
    for uid, iid, rating in test_df[RATING_COLUMNS].itertuples(index=False, name=None)
]

# Build the Surprise train/test sets from the TRAINING rows only. Building the
# trainset from the full dataset and then assigning `trainset.raw_ratings`
# does not change the ratings the trainset was built from, so the model would
# be fitted on the held-out rows as well.
train_dataset = Dataset.load_from_df(train_df[RATING_COLUMNS], reader=reader)
trainset = train_dataset.build_full_trainset()
testset = train_dataset.construct_testset(testset_autistic)

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Run the cross-validated grid search on the training rows only, so the test
# ratings cannot influence the choice of k.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (selected by RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate evaluation metrics (both calls print their result)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Retrieve predicted and actual ratings
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]

# Convert ratings to binary labels: a rating at or above the threshold counts
# as a positive (relevant) pair.
threshold = 3.5
predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

# Calculate precision, recall, and F1-score metrics
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Target author for recommendation
autor_id = 4817

# Number of recommendations to print; set to None to print every prediction.
TOP_N = 20

# Rows of the target author; fail early with a clear message if the id is absent
# (otherwise the name lookup below raises a bare IndexError after all the work).
linhas_autor = data[data['id'] == autor_id]
if linhas_autor.empty:
    raise ValueError(f'Author id {autor_id} not found in the dataset')

# Retrieve all genes mentioned by the target author
genes_autor_alvo = linhas_autor['gene'].unique()

# Candidate genes: every gene the target author has not mentioned.
genes_nao_mencionados = data[~data['gene'].isin(genes_autor_alvo)]['gene'].unique()

# Predict a score for each candidate gene.
previsoes = [
    (autor_id, gene, model.predict(autor_id, gene).est)
    for gene in genes_nao_mencionados
]

# Sort the predictions in descending order of predicted score
previsoes_ordenadas = sorted(previsoes, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
autor_nome = linhas_autor['author/disease'].iloc[0]

# Display only the top predictions instead of one line per candidate gene
for autor, gene, pontuacao in previsoes_ordenadas[:TOP_N]:
    print(f'ID do Autor: {autor}, Nome do Autor: {autor_nome}, Gene: {gene}, Pontuação: {pontuacao}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Disease-Based Split

In [3]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from surprise.model_selection import GridSearchCV


# Load the dataset.
# Replace 'all_articles_files_names_merge_final4.csv' with the correct path to your CSV file.
data = pd.read_csv('all_articles_files_names_merge_final4.csv', sep=',')

# Columns handed to Surprise, in the (user, item, rating) order it expects.
RATING_COLUMNS = ['id', 'gene', 'n_articles']

# Define the reader: ratings are article counts, from 1 up to the observed maximum.
reader = Reader(rating_scale=(1, data['n_articles'].max()))

# Full dataset in Surprise format (kept for reference; it is NOT used for
# tuning or training, because it contains the test rows).
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Rows whose 'author/disease' label contains "Autistic disorder" form the test
# set; all other rows form the training set.
# na=False sends rows with a missing label to the training side instead of
# producing a NaN mask that cannot be used for indexing.
is_autistic = data['author/disease'].str.contains('Autistic disorder', na=False)
test_data = data[is_autistic]
train_data = data[~is_autistic]

# Build the Surprise trainset from the TRAINING rows only. Building it from the
# full dataset would fit the model on the very rows it is evaluated on.
# NOTE(review): any id that appears only in the test rows is unknown to the
# fitted model, so its predictions are expected to fall back to Surprise's
# default estimate — confirm this is the intended evaluation setting.
train_dataset = Dataset.load_from_df(train_data[RATING_COLUMNS], reader=reader)
trainset = train_dataset.build_full_trainset()
testset = list(test_data[RATING_COLUMNS].itertuples(index=False, name=None))


# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Run the cross-validated grid search on the training rows only, so the test
# ratings cannot influence the choice of k.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (selected by RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate evaluation metrics (both calls print their result)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Retrieve predicted and actual ratings
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]

# Convert ratings to binary labels: a rating at or above the threshold counts
# as a positive (relevant) pair.
threshold = 3.5
predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

# Calculate precision, recall, and F1-score metrics
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Target author for recommendation
autor_id = 4817

# Number of recommendations to print; set to None to print every prediction.
TOP_N = 20

# Rows of the target author; fail early with a clear message if the id is absent
# (otherwise the name lookup below raises a bare IndexError after all the work).
linhas_autor = data[data['id'] == autor_id]
if linhas_autor.empty:
    raise ValueError(f'Author id {autor_id} not found in the dataset')

# Retrieve all genes mentioned by the target author
genes_autor_alvo = linhas_autor['gene'].unique()

# Candidate genes: every gene the target author has not mentioned.
genes_nao_mencionados = data[~data['gene'].isin(genes_autor_alvo)]['gene'].unique()

# Predict a score for each candidate gene.
previsoes = [
    (autor_id, gene, model.predict(autor_id, gene).est)
    for gene in genes_nao_mencionados
]

# Sort the predictions in descending order of predicted score
previsoes_ordenadas = sorted(previsoes, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
autor_nome = linhas_autor['author/disease'].iloc[0]

# Display only the top predictions instead of one line per candidate gene
for autor, gene, pontuacao in previsoes_ordenadas[:TOP_N]:
    print(f'ID do Autor: {autor}, Nome do Autor: {autor_nome}, Gene: {gene}, Pontuação: {pontuacao}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi