Random Split 80/20

In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final7.csv', sep=',')

# Define the reader
reader = Reader(rating_scale=(0, 100))

# Convert the DataFrame to the format expected by Surprise
dataset = Dataset.load_from_df(data[['id', 'gene', 'article_count']], reader=reader)

# Random split 80/20
trainset, testset = train_test_split(dataset, test_size=0.2)

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform the grid search
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(dataset)

# Access the best found parameters
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate evaluation metrics
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Retrieve predicted and actual ratings
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]

# Convert ratings to binary labels based on the threshold
threshold = 3.5
predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

# Calculate precision, recall, and F1-score metrics
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Target author for recommendation
author_id = 4817

# Retrieve all genes mentioned by the target author
genes_target_author = data[data['id'] == author_id]['gene'].unique()

# Make predictions for genes not mentioned by the target author.
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

predictions = []

for gene in genes_not_mentioned:
    prediction = model.predict(author_id, gene)
    predictions.append((author_id, gene, prediction.est))

# Sort the predictions in descending order
sorted_predictions = sorted(predictions, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
author_name = data[data['id'] == author_id]['author/disease'].iloc[0]

# Calculate scores based on the recommendation position
# The higher the position, the lower the score
total_recommendations = len(sorted_predictions)
scores = [(total_recommendations - i) / total_recommendations * 100 for i in range(total_recommendations)]

# Display normalized scores
for i, (author, gene, _) in enumerate(sorted_predictions):
    print(f'Recommendation {i+1}: Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {scores[i]:.2f}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Recommendation 111: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: adhesion G protein-coupled receptor L3, Score: 58.49
Recommendation 112: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: zinc finger protein 292, Score: 58.11
Recommendation 113: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: placental growth factor, Score: 57.74
Recommendation 114: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: glutamate ionotropic receptor AMPA type subunit 1, Score: 57.36
Recommendation 115: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: glutamate metabotropic receptor 7, Score: 56.98
Recommendation 116: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: nuclear receptor corepressor 2, Score: 56.60
Recommendation 117: Author ID: 4817, Author Name: Liu S/Autism spectrum disorder, Gene: neuroligin 1, Score: 56.23
Recommendation 118: Author ID: 4817, Author Name: Liu S/Autism spectrum

In [2]:
best_k = best_params['k']
print(f'O valor de k selecionado é: {best_k}')

O valor de k selecionado é: 10


Autism-Focused Split (Single Author per Test)

In [3]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final7.csv', sep=',')

# Keep one row for each author with the disease 'Autistic disorder' for testing
test_data_autistic = data[data['author/disease'] == 'Autistic disorder'].groupby('id').head(1)
train_data_autistic = data.drop(test_data_autistic.index)

# For the test dataset, keep only one example for each author with 'Autistic disorder'
# All examples for other diseases go to training
test_data_autistic = test_data_autistic.groupby('id').head(1)

# For authors without 'Autistic disorder', keep one example for testing and the rest for training
test_data_other = data[data['author/disease'] != 'Autistic disorder'].groupby('id').head(1)
train_data_other = data.drop(test_data_other.index)

# Combine the test data for 'Autistic disorder' and others
test_data = pd.concat([test_data_autistic, test_data_other])

# Define the reader
reader = Reader(rating_scale=(0, 100))

# Convert the modified DataFrames to the format expected by Surprise
train_dataset = Dataset.load_from_df(train_data_other[['id', 'gene', 'article_count']], reader=reader)
test_dataset = Dataset.load_from_df(test_data[['id', 'gene', 'article_count']], reader=reader)

# Random split 80/20 for the training dataset
trainset, _ = train_test_split(train_dataset, test_size=0.2)

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform the grid search
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the test set
testset = test_dataset.build_full_trainset().build_testset()
if len(testset) == 0:
    print("Test set is empty. Please check the data and the test set creation process.")
else:
    predictions = model.test(testset)

    # Calculate evaluation metrics
    accuracy.rmse(predictions)
    accuracy.mae(predictions)

    # Retrieve predicted and actual ratings
    predicted_ratings = [pred.est for pred in predictions]
    true_ratings = [pred.r_ui for pred in predictions]

    # Convert ratings to binary labels based on the threshold
    threshold = 3.5
    predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
    true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

    # Calculate precision, recall, and F1-score metrics
    precision = precision_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')


# Target author for recommendation
author_id = 4817

# Retrieve all genes mentioned by the target author
genes_target_author = data[data['id'] == author_id]['gene'].unique()

# Make predictions for genes not mentioned by the target author.
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

predictions = []

for gene in genes_not_mentioned:
    prediction = model.predict(author_id, gene)
    predictions.append((author_id, gene, prediction.est))

# Sort the predictions in descending order
sorted_predictions = sorted(predictions, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
author_name = data[data['id'] == author_id]['author/disease'].iloc[0]

# Calculate scores based on the recommendation position
# The higher the position, the lower the score
total_recommendations = len(sorted_predictions)
scores = [(total_recommendations - i) / total_recommendations * 100 for i in range(total_recommendations)]

# Display normalized scores
for i, (author, gene, _) in enumerate(sorted_predictions):
    print(f'Recommendation {i+1}: Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {scores[i]:.2f}')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [4]:
best_k = best_params['k']
print(f'O valor de k selecionado é: {best_k}')

O valor de k selecionado é: 10


Disease-Based Split

In [5]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from surprise.model_selection import GridSearchCV

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final6.csv', sep=',')

# Define the reader
reader = Reader(rating_scale=(0, 100))

# Convert the DataFrame to the format expected by Surprise
dataset = Dataset.load_from_df(data[['id', 'gene', 'n_articles']], reader=reader)

# Filter examples where the disease is "Autistic disorder" for the test set
test_data = data[data['author/disease'].str.contains('Autistic disorder')]

# Filter examples where the disease is not "Autistic disorder" for the training set
train_data = data[~data['author/disease'].str.contains('Autistic disorder')]

# Convert the training and test data to the appropriate Surprise sets
trainset = dataset.build_full_trainset()
testset = [(row['id'], row['gene'], row['n_articles']) for index, row in test_data.iterrows()]

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform grid search with cross-validation
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(dataset)

# Access the best found parameters
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate evaluation metrics
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Retrieve predicted and actual ratings
predicted_ratings = [pred.est for pred in predictions]
true_ratings = [pred.r_ui for pred in predictions]

# Convert ratings to binary labels based on the threshold
threshold = 3.5
predicted_labels = [1 if rating >= threshold else 0 for rating in predicted_ratings]
true_labels = [1 if rating >= threshold else 0 for rating in true_ratings]

# Calculate precision, recall, and F1-score metrics
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Target author for recommendation
author_id = 4817

# Retrieve all genes mentioned by the target author
genes_target_author = data[data['id'] == author_id]['gene'].unique()

# Make predictions for genes not mentioned by the target author.
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

predictions = []

for gene in genes_not_mentioned:
    prediction = model.predict(author_id, gene)
    predictions.append((author_id, gene, prediction.est))

# Sort the predictions in descending order
sorted_predictions = sorted(predictions, key=lambda x: x[2], reverse=True)

# Retrieve the author's name
author_name = data[data['id'] == author_id]['author/disease'].iloc[0]

# Calculate scores based on the recommendation position
# The higher the position, the lower the score
total_recommendations = len(sorted_predictions)
scores = [(total_recommendations - i) / total_recommendations * 100 for i in range(total_recommendations)]

# Display normalized scores
for i, (author, gene, _) in enumerate(sorted_predictions):
    print(f'Recommendation {i+1}: Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {scores[i]:.2f}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [6]:
best_k = best_params['k']
print(f'O valor de k selecionado é: {best_k}')

O valor de k selecionado é: 30
