Random Split 80/20

In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Prepare a reproducible random 80/20 split, tune k on the training portion
# only, and fit the final KNNWithMeans model on that same training portion.

SEED = 42            # fixed so the split (and the reported metrics) can be reproduced
TEST_FRACTION = 0.2  # share of interactions held out for evaluation
RATING_COLUMNS = ['id', 'gene', 'article_count']

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final3.csv', sep=',')

# Define the reader: ratings are article counts, from 1 up to the observed maximum
reader = Reader(rating_scale=(1, data['article_count'].max()))

# Full dataset in Surprise format (every interaction; not used for fitting below)
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Random split 80/20, done on the DataFrame so that a Surprise Dataset containing
# ONLY the training rows exists; the grid search needs one to avoid seeing test rows.
ratings = data[RATING_COLUMNS]
test_data = ratings.sample(frac=TEST_FRACTION, random_state=SEED)
train_data = ratings.drop(index=test_data.index)

train_dataset = Dataset.load_from_df(train_data, reader=reader)
trainset = train_dataset.build_full_trainset()

# A Surprise test set is a list of (raw user id, raw item id, true rating) tuples
testset = [
    (uid, iid, float(count))
    for uid, iid, count in test_data.itertuples(index=False, name=None)
]

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform the grid search on the training rows only, so the held-out test rows
# never influence which k is selected.
# NOTE(review): the 5 CV folds are still drawn by Surprise without a fixed seed;
# pass a seeded KFold as `cv` if the search itself must be exactly repeatable.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (lowest cross-validated RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Score the fitted model on the held-out test set: rating-error metrics first,
# then classification metrics after binarising ratings at a fixed threshold.

# Predict every (user, item) pair of the test set
predictions = model.test(testset)

# RMSE and MAE (Surprise prints each value itself)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Split each prediction into its estimated and its true rating
predicted_ratings = []
true_ratings = []
for pred in predictions:
    predicted_ratings.append(pred.est)
    true_ratings.append(pred.r_ui)

# A rating at or above the threshold counts as a positive label (1), else 0
threshold = 3.5
predicted_labels = [int(rating >= threshold) for rating in predicted_ratings]
true_labels = [int(rating >= threshold) for rating in true_ratings]

# Precision, recall and F1 on the binary labels
precision, recall, f1 = (
    scorer(true_labels, predicted_labels)
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

# Recommend genes the target author has not mentioned yet, ranked by the
# model's predicted score.

TOP_N = 10  # how many of the highest-scoring genes to display

# Target author for recommendation
author_id = 5177

# Fail early, with a clear message, if the author is absent from the data
author_rows = data[data['id'] == author_id]
if author_rows.empty:
    raise ValueError(f'Author ID {author_id} not found in the dataset')

# Retrieve the author's name
author_name = author_rows['author/disease'].iloc[0]

# Retrieve all genes mentioned by the target author
genes_target_author = author_rows['gene'].unique()

# Candidate genes: every gene in the data that the target author has not mentioned
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

# Score each candidate. A separate name is used so the evaluation `predictions`
# list is not overwritten with objects of a different kind.
gene_scores = [
    (author_id, gene, model.predict(author_id, gene).est)
    for gene in genes_not_mentioned
]

# Sort the predictions in descending order of predicted score
sorted_predictions = sorted(gene_scores, key=lambda x: x[2], reverse=True)

# Display only the top predictions; the full ranking stays in `sorted_predictions`
for author, gene, score in sorted_predictions[:TOP_N]:
    print(f'Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {score}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Autism-Focused Split (Single Author per Test)

In [2]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# Autism-focused split: hold out the first interaction of every author whose
# 'author/disease' value mentions "Autistic disorder", train on all remaining
# rows, tune k on those training rows only, and fit the final model.

RATING_COLUMNS = ['id', 'gene', 'article_count']
AUTISM_LABEL = 'Autistic disorder'

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final3.csv', sep=',')

# Define the reader: ratings are article counts, from 1 up to the observed maximum
reader = Reader(rating_scale=(1, data['article_count'].max()))

# Full dataset in Surprise format (every interaction; not used for fitting below)
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Literal substring match; rows with a missing 'author/disease' count as non-autistic
is_autistic = data['author/disease'].str.contains(AUTISM_LABEL, regex=False, na=False)

# Filter examples where the disease is "Autistic disorder"
autistic_data = data[is_autistic]

# Filter examples where the disease is not "Autistic disorder"
non_autistic_data = data[~is_autistic]

# The first row seen for each autistic author goes to the test set; any further
# rows of that author stay in training.
is_first_row_of_author = ~autistic_data['id'].duplicated()
test_data = autistic_data.loc[is_first_row_of_author, RATING_COLUMNS]
if test_data.empty:
    raise ValueError(f'No rows mention {AUTISM_LABEL!r}; the test set would be empty')

# IDs of the authors that contribute a test example
test_author_ids = test_data['id'].tolist()

# Combine non-autistic training data with the remaining autistic training data
train_data = pd.concat(
    [
        autistic_data.loc[~is_first_row_of_author, RATING_COLUMNS],
        non_autistic_data[RATING_COLUMNS],
    ],
    ignore_index=True,
)

# Build the Surprise sets from the split itself. Assigning `raw_ratings` on a
# Trainset built from the full dataset has no effect on training, so the trainset
# must come from a Dataset that holds only the training rows.
train_dataset = Dataset.load_from_df(train_data, reader=reader)
trainset = train_dataset.build_full_trainset()

# A Surprise test set is a list of (raw user id, raw item id, true rating) tuples
testset = [
    (uid, iid, float(count))
    for uid, iid, count in test_data.itertuples(index=False, name=None)
]

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform grid search with cross-validation on the training rows only, so the
# held-out test rows never influence which k is selected.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (lowest cross-validated RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Score the fitted model on the held-out test set: rating-error metrics first,
# then classification metrics after binarising ratings at a fixed threshold.

# Predict every (user, item) pair of the test set
predictions = model.test(testset)

# RMSE and MAE (Surprise prints each value itself)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Split each prediction into its estimated and its true rating
predicted_ratings = []
true_ratings = []
for pred in predictions:
    predicted_ratings.append(pred.est)
    true_ratings.append(pred.r_ui)

# A rating at or above the threshold counts as a positive label (1), else 0
threshold = 3.5
predicted_labels = [int(rating >= threshold) for rating in predicted_ratings]
true_labels = [int(rating >= threshold) for rating in true_ratings]

# Precision, recall and F1 on the binary labels
precision, recall, f1 = (
    scorer(true_labels, predicted_labels)
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

# Recommend genes the target author has not mentioned yet, ranked by the
# model's predicted score.

TOP_N = 10  # how many of the highest-scoring genes to display

# Target author for recommendation
author_id = 4817

# Fail early, with a clear message, if the author is absent from the data
author_rows = data[data['id'] == author_id]
if author_rows.empty:
    raise ValueError(f'Author ID {author_id} not found in the dataset')

# Retrieve the author's name
author_name = author_rows['author/disease'].iloc[0]

# Retrieve all genes mentioned by the target author
genes_target_author = author_rows['gene'].unique()

# Candidate genes: every gene in the data that the target author has not mentioned
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

# Score each candidate. A separate name is used so the evaluation `predictions`
# list is not overwritten with objects of a different kind.
gene_scores = [
    (author_id, gene, model.predict(author_id, gene).est)
    for gene in genes_not_mentioned
]

# Sort the predictions in descending order of predicted score
sorted_predictions = sorted(gene_scores, key=lambda x: x[2], reverse=True)

# Display only the top predictions; the full ranking stays in `sorted_predictions`
for author, gene, score in sorted_predictions[:TOP_N]:
    print(f'Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {score}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Disease-Based Split

In [3]:
import pandas as pd
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
from surprise.model_selection import GridSearchCV

# Disease-based split: every row whose 'author/disease' value mentions
# "Autistic disorder" is held out for testing; all other rows are used to tune
# k and to fit the final model.

RATING_COLUMNS = ['id', 'gene', 'n_articles']
AUTISM_LABEL = 'Autistic disorder'

# Load dataset
data = pd.read_csv('all_articles_files_names_merge_final4.csv', sep=',')

# Define the reader: ratings are article counts, from 1 up to the observed maximum
reader = Reader(rating_scale=(1, data['n_articles'].max()))

# Full dataset in Surprise format (every interaction; not used for fitting below)
dataset = Dataset.load_from_df(data[RATING_COLUMNS], reader=reader)

# Literal substring match; rows with a missing 'author/disease' count as non-autistic
is_autistic = data['author/disease'].str.contains(AUTISM_LABEL, regex=False, na=False)

# Filter examples where the disease is "Autistic disorder" for the test set
test_data = data[is_autistic]
if test_data.empty:
    raise ValueError(f'No rows mention {AUTISM_LABEL!r}; the test set would be empty')

# Filter examples where the disease is not "Autistic disorder" for the training set
train_data = data[~is_autistic]

# Build the trainset from the training rows only. Building it from the full
# dataset would put every test row into training and inflate the scores.
# NOTE(review): if the ids in the test rows never occur in the training rows,
# Surprise cannot find neighbours for them and falls back to its default
# prediction - confirm that this split measures what is intended.
train_dataset = Dataset.load_from_df(train_data[RATING_COLUMNS], reader=reader)
trainset = train_dataset.build_full_trainset()

# A Surprise test set is a list of (raw user id, raw item id, true rating) tuples
testset = list(test_data[RATING_COLUMNS].itertuples(index=False, name=None))

# Define the parameter space for GridSearch
param_grid = {'k': [10, 20, 30]}

# Perform grid search with cross-validation on the training rows only, so the
# held-out test rows never influence which k is selected.
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
grid_search.fit(train_dataset)

# Access the best found parameters (lowest cross-validated RMSE)
best_params = grid_search.best_params['rmse']

# Train the model with the training set using the best parameters
model = KNNWithMeans(k=best_params['k'])
model.fit(trainset)

# Score the fitted model on the held-out test set: rating-error metrics first,
# then classification metrics after binarising ratings at a fixed threshold.

# Predict every (user, item) pair of the test set
predictions = model.test(testset)

# RMSE and MAE (Surprise prints each value itself)
accuracy.rmse(predictions)
accuracy.mae(predictions)

# Split each prediction into its estimated and its true rating
predicted_ratings = []
true_ratings = []
for pred in predictions:
    predicted_ratings.append(pred.est)
    true_ratings.append(pred.r_ui)

# A rating at or above the threshold counts as a positive label (1), else 0
threshold = 3.5
predicted_labels = [int(rating >= threshold) for rating in predicted_ratings]
true_labels = [int(rating >= threshold) for rating in true_ratings]

# Precision, recall and F1 on the binary labels
precision, recall, f1 = (
    scorer(true_labels, predicted_labels)
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value}')

# Recommend genes the target author has not mentioned yet, ranked by the
# model's predicted score.

TOP_N = 10  # how many of the highest-scoring genes to display

# Target author for recommendation
author_id = 4817

# Fail early, with a clear message, if the author is absent from the data
author_rows = data[data['id'] == author_id]
if author_rows.empty:
    raise ValueError(f'Author ID {author_id} not found in the dataset')

# Retrieve the author's name
author_name = author_rows['author/disease'].iloc[0]

# Retrieve all genes mentioned by the target author
genes_target_author = author_rows['gene'].unique()

# Candidate genes: every gene in the data that the target author has not mentioned
genes_not_mentioned = data[~data['gene'].isin(genes_target_author)]['gene'].unique()

# Score each candidate. A separate name is used so the evaluation `predictions`
# list is not overwritten with objects of a different kind.
gene_scores = [
    (author_id, gene, model.predict(author_id, gene).est)
    for gene in genes_not_mentioned
]

# Sort the predictions in descending order of predicted score
sorted_predictions = sorted(gene_scores, key=lambda x: x[2], reverse=True)

# Display only the top predictions; the full ranking stays in `sorted_predictions`
for author, gene, score in sorted_predictions[:TOP_N]:
    print(f'Author ID: {author}, Author Name: {author_name}, Gene: {gene}, Score: {score}')


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi