In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the exercise catalogue; the unnamed first CSV column is renamed to "index"
bigdf = pd.read_csv("megaGymDataset.csv").rename(columns={'Unnamed: 0': 'index'})

<h1>Data analysis</h1>

In [None]:
# Several exercises share the same title: keep only the last row per title,
# then show the title counts as a check (every count should now be 1)
bigdf = bigdf.drop_duplicates(subset='Title', keep='last')
bigdf.loc[:, 'Title'].value_counts()

In [None]:
# Number of exercises per difficulty level
bigdf['Level'].value_counts().plot(kind='barh')

In [None]:
# Number of exercises per exercise type
bigdf['Type'].value_counts().plot(kind='barh')

In [None]:
# Number of exercises per targeted body part
bigdf['BodyPart'].value_counts().plot(kind='barh')

In [None]:
# The ten highest-rated exercises
ratingSorted = bigdf.sort_values(by='Rating', ascending=False).head(10)
ratingSorted

In [None]:
# Histogram of the ratings as they come from the dataset
df_sorted = bigdf.sort_values(by="Rating")
rating_values = df_sorted["Rating"]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rating_values, bins=20, edgecolor="k", alpha=0.7)

ax.set_title("Distribution of Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
ax.grid(False)
# Clamp the x axis to the observed rating range and start the y axis at zero
ax.set_xlim(rating_values.min(), rating_values.max())
ax.set_ylim(0, ax.get_ylim()[1])
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from copy import deepcopy
import numpy as np

# --- Impute missing ratings with a k-nearest-neighbours regressor ---
from sklearn.model_selection import GridSearchCV

CATEGORICAL_COLS = ["Level", "Type", "BodyPart", "Equipment"]

# Encode the categorical columns ONCE on the whole frame. Factorizing the rated
# and the unrated rows separately would assign different integer codes to the
# same category, so the model would be queried with meaningless features.
# NOTE(review): the "index" id column is still used as a feature, exactly as
# before; it probably should not influence a distance-based model - confirm.
features = bigdf.drop(columns=["Title", "Desc", "RatingDesc"])
for col in CATEGORICAL_COLS:
    features[col] = pd.factorize(features[col])[0]

# Rows with a usable rating: present and different from 0
has_rating = features["Rating"].notna() & (features["Rating"] != 0)

# Training features and the values to be predicted
x = features.loc[has_rating].drop(columns=["Rating"])
y = features.loc[has_rating, "Rating"]

# Split first so that the hyper-parameter search never sees the test rows
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

# Grid search to find the best parameters. 'accuracy' is a classification
# metric and cannot score continuous targets; use a regression metric instead.
param_grid = {
    'n_neighbors': [3,5,7,9,11,13,15,17],
    'p': [1, 2]
}
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)
grid_search.fit(x_train, y_train)
params = grid_search.best_params_

# Training
knn = KNeighborsRegressor(n_neighbors = params['n_neighbors'], p = params["p"])
knn.fit(x_train, y_train)

pred = knn.predict(x_test)

print("Evaluating the prediction")
print(f"MAE: {mean_absolute_error(y_test, pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, pred):.2f}")
print(f"R-squared (R^2): {r2_score(y_test, pred):.2f}")

# New X: every exercise without a usable rating (missing or 0),
# encoded with the same category codes as the training data
x = features.loc[~has_rating].drop(columns=["Rating"])

# Amount of missing ratings before imputation
print("Nonvalues before:",bigdf["Rating"].isna().sum())

# Predict all missing ratings in one call and write them back by row label
if not x.empty:
    bigdf.loc[x.index, "Rating"] = knn.predict(x).round(decimals=1)

# Amount of missing ratings after imputation
print("Nonvalues after",bigdf["Rating"].isna().sum())

# Number of exercises still rated exactly 0
filtered_df = bigdf[bigdf["Rating"] == 0]
print(len(filtered_df))


In [None]:
# Histogram of the ratings after the KNN prediction step
df_sorted = bigdf.sort_values(by="Rating")
rating_values = df_sorted["Rating"]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(rating_values, bins=20, edgecolor="k", alpha=0.7)

ax.set_title("Distribution of Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
ax.grid(False)
# Clamp the x axis to the observed rating range and start the y axis at zero
ax.set_xlim(rating_values.min(), rating_values.max())
ax.set_ylim(0, ax.get_ylim()[1])
plt.show()

In [None]:
# Remove columns that are not needed downstream: the rating description and
# the id column ("index"). Dropping by name with errors='ignore' keeps the cell
# re-runnable; popping the first column by position would remove a different
# column on a second run.
bigdf = bigdf.drop(columns=['RatingDesc', 'index'], errors='ignore')
# Keep only exercises that have a description. The explicit copy makes bigdf an
# independent frame, so later column assignments do not act on a slice.
bigdf = bigdf[bigdf['Desc'].notna()].copy()

# Snapshot of the dataset after preprocessing
clean_df = bigdf.copy()

In [None]:
# Display the preprocessed dataset
clean_df

In [None]:
# Smaller dataframe: only the exercises whose title is listed in 100_exercises.csv
smalldf = pd.read_csv("100_exercises.csv", sep=";", names=["index", "Title"])

# Lower-case the titles on both sides so the join is case-insensitive
bigdf['Title'] = bigdf['Title'].str.lower()
smalldf['Title'] = smalldf['Title'].str.lower()
combined = bigdf.merge(smalldf, how='inner', on="Title")

combined

<h1>Cosine similarity</h1>

In [None]:
# Print the "bench press" row so its index label can be read off
bench_press_mask = bigdf["Title"] == "bench press"
print(bigdf[bench_press_mask])

In [None]:
# Merge the four category columns into one '|'-separated text per exercise,
# which is the input for the cosine-similarity recommender.
# Every part is cast to str, including Level: without the cast a missing Level
# would turn the whole merged value into NaN instead of a usable string.
bigdf["Merged"] = (
    bigdf["Type"].astype(str) + '|'
    + bigdf["BodyPart"].astype(str) + '|'
    + bigdf["Equipment"].astype(str) + '|'
    + bigdf["Level"].astype(str)
)

# The individual category columns are no longer needed on bigdf
bigdf = bigdf.drop(columns=["Type", "BodyPart", "Equipment", "Level"])

# The merged column
bigdf["Merged"]

In [None]:
# Turn every merged category string into a token-count vector

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
count_matrix = count.fit_transform(bigdf["Merged"])

# Dense copy of the sparse count matrix
liste = count_matrix.toarray()

In [None]:
# Pairwise cosine similarity between all exercises (rows of count_matrix)
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(count_matrix)

In [None]:
#sim_matrix

In [None]:
# Reset to a fresh 0..n-1 index so that row labels equal row positions in the
# similarity matrices (avoids indexing errors and NaN values in the recommender).
# This makes the previous index labels invalid.
# drop=True discards the old index instead of inserting it as an "index" column,
# which also keeps the cell safe to re-run.
bigdf = bigdf.reset_index(drop=True)

In [None]:
def recommender(data_frame, exercise_id, sim_matrix):
    """Rank all exercises by their similarity to one exercise.

    Parameters:
        data_frame: DataFrame with a "Title" column, one row per exercise, in
            the same row order as `sim_matrix`.
        exercise_id: row position of the reference exercise in `sim_matrix`.
        sim_matrix: square matrix of pairwise similarities.

    Returns:
        DataFrame with columns "Similarity" and "Title", indexed like
        `data_frame` and sorted by descending similarity.
    """
    # Pair scores and titles by POSITION and reuse the frame's own index.
    # Aligning a freshly numbered score column with the titles by label would
    # produce NaN rows whenever data_frame is not indexed 0..n-1.
    exercise_rec = pd.DataFrame(
        {
            "Similarity": sim_matrix[exercise_id],
            "Title": data_frame["Title"].to_numpy(),
        },
        index=data_frame.index,
    )
    return exercise_rec.sort_values(by="Similarity", ascending=False)

In [None]:
# Print the "bench press" row to read off its index after the index reset
bench_press_mask = bigdf["Title"] == "bench press"
print(bigdf[bench_press_mask])

In [None]:
# Exercises similar to bench press (row 450, as printed by the lookup cell)
df_by_cat = recommender(data_frame=bigdf, exercise_id=450, sim_matrix=sim_matrix)
df_by_cat

<h1>TF-IDF</h1>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# TF-IDF vectors of the exercise descriptions, ignoring English stop words
tfidf = TfidfVectorizer(stop_words="english")
overview_matrix = tfidf.fit_transform(bigdf.loc[:, "Desc"])
overview_matrix.shape

In [None]:
# Pairwise dot products of the TF-IDF rows; show the top-left 5x5 corner
similarity_matrix = linear_kernel(overview_matrix)
print(similarity_matrix[:5, :5])

In [None]:
# Lookup table: description text -> row label in bigdf
mapping = pd.Series(data=bigdf.index, index=bigdf["Desc"])
mapping

In [None]:
def recommender_by_desc(exercise_input, df, similarity_matrix, mapping):
    """Score every exercise against the exercise with the given description.

    Parameters:
        exercise_input: description text used as key into `mapping`.
        df: DataFrame with a "Title" column, rows ordered like
            `similarity_matrix`.
        similarity_matrix: pairwise similarity matrix (row i = scores of
            exercise i against all exercises).
        mapping: description -> row position lookup (e.g. a pandas Series).

    Returns:
        DataFrame with columns "Title" and "Similarity", in the row order of
        `df` (unsorted).
    """
    exercise_index = mapping[exercise_input]
    # A description that occurs more than once makes the lookup return a Series;
    # take its first hit by position. Checking for a Series (rather than
    # "not np.int64") also accepts plain ints and other integer widths.
    if isinstance(exercise_index, pd.Series):
        exercise_index = exercise_index.iloc[0]

    scores = list(similarity_matrix[exercise_index])
    titles = df["Title"].iloc[:len(scores)]
    return pd.DataFrame(
        {"Title": titles.to_numpy(), "Similarity": scores},
        index=titles.index,
    )


In [None]:
# Description-based similarities for the exercise at row 450; show the ten best
reference_desc = bigdf["Desc"].loc[450]
df_by_desc = recommender_by_desc(reference_desc, bigdf, similarity_matrix, mapping)
df_by_desc.sort_values(by="Similarity", ascending=False).head(10)

<h1>Combined recommender (Cosine similarity + TF-IDF)</h1>

In [None]:
# Average the category-based and the description-based score of each exercise
mean_similarity = (df_by_cat["Similarity"] + df_by_desc["Similarity"]) / 2
merged_df = df_by_cat.copy()
merged_df["Similarity"] = mean_similarity
merged_df = merged_df.sort_values(by="Similarity", ascending=False)
merged_df.head(10)

<h1>Constraint based recommender</h1>

In [None]:
# Work on a copy of the preprocessed dataset, which still has the separate
# Type / BodyPart / Equipment / Level columns (no merged column)
df = deepcopy(clean_df)

def knowledge_based_rec(dataframe, type=None, bodypart=None, equipment=None, level=None):
    """Return the ten best-rated exercises that satisfy the given constraints.

    Each constraint is an exact-match filter on its column; a falsy value
    (None or "") disables that filter.

    Returns:
        DataFrame with columns "Title" and "Rating", sorted by descending
        rating, at most ten rows.
    """
    constraints = (
        ("Type", type),
        ("BodyPart", bodypart),
        ("Equipment", equipment),
        ("Level", level),
    )
    for column, wanted in constraints:
        if wanted:
            dataframe = dataframe[dataframe[column] == wanted]

    ranked = dataframe[["Title", "Rating"]].sort_values(by="Rating", ascending=False)
    return ranked.iloc[:10]

# Example query: barbell chest exercises of any type and level
print(knowledge_based_rec(df, type="", bodypart="Chest", equipment="Barbell", level=""))

In [None]:
# Small dataframe: run the constraint-based recommender on the merged subset
df = deepcopy(combined)
def knowledge_based_rec(dataframe, type=None, bodypart=None, equipment=None, level=None):
    """Return the ten best-rated exercises that satisfy the given constraints.

    Each constraint is an exact-match filter on its column; a falsy value
    (None or "") disables that filter.

    Returns:
        DataFrame with columns "Title" and "Rating", sorted by descending
        rating, at most ten rows.
    """
    # Build every mask from the frame being filtered, not from the global `df`:
    # a mask taken from another frame only works by accident when both share
    # the same index, and it ignores the `dataframe` argument otherwise.
    if type:
        dataframe = dataframe[dataframe["Type"] == type]
    if bodypart:
        dataframe = dataframe[dataframe["BodyPart"] == bodypart]
    if equipment:
        dataframe = dataframe[dataframe["Equipment"] == equipment]
    if level:
        dataframe = dataframe[dataframe["Level"] == level]

    recommendations = dataframe[["Title" , "Rating"]]
    return recommendations.sort_values(by="Rating", ascending=False).iloc[:10]

# Example query on the small dataframe: barbell chest exercises of any type and level
print(knowledge_based_rec(df, type="", bodypart="Chest", equipment="Barbell", level=""))

<h1>Knowledge based GUI</h1>

In [None]:
# Display the merged small dataframe that the GUI works on
combined

In [None]:
import pandas as pd
import tkinter as tk
from tkinter import ttk

# The GUI works on its own deep copy of the merged small dataframe
df = combined.copy(deep=True)

# Main application window
root = tk.Tk()
root.title("Gym Exercise Recommender")

# Function to filter and display exercises
def recommend_exercises():
    """Fill the listbox with the exercises matching the three combobox choices.

    Reads the selected body part, level and type, keeps the rows of `df` that
    match all three exactly, and lists their titles numbered from 1.
    """
    wanted_bodypart = bodypart_var.get()
    wanted_level = level_var.get()
    wanted_type = type_var.get()

    matches_all = (
        (df['BodyPart'] == wanted_bodypart)
        & (df['Level'] == wanted_level)
        & (df['Type'] == wanted_type)
    )
    matches = df[matches_all]

    # Remove the previous results before showing the new ones
    exercise_list.delete(0, tk.END)

    for position, title in enumerate(matches['Title'], start=1):
        exercise_list.insert(tk.END, f'{position}. {title}')

# Widgets, packed top to bottom: one label + combobox per filter (options are
# the distinct values found in the data), the button, and the result list

bodypart_label = ttk.Label(root, text="Select Body Part:")
bodypart_label.pack()
my_bps = list(df["BodyPart"].unique())
bodypart_var = ttk.Combobox(root, values=my_bps)
bodypart_var.pack()

level_label = ttk.Label(root, text="Select Level:")
level_label.pack()
my_levels = list(df["Level"].unique())
level_var = ttk.Combobox(root, values=my_levels)
level_var.pack()

type_label = ttk.Label(root, text="Select Type:")
type_label.pack()
my_types = list(df["Type"].unique())
type_var = ttk.Combobox(root, values=my_types)
type_var.pack()

# Clicking the button refreshes the result list
recommend_button = ttk.Button(root, text="Recommend Exercises", command=recommend_exercises)
recommend_button.pack()

exercise_list = tk.Listbox(root)
exercise_list.pack()

# The event loop is intentionally left disabled; enable it to show the window
#root.mainloop()

In [None]:
"""
import pandas as pd
import tkinter as tk
from tkinter import ttk

# Load your gym exercise data into a Pandas DataFrame
# Make sure to specify the correct encoding if you have special characters
df = deepcopy(clean_df)

# Create a tkinter GUI
root = tk.Tk()
root.title("Gym Exercise Recommender")

# Function to filter and display exercises
def recommend_exercises():
    selected_bodypart = bodypart_var.get()
    selected_level = level_var.get()
    selected_type = type_var.get()

    filtered_df = df[
        (df['BodyPart'] == selected_bodypart) &
        (df['Level'] == selected_level) &
        (df['Type'] == selected_type)
    ]

    exercise_list.delete(0, tk.END)  # Clear the listbox

    for i, title in enumerate(filtered_df['Title']):
        exercise_list.insert(tk.END, f'{i + 1}. {title}')

# Create and configure GUI elements
bodypart_label = ttk.Label(root, text="Select Body Part:")
bodypart_label.pack()
my_bps = [i for i in df["BodyPart"].unique()]
bodypart_var = ttk.Combobox(root, values=my_bps)
bodypart_var.pack()

level_label = ttk.Label(root, text="Select Level:")
level_label.pack()
my_levels = [i for i in df["Level"].unique()]
level_var = ttk.Combobox(root, values=my_levels)
level_var.pack()

type_label = ttk.Label(root, text="Select Type:")
type_label.pack()
my_types = [i for i in df["Type"].unique()]
type_var = ttk.Combobox(root, values=my_types)
type_var.pack()

recommend_button = ttk.Button(root, text="Recommend Exercises", command=recommend_exercises)
recommend_button.pack()

exercise_list = tk.Listbox(root)
#exercise_list.pack()

#root.mainloop() """

<h1>Collaborative</h1>

In [None]:
# Collaborative filtering - earlier prototype, disabled by wrapping it in a
# string literal (nothing below is executed; the cell only evaluates the string).
# It reshaped the ratings via an intermediate "ratings2.csv" file and a pivot.
"""
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

ratings_df = pd.read_csv("user_ratings.csv")
ratings = ratings_df.melt(id_vars=['Users'], var_name='Exercise', value_name='Rating')
ratings.to_csv("ratings2.csv")

ratings = pd.read_csv("ratings2.csv")
ratings = ratings.drop("Unnamed: 0", axis=1)
ratings['Exercise'] = pd.factorize(ratings['Exercise'])[0]

# Use ratings DataFrame for pivoting
ratings_full = ratings.pivot(index="Users", columns="Exercise", values="Rating")

# Now you can use ratings_full for further analysis
ratings_full"""

In [None]:
# Collaborative filtering
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
import pandas as pd
import numpy as np

# Read the user ratings (one "Users" column plus one column per exercise)
ratings_df = pd.read_csv("user_ratings.csv")

# Wide -> long format: one (Users, Exercise, Rating) row per table cell
ratings = ratings_df.melt(id_vars=['Users'], var_name='Exercise', value_name='Rating')

# Replace the exercise names by integer ids
ratings['Exercise'] = pd.factorize(ratings['Exercise'])[0]

# An empty cell means "not rated". Drop those rows instead of filling them with
# 0: a 0 would be trained on as a real rating (outside the 1-10 scale) and would
# make the density below read 100 %.
ratings = ratings.dropna(subset=["Rating"]).astype(int)

# Training data in Surprise format
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratings[["Users", "Exercise", "Rating"]], reader)
trainset = data.build_full_trainset()

# Variables
user_rating = trainset.ur
item_rating = trainset.ir
num_users = trainset.n_users
num_items = trainset.n_items
num_ratings = trainset.n_ratings

print("Users", num_users)
print("Items", num_items)
print("Ratings", num_ratings)
# Share of the user x item grid that actually holds a rating
density = num_ratings / (num_users * num_items)
density = np.round(density, decimals=2)
sparsity = 1 - density
print("Density:", density*100, "%")
print("Sparsity:", sparsity*100, "%")

# The option key is "user_based" (with an underscore); the previous
# "user based" key was not recognised, so the library default applied.
# NOTE(review): "shrinkage" appears to matter only for the pearson_baseline
# similarity - confirm before relying on it.
sim_options = {"name": "pearson",
               "user_based": True,
               "shrinkage": 0}

rec = KNNBasic(sim_options=sim_options)
rec.fit(trainset)

user_sim_matrix = rec.sim
#user_sim_matrix
ratings

In [None]:
def userbased_cf(user, n=10, user_similarity_threshold=0.3, m=10, ratings_path="user_ratings.csv"):
    """User-based collaborative filtering: score unseen exercises for one user.

    Parameters:
        user: row label of the target user. The "Users" column is dropped after
            loading, so users are addressed by their 0-based row number.
        n: maximum number of similar users to take into account.
        user_similarity_threshold: minimum Pearson correlation for a user to
            count as similar.
        m: number of recommendations to return.
        ratings_path: CSV file with a "Users" column and one column per exercise.

    Returns:
        DataFrame with columns "exercise" and "exercise_score" (similarity-
        weighted mean of the mean-centred ratings), best score first, at most
        `m` rows.

    Raises:
        KeyError: if `user` is not a row label of the ratings table.
    """
    ratings2 = pd.read_csv(ratings_path).drop(columns="Users")
    ratings2.index.name = "Users"
    ratings2.columns.name = "Exercises"

    # Centre every user's ratings on that user's own mean rating
    ratings_norm = ratings2.subtract(ratings2.mean(axis=1), axis="rows")

    # Pearson correlation between users; the target user's own row is removed
    user_similarity = ratings_norm.T.corr().drop(index=user)

    # The (at most n) most similar users above the threshold, best first
    similar_users = (
        user_similarity.loc[user_similarity[user] > user_similarity_threshold, user]
        .sort_values(ascending=False)
        .iloc[:n]
    )

    # Exercises the target user has already rated ...
    picked_user_done = ratings_norm[ratings_norm.index == user].dropna(axis=1, how="all")
    # ... are removed from what the similar users have rated
    similar_user_exercises = (
        ratings_norm[ratings_norm.index.isin(similar_users.index)]
        .dropna(axis=1, how="all")
        .drop(columns=picked_user_done.columns, errors="ignore")
    )

    # Per exercise: mean of (user similarity * centred rating) over the similar
    # users that rated it; missing ratings are skipped by mean()
    weighted = similar_user_exercises.mul(similar_users, axis=0)
    item_score = (
        weighted.mean(axis=0)
        .rename_axis("exercise")
        .reset_index(name="exercise_score")
    )

    ranked_item_score = item_score.sort_values(by="exercise_score", ascending=False)
    return ranked_item_score.head(m)

In [None]:
# Recommendations for the user in row 4 of user_ratings.csv
userbased_cf(4)

In [None]:
# USER BASED + EVALUATION


#Import the train_test_split function
from sklearn.model_selection import train_test_split

# X is the long-format ratings frame; y holds the user ids used for stratifying
X = ratings.copy()
y = ratings["Users"]

# Split into training and test sets, stratified by user so that every user's
# ratings are spread over both parts in the same proportion
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=42)

# User x exercise matrix built from the training ratings only.
# Pairs that are absent from the training data stay NaN so that the
# per-exercise means ignore them; filling them with 0 would count every
# held-out rating as a rating of 0 and drag the means down.
r_matrix = X_train.pivot_table(values='Rating', index='Users', columns='Exercise')
print(r_matrix.head())

def cf_user_mean(user, exercise):
    """Predict a rating as the mean of the exercise's column in `r_matrix`.

    `user` is accepted for interface compatibility with `score` but not used.
    Exercises without a column in `r_matrix` get the default rating 5.0.
    """
    # No information about this exercise: fall back to the default rating
    if exercise not in r_matrix:
        return 5.0
    # Mean of all ratings stored for this exercise
    return r_matrix[exercise].mean()

#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    """Return the RMSE of `cf_model` on the test split.

    `cf_model` is called as cf_model(user, exercise) for every row of `X_test`
    and must return the predicted rating.
    """
    # One prediction per (user, exercise) pair of the test set
    y_pred = np.array([
        cf_model(user_id, exercise_id)
        for user_id, exercise_id in zip(X_test['Users'], X_test['Exercise'])
    ])

    # The ratings the users actually gave
    y_true = np.array(X_test['Rating'])

    return rmse(y_true, y_pred)

# Baseline model: predicts the constant rating 5.0 for every pair
def baseline(User, Exercise):
    """Return 5.0 regardless of the user and the exercise."""
    constant_rating = 5.0
    return constant_rating


# RMSE on the test split: constant baseline first, then the per-exercise mean model
print(score(baseline))
print(score(cf_user_mean))

<h1>Evaluation</h1>

In [None]:
#Evaluation

from eval_metrics import precision_recall_at_k
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from tabulate import tabulate

ratings_df = pd.read_csv("user_ratings.csv")

# Wide -> long format: one (Users, Exercise, Rating) row per table cell
ratings = ratings_df.melt(id_vars=['Users'], var_name='Exercise', value_name='Rating')

# Replace the exercise names by integer ids
ratings['Exercise'] = pd.factorize(ratings['Exercise'])[0]
# Drop unrated cells instead of filling them with 0, which would be evaluated
# as real ratings outside the 1-10 scale
ratings = ratings.dropna(subset=["Rating"]).astype(int)

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings.iloc[:,0:3], reader)

# Seed the split so the same rows are held out on every run
trainset, testset = train_test_split(data, test_size=0.3, random_state=0)

recommenders = (SVD, SVDpp, KNNBasic, KNNBaseline, BaselineOnly, NormalPredictor)
titles = ("SVD", "SVD++", "KNN-Basic", "KNN-Baseline", "Baseline", "Random")

# One table row per algorithm: precision and recall at k=4, where a rating of
# at least 4 counts as relevant
table = []
for title, rec_cls in zip(titles, recommenders):
    rec_alg = rec_cls()
    rec_alg.fit(trainset)
    predictions = rec_alg.test(testset)

    p, r = precision_recall_at_k(predictions, k=4, threshold=4)
    precision = "{:.3f}".format(p)
    recall = "{:.3f}".format(r)

    table.append([title, precision, recall])

header = ["recommenders", "precision", "recall"]
print(tabulate(table, header, tablefmt="pipe"))
#print(ratings.head(20))

In [None]:
ratings_df = pd.read_csv("user_ratings.csv")

# Wide -> long format: one (Users, Exercise, Rating) row per table cell
ratings = ratings_df.melt(id_vars=['Users'], var_name='Exercise', value_name='Rating')

# Replace the exercise names by integer ids
ratings['Exercise'] = pd.factorize(ratings['Exercise'])[0]
# Drop unrated cells instead of filling them with 0, which would be evaluated
# as real ratings outside the 1-10 scale
ratings = ratings.dropna(subset=["Rating"]).astype(int)

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings.iloc[:,0:3], reader)

# Seeded folds so every algorithm is evaluated on the same splits
kf = KFold(random_state=0)

# One table row per algorithm: mean RMSE / MAE / FCP over the folds plus the
# mean fit time. Uses `recommenders` and `titles` from the previous cell.
table = []
for title, rec_cls in zip(titles, recommenders):
    out = cross_validate(rec_cls(), data, ["rmse", "mae", "fcp",], kf)
    mean_rmse = "{:.3f}".format(np.mean(out["test_rmse"]))
    mean_mae = "{:.3f}".format(np.mean(out["test_mae"]))
    mean_fcp = "{:.3f}".format(np.mean(out["test_fcp"]))
    fit_time = "{:.3f}".format(np.mean(out["fit_time"]))

    table.append([title, mean_rmse, mean_mae, mean_fcp, fit_time])

In [None]:
# Render the cross-validation results as a pipe-style table
header = [
    "Recommenders",
    "Pred Accuracy [RMSE]",
    "Pred Accuracy [MAE]",
    "Pred Accuracy [FCP]",
    "Training Time [sec]",
]
print(tabulate(table, headers=header, tablefmt="pipe"))