In [1]:
# Import the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

filename=r'..\DatasetEnglish.csv'
data=pd.read_csv(filename)
data=data.drop(['index'], axis=1)
data.head()

data_use=data[['talk_title','talk_description','speakers_name', 'topic_0_name','topic_1_name','topic_2_name','topic_3_name',
              'topic_4_name','topic_5_name','topic_6_name','topic_7_name','transcript','related_talk_1_slug',
               'related_talk_2_slug','related_talk_3_slug','related_talk_4_slug','related_talk_5_slug']]

In [2]:
# Define a function to get the movie title from the movie id
def get_title_from_index(index):
    return data[data.index == index]['talk_slug'].values[0]

# Define a function to get the movie id from the movie title
def get_index_from_title(title):
    return data[data.talk_slug == title].index.values[0]

# Create a TF-IDF vectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
#data['genres'] = data['genres'].fillna('')

# Combine the columns that are relevant for the content-based filtering
data['combined_features'] = data['talk_slug'] + ' ' + data['talk_description'] + ' ' + data['speakers_name'] + ' ' +data['topic_0_name'] + ' ' + data['transcript']
data['combined_features'] = data['combined_features'].fillna('')

In [3]:
# Construct the TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [4]:
# Get the index of the movie that the user likes
talk_user_likes = data['talk_slug'].loc[1]
talk_index = get_index_from_title(talk_user_likes)

# Get the list of similar movies in descending order of similarity score
similar_talks = list(enumerate(cosine_sim[talk_index]))
similar_talks = sorted(similar_talks, key=lambda x: x[1], reverse=True)

# Print the titles of the first 10 movies
print('Talks similar to ' + talk_user_likes + ' are:\n')
for i in range(10):
    print(get_title_from_index(similar_talks[i][0]))

Talks similar to aala_el_khani_what_it_s_like_to_be_a_parent_in_a_war_zone are:

aala_el_khani_what_it_s_like_to_be_a_parent_in_a_war_zone
yuko_munakata_the_science_behind_how_parents_affect_child_development
helen_pearson_lessons_from_the_longest_study_on_human_development
georgette_mulheir_the_tragedy_of_orphanages
joel_baraka_the_board_game_getting_kids_excited_about_school
heejae_lim_the_most_powerful_yet_overlooked_resource_in_schools
david_miliband_the_refugee_crisis_is_a_test_of_our_character
melissa_fleming_let_s_help_refugees_thrive_not_just_survive
brian_sokol_what_photos_don_t_tell_you_about_the_refugee_experience
sonia_livingstone_parenting_in_the_digital_age


In [None]:
from surprise import Dataset, Reader, KNNWithMeans, accuracy
from surprise.model_selection import cross_validate

# Define a function to get the similarity score between two movies based on their titles
def get_similarity_score(title1, title2):
    id1 = get_id_from_title(title1)
    id2 = get_id_from_title(title2)
    index1 = data[data.movieId == id1].index.values[0]
    index2 = data[data.movieId == id2].index.values[0]
    return cosine_sim[index1][index2]

# Define a function to get the related talk slug from the movie title
def get_related_talk_slug(title, n):
    return data[data.title == title]['related_talk_' + str(n) + '_slug'].values[0]

# Define a function to get the topic name from the movie title
def get_topic_name(title, n):
    return data[data.title == title]['topic_' + str(n) + '_name'].values[0]

# Create a reader object with the rating scale
reader = Reader(rating_scale=(0.5, 5))

# Load the ratings data into a Surprise dataset
#ratings_data = Dataset.load_from_df(data[['userId', 'title', 'rating']], reader)

# Define a custom similarity measure that uses the cosine similarity between the movie titles
sim_options = {
    'name': 'custom',
    'user_based': False, # item-based
    'min_support': 1, # minimum number of common items
    'sim_func': get_similarity_score # custom similarity function
}

# Create a KNNWithMeans algorithm using the custom similarity measure
algo = KNNWithMeans(sim_options=sim_options)

# Perform 5-fold cross-validation and print the precision and recall for each fold
cross_validate(algo, ratings_data, measures=['precision', 'recall'], cv=5, verbose=True)

# Define a function to get the top n recommendations for a given user
def get_top_n_recommendations(user, n):
    # Get the list of all movie titles
    titles = data['title'].unique()
    # Get the list of movies that the user has already rated
    rated_movies = data[data['userId'] == user]['title'].unique()
    # Remove the rated movies from the list of all movie titles
    titles = [title for title in titles if title not in rated_movies]
    # Predict the ratings for the remaining movies
    predictions = [algo.predict(user, title) for title in titles]
    # Sort the predictions by the estimated rating in descending order
    predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    # Return the top n movie titles
    return [prediction.iid for prediction in predictions[:n]]

# Define a function to calculate the accuracy of the model predicting related talk slugs
def get_related_talk_slug_accuracy(user, n):
    # Get the top n recommendations for the user
    recommendations = get_top_n_recommendations(user, n)
    # Initialize the number of correct predictions and the total number of predictions
    correct = 0
    total = 0
    # For each recommendation, check if the related talk slug matches the actual one
    for recommendation in recommendations:
        # Get the related talk slug from the recommendation
        predicted_slug = get_related_talk_slug(recommendation, 1)
        # Get the actual related talk slug from the data
        actual_slug = data[data['title'] == recommendation]['related_talk_1_slug'].values[0]
        # Compare the predicted and actual slugs
        if predicted_slug == actual_slug:
            # If they match, increment the number of correct predictions
            correct += 1
        # Increment the total number of predictions
        total += 1
    # Return the accuracy as the ratio of correct predictions to total predictions
    return correct / total

# Define a function to calculate the accuracy of the model predicting topics assigned to it
def get_topic_name_accuracy(user, n):
    # Get the top n recommendations for the user
    recommendations = get_top_n_recommendations(user, n)
    # Initialize the number of correct predictions and the total number of predictions
    correct = 0
    total = 0
    # For each recommendation, check if the topic name matches the actual one
    for recommendation in recommendations:
        # Get the topic name from the recommendation
        predicted_topic = get_topic_name(recommendation, 0)
        # Get the actual topic name from the data
        actual_topic = data[data['title'] == recommendation]['topic_0_name'].values[0]
        # Compare the predicted and actual topics
        if predicted_topic == actual_topic:
            # If they match, increment the number of correct predictions
            correct += 1
        # Increment the total number of predictions
        total += 1
    # Return the accuracy as the ratio of correct predictions to total predictions
    return correct / total

# Test the accuracy functions for a sample user and n
user = 1
n = 10
print('The accuracy of the model predicting related talk slugs for user ' + str(user) + ' and n = ' + str(n) + ' is: ' + str(get_related_talk_slug_accuracy(user, n)))
print('The accuracy of the model predicting topics assigned to it for user ' + str(user) + ' and n = ' + str(n) + ' is: ' + str(get_topic_name_accuracy(user, n)))
