In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load Dataset
df = pd.read_csv("Coursera.csv")
df.head(3)

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...


In [3]:
#Checking for Null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522 entries, 0 to 3521
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Course Name         3522 non-null   object
 1   University          3522 non-null   object
 2   Difficulty Level    3522 non-null   object
 3   Course Rating       3522 non-null   object
 4   Course URL          3522 non-null   object
 5   Course Description  3522 non-null   object
 6   Skills              3522 non-null   object
dtypes: object(7)
memory usage: 192.7+ KB


In [4]:
df['Course Description'].head(3)

0    Write a Full Length Feature Film Script  In th...
1    By the end of this guided project, you will be...
2    This course consists of a general presentation...
Name: Course Description, dtype: object

In [5]:
df['Skills'].head(3)

0    Drama  Comedy  peering  screenwriting  film  D...
1    Finance  business plan  persona (user experien...
2    chemistry  physics  Solar Energy  film  lambda...
Name: Skills, dtype: object

The content of both of the above columns are in plain text and are therefore not directly compareable with each other. We can use the Term Frequency-Inverse Document Frequency (TF-IDF) to score words based on how often they appear in a given course based on how often it appears in other courses.

In [6]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['Course Description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(3522, 20074)

In [7]:
row_array = tfidf_matrix.getrow(1).toarray()
row_array

array([[0., 0., 0., ..., 0., 0., 0.]])

In [8]:
%%time
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

CPU times: total: 281 ms
Wall time: 294 ms


In [9]:
cosine_sim

array([[1.00000000e+00, 3.12366523e-02, 1.97603991e-02, ...,
        3.17538002e-02, 3.33859933e-02, 1.96231367e-02],
       [3.12366523e-02, 1.00000000e+00, 8.58915185e-03, ...,
        3.13671991e-02, 4.88239107e-03, 4.56033552e-02],
       [1.97603991e-02, 8.58915185e-03, 1.00000000e+00, ...,
        3.45669421e-03, 1.65197252e-02, 6.37237740e-03],
       ...,
       [3.17538002e-02, 3.13671991e-02, 3.45669421e-03, ...,
        1.00000000e+00, 5.07544593e-04, 6.72367274e-03],
       [3.33859933e-02, 4.88239107e-03, 1.65197252e-02, ...,
        5.07544593e-04, 1.00000000e+00, 1.14068789e-03],
       [1.96231367e-02, 4.56033552e-02, 6.37237740e-03, ...,
        6.72367274e-03, 1.14068789e-03, 1.00000000e+00]])

In [10]:
cosine_sim.shape

(3522, 3522)

This matrix is symetric in the sense that cosine_sim[0, 1] tell us how much the first movie (index 0) is similar to the second movie (index 1), which returns the exact same value as cosine_sim[1, 0], since that would compare the same two movies. We will test this in the two lines below

In [11]:
cosine_sim[0, 1]

np.float64(0.0312366522978012)

In [12]:
cosine_sim[1, 0]

np.float64(0.0312366522978012)

To see if the corresponding courses in the dataset, we will use a reverse map of index to course names.

In [13]:
#Construct a reverse map of indices and Course Names
indices = pd.Series(df.index, index=df['Course Name']).drop_duplicates()
indices[0:10]

Course Name
Write A Feature Length Screenplay For Film Or Television                                         0
Business Strategy: Business Model Canvas Analysis with Miro                                      1
Silicon Thin Film Solar Cells                                                                    2
Finance for Managers                                                                             3
Retrieve Data using Single-Table SQL Queries                                                     4
Building Test Automation Framework using Selenium and TestNG                                     5
Doing Business in China Capstone                                                                 6
Programming Languages, Part A                                                                    7
The Roles and Responsibilities of Nonprofit Boards of Directors within the Governance Process    8
Business Russian Communication. Part 3                                                           

Here we can see that the similarity cosine_sim[0,1] is the similarity between 
'Write A Feature Length Screenplay For Film Or Television' and 'Business Strategy: Business Model Canvas Analysis with Miro    '

Defining a recommender function that takes in a course description and returns the ten most similar courses

In [14]:
# Function that takes in Course Name as input and outputs most similar courses
def get_recommendations(CourseName, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[CourseName]

    # Get the pairwsie similarity scores of all courses with that course
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the courses based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar courses
    sim_scores = sim_scores[1:11]

    # Get the course indices
    course_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['Course Name'].iloc[course_indices]

In [15]:
get_recommendations('Finance for Managers')

1839    Fundamentals of financial and management accou...
1891          Accounting and Finance for IT professionals
1985                  Introduction to Finance: The Basics
419                    Finance for Non-Financial Managers
1164                         Corporate Finance Essentials
708     Understanding Financial Statements: Company Po...
1090                    Financial Accounting Fundamentals
590                Corporate finance: Know your numbers 2
3119    Introduction to Finance: The Role of Financial...
3463    Operations Management: Analysis and Improvemen...
Name: Course Name, dtype: object

In [16]:
get_recommendations('Write A Feature Length Screenplay For Film Or Television')

1481    Script Writing: Write a Pilot Episode for a TV...
1629                               Write Your First Novel
3481                                   Transmedia Writing
2186         Presentation skills: Public Speaking Project
3445                   Better Business Writing in English
3384              English for Effective  Business Writing
2894    Automating Team Communication with Google Shee...
614                      Writing in English at University
2732    Writing Professional Email and Memos (Project-...
104                                      Business Writing
Name: Course Name, dtype: object

Now to do the same for skills we will initialize a tfidf for the skills column

In [17]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix_skills = tfidf.fit_transform(df['Skills'])

#Output the shape of tfidf_matrix
tfidf_matrix_skills.shape

(3522, 4337)

In [18]:
%%time
cosine_sim_skills = cosine_similarity(tfidf_matrix_skills, tfidf_matrix_skills)

CPU times: total: 172 ms
Wall time: 172 ms


In [19]:
cosine_sim_skills.shape

(3522, 3522)

In [20]:
# Function that takes in Course Name as input and outputs most similar courses
def get_recommendations_skills(Skills, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[Skills]

    # Get the pairwsie similarity scores of all courses with that course
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the courses based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar courses
    sim_scores = sim_scores[1:11]

    # Get the course indices
    skill_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['Skills'].iloc[skill_indices]

In [21]:
get_recommendations_skills('Finance for Managers')

1839    Management Accounting  contribution margin  ev...
1891    capital budgeting  future value  net income  A...
1985    mortgage loan  financial statement  asset mana...
419     balance sheet  income statement  income  prese...
1164    Economics  Finance  investment  evaluation  Ec...
708     fundamental analysis  asset  balance sheet  cu...
1090    Finance  cash flow statement  balance sheet  i...
590     capital adequacy ratio  numbers (spreadsheet) ...
3119    capital budgeting  common stock  cash flow  pr...
3463    Inventory Management  Trigonometric Integral  ...
Name: Skills, dtype: object

In [22]:
get_recommendations_skills('Write A Feature Length Screenplay For Film Or Television')

1481    bible  film  film studies  Cinematography  Wri...
1629    art  Interior Design  Fiction Writing  languag...
3481    Fiction Writing  film  Writing  determination ...
2186    Peer Review  project  public speaking  write-o...
3445    Writing  Office Administration  business admin...
3384    Communication  Business Writing  email  Busine...
2894    google apps script  project  Planning  email  ...
614     language  ordered pair  Proofreading  essay wr...
2732    grammar  email writing  Note Taking  Writing  ...
104     Business Writing  email writing  Writing  engl...
Name: Skills, dtype: object

## 3 and 4

3.

In [23]:
# Load Datasets
ratings_df = pd.read_csv("Books_Ratings.csv")
ratings_df.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [24]:
books_df = pd.read_csv("Books.csv")
books_df.head(3)

  books_df = pd.read_csv("Books.csv")


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


As both of the datasets contain an ISBN column we can join the dataset on this column

In [25]:
# Ensure ISBNs are strings for proper merging
ratings_df["ISBN"] = ratings_df["ISBN"].astype(str)
books_df["ISBN"] = books_df["ISBN"].astype(str)

In [26]:
rating_df = ratings_df.merge(books_df[["ISBN", "Book-Title"]], on="ISBN")


In [27]:
rating_df.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,276726,0155061224,5,Rites of Passage
2,276727,0446520802,0,The Notebook


In [28]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031136 entries, 0 to 1031135
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1031136 non-null  int64 
 1   ISBN         1031136 non-null  object
 2   Book-Rating  1031136 non-null  int64 
 3   Book-Title   1031136 non-null  object
dtypes: int64(2), object(2)
memory usage: 31.5+ MB


In [29]:
rating_df.nunique()

User-ID         92106
ISBN           270151
Book-Rating        11
Book-Title     241071
dtype: int64

Since we want to sort the users based on how many ratings they have given, we will drop

Next we sort the users based on how many book-ratings that they have given. To do this we will group by the User-ID and a a new feature holding the amount of Book-Ratings the specific user has given

In [30]:
rating_df["Book-Rating-Count"] = (rating_df.groupby("User-ID")["Book-Rating"].transform('count'))
rating_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Rating-Count
0,276725,034545104X,0,Flesh Tones: A Novel,1
1,276726,0155061224,5,Rites of Passage,1
2,276727,0446520802,0,The Notebook,1
3,276729,052165615X,3,Help!: Level 1,2
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,2


Now we will sort the users based on the column Book-Rating-Count

In [31]:
df_sorted = rating_df.sort_values(by="Book-Rating-Count", ascending=False)
df_sorted

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Rating-Count
39823,11676,0001944711,6,Count Duckula: Vampire Vacation,11144
39822,11676,0001941941,0,Little Grey Rabbit's Christmas (Collins Colour...,11144
39821,11676,0001900277,0,Glue (First Facts - First Skills),11144
39820,11676,0001848445,0,THE COAL HOUSE T/PB,11144
39819,11676,0001841572,0,Red Shift,11144
...,...,...,...,...,...
523324,140915,0330266357,5,Ancestral Vices,1
523323,140909,0812508920,8,China Mountain Zhang,1
911801,243723,0425098400,8,Celestial Navigation,1
911918,243757,0802140505,8,The Everlasting Stream: A True Story of Rabbit...,1


Now we have a count for every user, however the same user will still appear multiple times. We will need to fic this if we want only the top 200 users

In [32]:
# Now drop duplicate User-IDs, keeping only the first row per user
filtered_rating = df_sorted.drop_duplicates(subset="User-ID")
filtered_rating = filtered_rating.head(200)
filtered_rating

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Rating-Count
39823,11676,0001944711,6,Count Duckula: Vampire Vacation,11144
734566,198711,0671543180,0,DEAD MANS FOLLY (Hercule Poirot Mysteries (Pap...,6456
566200,153662,0060082895,0,Captured Innocence (Avon Romantic Treasure),5814
364228,98391,0373750072,8,"Predicting Rain (Harlequin American Romance, 1...",5779
145142,35859,1885171129,0,Calling the Midwest Home: A Lively Look at the...,5646
...,...,...,...,...,...
351009,94923,037541603X,0,Magic Terror,633
692770,187517,0060560754,7,The Dim Sum of All Things,631
982495,262998,0028603702,7,The High-Performance Cookbook: 150 High-Carb R...,630
484338,129716,0062509594,10,Essential Rumi,624


In [33]:
random_user = np.array(filtered_rating.sample(random_state = 13).index)[0]
random_user

np.int64(555570)

In [34]:
random_user_df = filtered_rating[filtered_rating.index == random_user]
random_user_df

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Rating-Count
555570,150124,7612591,9,Detox for Health: The 7-Day Detox Program Comb...,675


In [39]:
def user_based_recommender(input_user, user_book_df, num_recommendations=5):
    # Creating a list of movies the input user have rated
    input_user_df = user_book_df[user_book_df['User-ID'] == input_user]
    input_user_books_rated = input_user_df.columns[input_user_df.notna().any()].tolist()

    # Creating a dataframe with the user rating of the movies the input user have rated
    books_rated_df = user_book_df[input_user_books_rated]

    # Counting how many movies other users have rated that the input user have also rated
    user_book_count = books_rated_df.T.notnull().sum()
    user_book_count = books_rated_count.reset_index()
    user_book_count.columns = ["User-ID", "Book-Rating-Count"]
    
    # Selecting similar users over based on a rating similarity count ratio threshold
    user_same_books = user_book_count[user_book_count["book_count"]] > (len(input_user_books_rated))["User-ID"]

    # Creating a correlation matrix based on ratings
    final_df = books_rated_df[books_rated_df.index.isin(user_same_books)]
    corr_df = final_df.T.corr()

    # Created top correlated users
    user_corr = corr_df[input_user].reset_index()
    user_corr = user_corr.rename(columns={'index': 'User-ID', random_user: 'correlation'})
    user_corr = user_corr.sort_values(by="correlation", ascending=False)
    user_corr = user_corr.loc[user_corr["User-ID"] != input_user]
    user_corr = user_corr.reset_index(drop=True)

    # Creating correlated weighting of rating
    top_users_ratings = user_corr.merge(ratings_df[["User-ID", "ISBN", "rating"]], how="inner")
    top_users_ratings["weighted_rating"] = top_users_ratings["correlation"] * top_users_ratings["rating"]

    users_ratings["weighted_rating"] = users_ratings["correlation"] * users_ratings["Book-Rating"]
    users_ratings

    # Creating a recommendation dataframe
    recommendation_df = users_ratings.groupby("ISBN").agg({"weighted_rating": "mean"}).sort_values(by = "weighted_rating", ascending = False)
    recommendation_df = recommendation_df.reset_index()

    # Creating the final recommendations
    books_to_be_recommended = recommendation_df.merge(books_df[["ISBN", "Book-Title"]], on="ISBN")
    books_to_be_recommended = books_to_be_recommended.head()

    return books_to_be_recommended["Book-Title"]

In [40]:
user_based_recommender('150124', ratings_df)

NameError: name 'books_rated_count' is not defined