In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../dataset/final_courses_data.csv')

In [3]:
data

Unnamed: 0,id,title,description,level,rating,reviews_num,link,image,combined_features
0,3f63dea7-ba56-4b8b-a759-c40cdfc4c9d5,(ISC)² Systems Security Certified Practitioner...,Pursue better IT security job opportunities an...,0,4.7,492.0,https://www.coursera.org/specializations/sscp-...,-,(ISC)² Systems Security Certified Practitioner...
1,f942f9e2-bf39-4516-8508-d2069def9a67,.NET FullStack Developer,Develop the proficiency required to design and...,1,4.3,51.0,https://www.coursera.org/specializations/dot-n...,-,.NET FullStack Developer Develop the proficien...
2,e3a85712-7797-415f-90c1-98763d92717f,21st Century Energy Transition: how do we make...,"Affordable, abundant and reliable energy is fu...",0,4.8,62.0,https://www.coursera.org/learn/21st-century-en...,-,21st Century Energy Transition: how do we make...
3,bf33fa5d-722f-4b18-b5e6-33c46c8d4484,A Crash Course in Causality: Inferring Causal...,We have all heard the phrase “correlation does...,1,4.7,517.0,https://www.coursera.org/learn/crash-course-in...,-,A Crash Course in Causality: Inferring Causal...
4,c6469078-25fc-4451-91b0-6436d9c40a8b,AI Applications in Marketing and Finance,"In this course, you will learn about AI-powere...",3,4.7,140.0,https://www.coursera.org/learn/wharton-ai-appl...,-,AI Applications in Marketing and Finance In th...
...,...,...,...,...,...,...,...,...,...
5931,29f79bd5-2b16-4a5f-bc35-1db91f42f406,Game Theory Algorithms in Competitive Programm...,"Dive deep into game theory algorithms, learn &...",3,-,124 reviews,https://www.udemy.com/course/game-theory-algor...,https://img-b.udemycdn.com/course/240x135/3878...,Game Theory Algorithms in Competitive Programm...
5932,cd6c35e8-0ce1-485e-aa08-3c77d3c35568,"Siemens WinCC SCADA Programming, SCADA1 ( Basic )",This course is a great push for any one who wa...,2,-,124 reviews,https://www.udemy.com/course/siemens-wincc-scada/,https://img-b.udemycdn.com/course/240x135/2858...,"Siemens WinCC SCADA Programming, SCADA1 ( Basi..."
5933,63b8bf7f-dead-490b-95f9-c6054bc99a0c,Python Object Oriented Programming (OOP): Begi...,Deep OOP Foundations From Absolute Scratch,3,-,124 reviews,https://www.udemy.com/course/object-oriented-p...,https://img-b.udemycdn.com/course/240x135/4450...,Python Object Oriented Programming (OOP): Begi...
5934,381f0cdd-fab1-4200-b8a6-351a26de92eb,jQuery Basics Guide,Everything you need to know to Build a Retirem...,0,-,124 reviews,https://www.udemy.com/course/learn-basic-jquery/,https://img-b.udemycdn.com/course/240x135/2554...,jQuery Basics Guide Everything you need to kno...


In [4]:
import pickle

data.to_pickle('../pickle/courses_data.pkl')

### Exact Match + Fuzzy Search

In [5]:
from fuzzywuzzy import process
import re

def normalize_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def search_courses(query):
    normalized_query = normalize_text(query)
    data['normalized_title'] = data['title'].apply(normalize_text)
    
    exact_matches = data[data['normalized_title'].str.contains(normalized_query, na=False)]
    non_exact_titles = data[~data['normalized_title'].str.contains(normalized_query, na=False)]['title'].tolist()
    fuzzy_results = process.extract(query, non_exact_titles, limit=5)
    
    exact_match_results = exact_matches[['id', 'title', 'normalized_title']]
    exact_match_results['similarity'] = 100

    fuzzy_match_df = pd.DataFrame(fuzzy_results, columns=['title', 'similarity'])
    fuzzy_match_df = fuzzy_match_df[fuzzy_match_df['similarity'] > 0]
    fuzzy_match_df = fuzzy_match_df.merge(data[['title', 'id']], on='title', how='left')
    
    combined_results = pd.concat([exact_match_results[['id', 'similarity']], fuzzy_match_df[['id', 'similarity']]], ignore_index=True)
    combined_results = combined_results.sort_values(by='similarity', ascending=False)

    return combined_results

In [6]:
query = "Data Science Course"
# query = input("enter course name or keywords:")

results = search_courses(query)

# for index, row in results.iterrows():
#     print(f"{row['id']}")

course_ids_array = results['id'].to_numpy()
print(course_ids_array)

['a7835640-f4e6-4e40-881c-17d93b68240b'
 'bf33fa5d-722f-4b18-b5e6-33c46c8d4484'
 'ad6ffefb-3966-4378-ac2b-87e40e506ce6'
 '2aaa10c6-d048-4e63-bad9-722cc1aa6f87'
 '3141d834-a861-4e5f-9839-e2bf31e765c8'
 'ea7d78ef-77e3-4f9c-a608-aaac13974ad7']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exact_match_results['similarity'] = 100


In [7]:
with open('../pickle/results_model.pkl', 'wb') as file:
    pickle.dump(results, file)