In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib # For saving/loading model components
import os
import re # For basic text cleaning


In [8]:
COURSERA_DATA_PATH = '/content/Coursera.csv'

In [9]:
MODEL_DIR = 'course_recommender_model' # Directory to save model files
VECTORIZER_PATH = os.path.join(MODEL_DIR, 'tfidf_vectorizer.joblib')
COURSE_MATRIX_PATH = os.path.join(MODEL_DIR, 'course_tfidf_matrix.joblib')
COURSE_METADATA_PATH = os.path.join(MODEL_DIR, 'course_metadata.joblib')

In [10]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

In [11]:
def load_and_preprocess_data(file_path):
    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: Dataset file not found at {file_path}")
        print("Please download 'Coursera_courses.csv' from Kaggle and place it in the correct path.")
        return None

    # Select relevant columns and handle missing values
    # Renaming for easier access
    df = df.rename(columns={
        'Course Name': 'course_title',
        'Course Description': 'course_description',
        'Skills': 'course_skills'
    })

    # Ensure essential columns exist
    required_cols = ['course_title', 'course_description', 'course_skills', 'Course URL']
    for col in required_cols:
        if col not in df.columns:
            print(f"Error: Required column '{col}' not found in the dataset.")
            return None

    df['course_description'] = df['course_description'].fillna('')
    df['course_skills'] = df['course_skills'].fillna('')
    df['course_title'] = df['course_title'].fillna('') # Title is crucial

    # Drop rows where title is empty after fillna (shouldn't happen if CSV is good)
    df.dropna(subset=['course_title'], inplace=True)
    df = df[df['course_title'].str.strip() != '']


    # Combine text features for TF-IDF
    # Giving skills a bit more prominence by repeating them (optional heuristic)
    df['combined_features'] = df['course_title'].apply(clean_text) + ' ' + \
                              df['course_description'].apply(clean_text) + ' ' + \
                              (df['course_skills'].apply(clean_text) + ' ') * 2 # Repeat skills

    # Drop duplicates based on title and description (or URL if more robust)
    df.drop_duplicates(subset=['course_title', 'course_description'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    print(f"Data loaded and preprocessed. Shape: {df.shape}")
    return df[['course_title', 'Course URL', 'combined_features', 'course_skills', 'course_description']]


In [12]:
def load_model_components():
    print("Loading model components...")
    if not all([os.path.exists(p) for p in [VECTORIZER_PATH, COURSE_MATRIX_PATH, COURSE_METADATA_PATH]]):
        print("Model components not found. Please train the model first.")
        return None, None, None

    vectorizer = joblib.load(VECTORIZER_PATH)
    course_matrix = joblib.load(COURSE_MATRIX_PATH)
    course_metadata = joblib.load(COURSE_METADATA_PATH)
    print("Model components loaded successfully.")
    return vectorizer, course_matrix, course_metadata

In [13]:
def get_course_recommendations(job_title, job_description, top_n=3):
    vectorizer, course_matrix, course_metadata = load_model_components()

    if vectorizer is None:
        print("Model not loaded. Cannot provide recommendations.")
        return []

    # Combine job title and description to form the query
    query_text = clean_text(job_title + " " + job_description)
    if not query_text.strip():
        print("Query text is empty after cleaning. Cannot make recommendations.")
        return []

    # Transform the query using the loaded vectorizer
    query_vector = vectorizer.transform([query_text])

    # Calculate cosine similarity
    cosine_similarities = cosine_similarity(query_vector, course_matrix).flatten()

    # Get indices of top_n most similar courses
    # argsort returns indices that would sort the array. We take the last 'top_n' in reverse.
    related_courses_indices = cosine_similarities.argsort()[-top_n:][::-1]

    recommendations = []
    print(f"\nTop {top_n} course recommendations for '{job_title}':")
    for i, index in enumerate(related_courses_indices):
        similarity_score = cosine_similarities[index]
        if similarity_score > 0.01 : # Only recommend if there's some minimal similarity
            course_info = course_metadata.iloc[index]
            recommendations.append({
                "title": course_info['course_title'],
                "url": course_info['Course URL'],
                "similarity": similarity_score,
                "description": course_info['course_description'][:200] + "..." if course_info['course_description'] else "N/A", # Snippet
                "skills": course_info['course_skills'] if course_info['course_skills'] else "N/A"
            })
            print(f"  {i+1}. {course_info['course_title']} (Similarity: {similarity_score:.4f})")
            print(f"      URL: {course_info['Course URL']}")
            # print(f"      Skills: {course_info['course_skills']}")

    if not recommendations:
        print("No suitable courses found matching the criteria.")

    return recommendations


In [15]:
def train_model(df):
    if df is None or df.empty:
        print("Cannot train model: DataFrame is empty or None.")
        return None, None, None

    print("Training TF-IDF vectorizer...")
    # Using stop_words='english' and ngram_range for better feature capture
    # min_df=2: ignore terms that appear in less than 2 documents
    # max_df=0.8: ignore terms that appear in more than 80% of documents (too common)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.8)

    # Fit and transform the combined features
    course_tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])
    print(f"TF-IDF matrix shape: {course_tfidf_matrix.shape}")

    # Save the components
    os.makedirs(MODEL_DIR, exist_ok=True)
    joblib.dump(tfidf_vectorizer, VECTORIZER_PATH)
    joblib.dump(course_tfidf_matrix, COURSE_MATRIX_PATH)
    joblib.dump(df[['course_title', 'Course URL', 'course_skills', 'course_description']], COURSE_METADATA_PATH) # Save metadata for recommendations

    print(f"Model components saved to {MODEL_DIR}/")
    return tfidf_vectorizer, course_tfidf_matrix, df[['course_title', 'Course URL', 'course_skills', 'course_description']]


In [16]:
if __name__ == "__main__":
    # --- Step 1: (One-time or periodic) Training ---
    # Check if model components exist, if not, train and save.
    if not all([os.path.exists(p) for p in [VECTORIZER_PATH, COURSE_MATRIX_PATH, COURSE_METADATA_PATH]]):
        print("Model components not found. Training new model...")
        coursera_df = load_and_preprocess_data(COURSERA_DATA_PATH)
        if coursera_df is not None and not coursera_df.empty:
            train_model(coursera_df)
        else:
            print("Failed to load data. Exiting.")
            exit()
    else:
        print("Found existing model components. Loading them for prediction.")

    # --- Step 2: (For each prediction request) Getting Recommendations ---
    sample_job_title = "Data Scientist"
    sample_job_description = """
    We are looking for a Data Scientist to analyze large amounts of raw information
    to find patterns that will help improve our company. We will rely on you to build
    data products to extract valuable business insights. In this role, you should be
    highly analytical with a knack for analysis, math, and statistics. Critical thinking
    and problem-solving skills are essential for interpreting data. We want to see a passion
    for machine-learning and research. Your goal will be to help our company analyze trends
    to make better decisions. Responsibilities include undertaking data collection,
    preprocessing and analysis, building models to address business problems, and presenting
    information using data visualization techniques. Skills in Python, R, SQL, and machine learning are required.
    Familiarity with data frameworks like Spark or Hadoop is a plus.
    """

    recommendations = get_course_recommendations(sample_job_title, sample_job_description, top_n=3)

    # Example of how you might use the output
    if recommendations:
        print("\n--- Formatted Recommendations ---")
        for rec in recommendations:
            print(f"Title: {rec['title']}\nURL: {rec['url']}\nSimilarity: {rec['similarity']:.4f}\n")



Model components not found. Training new model...
Loading data from /content/Coursera.csv...
Data loaded and preprocessed. Shape: (3424, 8)
Training TF-IDF vectorizer...
TF-IDF matrix shape: (3424, 55862)
Model components saved to course_recommender_model/
Loading model components...
Model components loaded successfully.

Top 3 course recommendations for 'Data Scientist':
  1. SQL for Data Science (Similarity: 0.2660)
      URL: https://www.coursera.org/learn/sql-for-data-science
  2. Data Visualization with Python (Similarity: 0.2233)
      URL: https://www.coursera.org/learn/python-for-data-visualization
  3. Introduction to Data Analytics (Similarity: 0.2123)
      URL: https://www.coursera.org/learn/introduction-to-data-analytics

--- Formatted Recommendations ---
Title: SQL for Data Science
URL: https://www.coursera.org/learn/sql-for-data-science
Similarity: 0.2660

Title: Data Visualization with Python
URL: https://www.coursera.org/learn/python-for-data-visualization
Similarity: 