In [1]:
import kagglehub

# Download the latest version of the Coursera course dataset through kagglehub.
# `path` holds whatever `dataset_download` returns; it is printed below as the
# location of the dataset files.
path = kagglehub.dataset_download("siddharthm1698/coursera-course-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/coursera-course-dataset


In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

class SkillGapAnalyzer:
    """Find missing skills and recommend courses that cover them.

    A course catalogue is read from a CSV file, one lower-cased text field per
    course is vectorized with TF-IDF, and recommendations are ranked by cosine
    similarity between a query built from the missing skills and each course.
    """

    # Canonical skill name -> spellings / related terms folded into it.
    # Insertion order is significant: the first canonical skill whose pattern
    # matches wins.
    _SKILL_SYNONYMS = {
        # Programming languages
        'python': ['python', 'python3', 'py'],
        'javascript': ['javascript', 'js', 'node.js', 'nodejs'],
        'java': ['java', 'java programming'],
        'sql': ['sql', 'mysql', 'postgresql', 'database'],

        # Data Science
        'machine learning': ['machine learning', 'ml', 'deep learning', 'ai'],
        'data analysis': ['data analysis', 'data analytics', 'analytics'],
        'data visualization': ['data visualization', 'tableau', 'power bi', 'visualization'],

        # Web Development
        'html': ['html', 'html5'],
        'css': ['css', 'css3', 'styling'],
        'react': ['react', 'reactjs', 'react.js'],

        # Cloud & DevOps
        'aws': ['aws', 'amazon web services', 'cloud'],
        'docker': ['docker', 'containerization', 'containers'],
        'kubernetes': ['kubernetes', 'k8s', 'orchestration'],

        # Soft Skills
        'communication': ['communication', 'verbal communication', 'presentation'],
        'leadership': ['leadership', 'team management', 'management'],
        'project management': ['project management', 'agile', 'scrum'],
    }

    # One compiled pattern per canonical skill. Variants must be delimited by
    # non-alphanumeric characters (or the string edges); plain substring tests
    # would map 'html' -> 'machine learning' (contains 'ml'), 'containers' ->
    # 'machine learning' (contains 'ai') and 'numpy' -> 'python' (contains 'py').
    _SKILL_PATTERNS = [
        (
            canonical,
            re.compile(
                r'(?<![a-z0-9])(?:'
                + '|'.join(re.escape(variant) for variant in variants)
                + r')(?![a-z0-9])'
            ),
        )
        for canonical, variants in _SKILL_SYNONYMS.items()
    ]

    def __init__(self, coursera_dataset_path, debug=False):
        """
        Load the course dataset and build the TF-IDF index.

        Args:
            coursera_dataset_path: Path to the CSV file, passed to `pd.read_csv`.
                A skills/description-like column is used when present; otherwise
                text columns among the first five are combined.
            debug: When True, print dataset details and matching diagnostics.

        Raises:
            ValueError: If no usable text column or no non-empty course remains.
        """
        self.courses_df = pd.read_csv(coursera_dataset_path)
        self.debug = debug

        if debug:
            print("üìä Dataset Info:")
            print(f"Columns: {self.courses_df.columns.tolist()}")
            print(f"Shape: {self.courses_df.shape}")
            # Guard the sample row: iloc[0] raises IndexError on an empty frame.
            if len(self.courses_df) > 0:
                print("\nFirst row sample:")
                print(self.courses_df.iloc[0])
            print("\n" + "="*80 + "\n")

        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            stop_words='english',
            min_df=1  # keep terms that occur in at least one document
        )
        self._preprocess_courses()

    def _preprocess_courses(self):
        """
        Build the `skills_clean` text column and fit the TF-IDF vectorizer.

        Rows whose cleaned text is empty are dropped and the index is reset.

        Raises:
            ValueError: If no text column can be found, or if no course has
                non-empty text after cleaning.
        """
        # Look for a skills-like column, most specific names first.
        skill_column = None
        possible_names = ['skills', 'Skills', 'skill', 'Skill', 'course_skills',
                          'Course Skills', 'what_you_will_learn', 'description',
                          'course_description', 'learn']

        for col in possible_names:
            if col in self.courses_df.columns:
                skill_column = col
                break

        # Otherwise fall back to any column whose name hints at skills/content.
        # 'descri' covers both "describe" and "description"; str() tolerates
        # non-string column labels.
        if not skill_column:
            for col in self.courses_df.columns:
                col_lower = str(col).lower()
                if 'skill' in col_lower or 'learn' in col_lower or 'descri' in col_lower:
                    skill_column = col
                    break

        if skill_column:
            # Coerce everything to lower-cased, stripped strings; NaN becomes ''.
            self.courses_df['skills_clean'] = (
                self.courses_df[skill_column]
                .fillna('')
                .astype(str)
                .str.lower()
                .str.strip()
            )
            if self.debug:
                print(f"‚úÖ Using column: '{skill_column}'")
                print(f"Sample values:\n{self.courses_df['skills_clean'].head(3)}\n")
        else:
            # No skills column: combine the text columns among the first five.
            print("‚ö†Ô∏è  Warning: No clear 'skills' column found.")
            print(f"Available columns: {self.courses_df.columns.tolist()}")

            text_cols = [
                col for col in self.courses_df.columns[:5]
                # Accept object columns as well as pandas' dedicated string dtypes.
                if pd.api.types.is_object_dtype(self.courses_df[col])
                or pd.api.types.is_string_dtype(self.courses_df[col])
            ]

            if not text_cols:
                # Without this guard the filter below fails with an opaque
                # KeyError on the never-created 'skills_clean' column.
                raise ValueError(
                    "No text columns found to build course skill text from."
                )

            self.courses_df['skills_clean'] = (
                self.courses_df[text_cols].fillna('')
                .astype(str)
                .agg(' '.join, axis=1)
                .str.lower()
                .str.strip()
            )
            print(f"Using combined columns: {text_cols}")

        # Drop rows with empty skill text.
        original_count = len(self.courses_df)
        self.courses_df = self.courses_df[self.courses_df['skills_clean'] != ''].reset_index(drop=True)

        print(f"‚úÖ Loaded {len(self.courses_df)} courses (removed {original_count - len(self.courses_df)} empty rows)")

        # Fit the vectorizer on the full set of course texts.
        if len(self.courses_df) > 0:
            self.course_vectors = self.vectorizer.fit_transform(self.courses_df['skills_clean'])
        else:
            raise ValueError("‚ùå No valid courses found in dataset!")

    def normalize_skills(self, skills_list):
        """
        Map free-form skill names onto canonical lower-case skill names.

        A skill is folded into a canonical name when one of that name's
        variants appears in it as a whole token (delimited by non-alphanumeric
        characters). Unrecognized skills are kept lower-cased and stripped.
        Empty entries are skipped and duplicates are removed; first-seen order
        is preserved.

        Args:
            skills_list: Iterable of skill strings.

        Returns:
            List of unique normalized skill names.
        """
        normalized = []

        for skill in skills_list:
            skill_lower = skill.lower().strip()
            if not skill_lower:
                continue

            canonical_name = skill_lower
            for standard_skill, pattern in self._SKILL_PATTERNS:
                if pattern.search(skill_lower):
                    canonical_name = standard_skill
                    break

            if canonical_name not in normalized:
                normalized.append(canonical_name)

        return normalized

    def find_skill_gaps(self, user_skills, required_skills):
        """
        Return the required skills the user does not have.

        Both lists are normalized first. The result follows the order of
        `required_skills` so that the query built from it (and therefore the
        bigram features and the recommendations) is reproducible across runs.

        Args:
            user_skills: Iterable of skills the user already has.
            required_skills: Iterable of skills that are required.

        Returns:
            List of normalized required skills missing from the user's skills.
        """
        user_skills_norm = set(self.normalize_skills(user_skills))
        required_skills_norm = self.normalize_skills(required_skills)

        return [skill for skill in required_skills_norm if skill not in user_skills_norm]

    def recommend_courses(self, missing_skills, top_n=5):
        """
        Recommend the courses most similar to the missing skills.

        Args:
            missing_skills: List of skill strings to cover.
            top_n: Maximum number of courses to return.

        Returns:
            DataFrame with the selected course rows plus `relevance_score`
            (cosine similarity) and `missing_skills_matched` (the missing skills
            found in the course text). Empty DataFrame when there is nothing to
            recommend (no missing skills or a non-positive `top_n`).
        """
        # A non-positive top_n must not fall through: `[-0:]` would select
        # every course instead of none.
        if not missing_skills or top_n <= 0:
            return pd.DataFrame()

        # Build a single text query from the missing skills.
        query = ' '.join(missing_skills)

        if self.debug:
            print(f"\nüîç Query: {query}")

        query_vector = self.vectorizer.transform([query])

        similarities = cosine_similarity(query_vector, self.course_vectors).flatten()

        if self.debug:
            print(f"Similarity range: {similarities.min():.4f} to {similarities.max():.4f}")
            print(f"Non-zero similarities: {(similarities > 0).sum()}")

        # Indices of the top_n highest similarities, best first.
        top_indices = similarities.argsort()[-top_n:][::-1]

        recommendations = self.courses_df.iloc[top_indices].copy()
        recommendations['relevance_score'] = similarities[top_indices]

        # Course text is lower-cased, so compare against lower-cased skills
        # while reporting the skill as the caller spelled it.
        skill_needles = [(skill, str(skill).lower()) for skill in missing_skills]
        recommendations['missing_skills_matched'] = recommendations['skills_clean'].apply(
            lambda text: [skill for skill, needle in skill_needles if needle in str(text).lower()]
        )

        return recommendations

    def generate_report(self, user_skills, required_skills, top_n=5):
        """
        Print a full skill-gap report and return the recommended courses.

        Args:
            user_skills: Iterable of skills the user already has.
            required_skills: Iterable of skills that are required.
            top_n: Maximum number of courses to recommend.

        Returns:
            The recommendations DataFrame from `recommend_courses`, or None
            when no required skill is missing.
        """
        print("=" * 80)
        print("SKILL GAP ANALYSIS REPORT")
        print("=" * 80)

        # 1. Current skills
        print("\nüìä YOUR CURRENT SKILLS:")
        for skill in user_skills:
            print(f"  ‚úì {skill}")

        # 2. Required skills
        print("\nüéØ REQUIRED SKILLS:")
        for skill in required_skills:
            print(f"  ‚Ä¢ {skill}")

        # 3. Skill gaps
        missing_skills = self.find_skill_gaps(user_skills, required_skills)

        print("\n‚ö†Ô∏è  MISSING SKILLS:")
        if missing_skills:
            for skill in missing_skills:
                print(f"  ‚úó {skill}")
        else:
            print("  üéâ Congratulations! You have all required skills!")
            return None

        # 4. Course recommendations
        recommendations = self.recommend_courses(missing_skills, top_n)

        print(f"\nüìö TOP {top_n} RECOMMENDED COURSES:")
        print("-" * 80)

        # Locate the course-name column; fall back to the first column.
        name_columns = ['course_name', 'Course Name', 'name', 'Name', 'title', 'Title', 'course_title']
        course_name_col = None

        for col in name_columns:
            if col in recommendations.columns:
                course_name_col = col
                break

        if not course_name_col:
            course_name_col = recommendations.columns[0]

        # Locate the URL column, if the dataset has one.
        url_columns = ['course_url', 'Course URL', 'url', 'URL', 'link', 'Link']
        url_col = None
        for col in url_columns:
            if col in recommendations.columns:
                url_col = col
                break

        for idx, (_, row) in enumerate(recommendations.iterrows(), 1):
            course_name = row.get(course_name_col, 'Unknown Course')
            print(f"\n{idx}. {course_name}")
            print(f"   Relevance Score: {row['relevance_score']:.3f}")

            if row['missing_skills_matched']:
                print(f"   Skills Covered: {', '.join(row['missing_skills_matched'])}")
            else:
                print(f"   Skills Covered: (check course details)")

            if url_col and pd.notna(row.get(url_col)):
                print(f"   URL: {row[url_col]}")

        print("\n" + "=" * 80)

        return recommendations


# ===== EXAMPLE USAGE =====
if __name__ == "__main__":
    # Step 1: build the analyzer from the Kaggle dataset, with debug output on.
    analyzer = SkillGapAnalyzer(
        '/kaggle/input/coursera-course-dataset/coursea_data.csv',
        debug=True,
    )

    # Step 2: skills the user already has vs. skills the target role needs.
    user_skills = ['Python', 'HTML', 'CSS', 'SQL']
    required_skills = [
        'Python', 'Machine Learning', 'Deep Learning', 'SQL',
        'AWS', 'Docker', 'React',
    ]

    # Step 3: print the report; the result is None when nothing is missing.
    recommendations = analyzer.generate_report(
        user_skills,
        required_skills,
        top_n=5,
    )

    # Step 4: persist the recommendations, if there are any, as CSV.
    if recommendations is not None:
        recommendations.to_csv('recommended_courses.csv', index=False)
        print("\n‚úÖ Recommendations saved to 'recommended_courses.csv'")

üìä Dataset Info:
Columns: ['Unnamed: 0', 'course_title', 'course_organization', 'course_Certificate_type', 'course_rating', 'course_difficulty', 'course_students_enrolled']
Shape: (891, 7)

First row sample:
Unnamed: 0                                                                134
course_title                (ISC)¬≤ Systems Security Certified Practitioner...
course_organization                                                    (ISC)¬≤
course_Certificate_type                                        SPECIALIZATION
course_rating                                                             4.7
course_difficulty                                                    Beginner
course_students_enrolled                                                 5.3k
Name: 0, dtype: object


Available columns: ['Unnamed: 0', 'course_title', 'course_organization', 'course_Certificate_type', 'course_rating', 'course_difficulty', 'course_students_enrolled']
Using combined columns: ['course_title', 'course_or