In [None]:
import logging
import re
import time
import warnings
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')


class RealTimeJobFetcher:
    """Fetch entry-level job and internship postings from free public job APIs.

    Three sources are queried (Arbeitnow, Remotive and The Muse); none of them
    requires an API key. Every posting is normalised to a flat dict with the
    keys ``source``, ``job_id``, ``company``, ``title``, ``location``,
    ``description``, ``category``, ``url`` and ``posted_date``.

    Fetching is best-effort: a source that cannot be reached, answers with a
    non-200 status or returns a malformed payload is logged and simply
    contributes no postings. Null fields inside a single posting are replaced
    by defaults instead of aborting the whole source.
    """

    ARBEITNOW_URL = "https://www.arbeitnow.com/api/job-board-api"
    REMOTIVE_URL = "https://remotive.com/api/remote-jobs"
    THEMUSE_URL = "https://www.themuse.com/api/public/jobs"

    # Descriptions are truncated to this many characters.
    MAX_DESCRIPTION_CHARS = 1000

    # Shared by Arbeitnow and Remotive so both sources filter identically.
    # 'intern' is anchored on word boundaries so that titles such as
    # "International Sales Manager" or "Internal Auditor" are not mistaken
    # for internships; the other keywords remain plain substring matches.
    _ENTRY_LEVEL_PATTERN = re.compile(
        r'\bintern(?:s|ship|ships)?\b|junior|entry|graduate|trainee',
        re.IGNORECASE,
    )

    _log = logging.getLogger(__name__)

    def __init__(self):
        # Result of the most recent fetch_all() call.
        self.all_jobs = []

    # ------------------------------------------------------------------
    # Small parsing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _text(value, default: str = '') -> str:
        """Return *value* as a string, or *default* when it is ``None``."""
        if value is None:
            return default
        return value if isinstance(value, str) else str(value)

    @classmethod
    def _is_entry_level(cls, title) -> bool:
        """Return True when *title* looks like an internship/entry-level role."""
        return bool(cls._ENTRY_LEVEL_PATTERN.search(cls._text(title)))

    @classmethod
    def _joined_names(cls, items, default: str = '') -> str:
        """Join the ``name`` fields of a list of dicts, or return *default* if empty."""
        if not items:
            return default
        return ', '.join(
            cls._text(item.get('name')) for item in items if isinstance(item, dict)
        )

    @classmethod
    def _get_json(cls, source: str, url: str, params: Optional[Dict] = None,
                  timeout: int = 15) -> Optional[Dict]:
        """GET *url* and return the decoded JSON object, or ``None`` on failure.

        Network errors, non-200 responses and undecodable or non-object
        payloads are logged under *source* and reported as ``None`` so the
        caller can carry on with the remaining sources.
        """
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            cls._log.warning("%s: request failed: %s", source, exc)
            return None

        if response.status_code != 200:
            cls._log.warning("%s: unexpected HTTP status %s", source, response.status_code)
            return None

        try:
            payload = response.json()
        except ValueError as exc:  # body was not valid JSON
            cls._log.warning("%s: invalid JSON response: %s", source, exc)
            return None

        if not isinstance(payload, dict):
            cls._log.warning("%s: unexpected payload type %s", source, type(payload).__name__)
            return None
        return payload

    # ------------------------------------------------------------------
    # Per-source record builders
    # ------------------------------------------------------------------
    @classmethod
    def _arbeitnow_record(cls, job) -> Optional[Dict]:
        """Normalise one Arbeitnow posting; ``None`` if it is not entry-level."""
        if not isinstance(job, dict):
            return None
        title = cls._text(job.get('title'))
        if not cls._is_entry_level(title):
            return None
        return {
            'source': 'Arbeitnow',
            'job_id': f"arbeit_{job.get('slug', '')}",
            'company': cls._text(job.get('company_name'), 'Unknown'),
            'title': title,
            'location': cls._text(job.get('location'), 'Remote'),
            'description': cls._text(job.get('description'))[:cls.MAX_DESCRIPTION_CHARS],
            'category': ', '.join(cls._text(tag) for tag in (job.get('tags') or [])),
            'url': cls._text(job.get('url')),
            'posted_date': job.get('created_at', ''),
        }

    @classmethod
    def _remotive_record(cls, job) -> Optional[Dict]:
        """Normalise one Remotive posting; ``None`` if it is not entry-level."""
        if not isinstance(job, dict):
            return None
        title = cls._text(job.get('title'))
        if not cls._is_entry_level(title):
            return None
        return {
            'source': 'Remotive',
            'job_id': f"remotive_{job.get('id', '')}",
            'company': cls._text(job.get('company_name'), 'Unknown'),
            'title': title,
            'location': 'Remote',
            'description': cls._text(job.get('description'))[:cls.MAX_DESCRIPTION_CHARS],
            'category': cls._text(job.get('category')),
            'url': cls._text(job.get('url')),
            'posted_date': job.get('publication_date', ''),
        }

    @classmethod
    def _themuse_record(cls, job) -> Optional[Dict]:
        """Normalise one posting from The Muse (already filtered server-side)."""
        if not isinstance(job, dict):
            return None
        company = job.get('company')
        refs = job.get('refs')
        return {
            'source': 'TheMuse',
            'job_id': f"muse_{job.get('id')}",
            'company': (
                cls._text(company.get('name'), 'Unknown')
                if isinstance(company, dict) else 'Unknown'
            ),
            'title': cls._text(job.get('name')),
            'location': cls._joined_names(job.get('locations'), default='Remote'),
            'description': cls._text(job.get('contents'))[:cls.MAX_DESCRIPTION_CHARS],
            'category': cls._joined_names(job.get('categories')),
            'url': cls._text(refs.get('landing_page')) if isinstance(refs, dict) else '',
            'posted_date': job.get('publication_date', ''),
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def fetch_arbeitnow_jobs(self) -> List[Dict]:
        """Return entry-level postings from the Arbeitnow job board API.

        Returns an empty list when the source is unavailable.
        """
        payload = self._get_json('Arbeitnow', self.ARBEITNOW_URL)
        if payload is None:
            return []
        records = (self._arbeitnow_record(job) for job in payload.get('data') or [])
        return [record for record in records if record is not None]

    def fetch_remotive_jobs(self) -> List[Dict]:
        """Return entry-level postings from the Remotive remote-jobs API.

        Returns an empty list when the source is unavailable.
        """
        payload = self._get_json('Remotive', self.REMOTIVE_URL)
        if payload is None:
            return []
        records = (self._remotive_record(job) for job in payload.get('jobs') or [])
        return [record for record in records if record is not None]

    def fetch_themuse_internships(self, pages: int = 5) -> List[Dict]:
        """Return postings in The Muse's 'Internship' category.

        Args:
            pages: Maximum number of result pages to request (page numbers
                ``0 .. pages - 1``).

        Paging stops early at the first failed request or empty page; the
        postings collected so far are still returned.
        """
        jobs = []
        for page in range(pages):
            payload = self._get_json(
                'TheMuse',
                self.THEMUSE_URL,
                params={'category': 'Internship', 'page': page, 'descending': 'true'},
                timeout=10,
            )
            if payload is None:
                break

            results = payload.get('results') or []
            if not results:
                break

            records = (self._themuse_record(job) for job in results)
            jobs.extend(record for record in records if record is not None)

            # Short pause between pages to stay polite to the API.
            time.sleep(0.5)

        return jobs

    def fetch_all(self) -> List[Dict]:
        """Fetch from every source, store the result in ``all_jobs`` and return it."""
        all_jobs = []

        all_jobs.extend(self.fetch_arbeitnow_jobs())
        time.sleep(1)

        all_jobs.extend(self.fetch_remotive_jobs())
        time.sleep(1)

        all_jobs.extend(self.fetch_themuse_internships(pages=5))

        self.all_jobs = all_jobs
        return all_jobs


class InternshipRecommender:
    """Content-based internship recommender (TF-IDF + cosine similarity).

    Typical use is ``fetch_and_load_data()`` followed by ``train()`` and then
    any number of ``recommend()`` calls.
    """

    # Columns returned by recommend(), in display order.
    RESULT_COLUMNS = [
        'company', 'title', 'location', 'match_percentage',
        'source', 'url', 'description',
    ]

    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=200,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=1
        )
        # Populated by fetch_and_load_data() and train() respectively.
        self.internships_df = None
        self.internship_vectors = None
        self.fetcher = RealTimeJobFetcher()

    def fetch_and_load_data(self):
        """Fetch postings from all sources and load them into ``internships_df``.

        Adds a lower-cased ``combined_features`` column (title + description +
        category) that ``train()`` vectorises, and fills missing ``location``
        and ``company`` values with ``'Remote'`` / ``'Unknown'``.

        Returns:
            The populated DataFrame.

        Raises:
            ValueError: If no postings could be fetched from any source.
        """
        jobs = self.fetcher.fetch_all()

        if not jobs:
            raise ValueError("Unable to fetch jobs. Please check your internet connection.")

        self.internships_df = pd.DataFrame(jobs)

        self.internships_df['combined_features'] = (
            self.internships_df['title'].fillna('') + ' ' +
            self.internships_df['description'].fillna('') + ' ' +
            self.internships_df['category'].fillna('')
        ).str.lower()

        self.internships_df['location'] = self.internships_df['location'].fillna('Remote')
        self.internships_df['company'] = self.internships_df['company'].fillna('Unknown')

        return self.internships_df

    def train(self):
        """Fit the TF-IDF vectorizer on the loaded postings.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.internships_df is None or self.internships_df.empty:
            raise ValueError("No data available")

        self.internship_vectors = self.vectorizer.fit_transform(
            self.internships_df['combined_features']
        )

    @staticmethod
    def _filter_by_location(postings: pd.DataFrame, location: Optional[str]) -> pd.DataFrame:
        """Keep rows whose ``location`` contains *location*, ignoring case.

        The match is a literal substring test (``regex=False``): the text
        comes straight from the user, so characters such as ``+``, ``(`` or
        ``.`` must not be interpreted as regular-expression syntax. A falsy
        *location* disables the filter.
        """
        if not location:
            return postings
        matches = postings['location'].str.contains(
            location, case=False, na=False, regex=False
        )
        return postings[matches]

    def recommend(
        self,
        skills: List[str],
        interests: List[str],
        top_n: int = 5,
        location: Optional[str] = None
    ) -> pd.DataFrame:
        """Return the *top_n* postings most similar to the user's profile.

        Args:
            skills: Skill keywords; ``None`` is treated as an empty list.
            interests: Interest keywords; ``None`` is treated as an empty list.
            top_n: Maximum number of rows to return.
            location: Optional case-insensitive substring the posting's
                location must contain.

        Returns:
            DataFrame with the columns in ``RESULT_COLUMNS``, ordered by
            descending match score. May be empty when the location filter
            removes every posting.

        Raises:
            ValueError: If ``train()`` has not been called.
        """
        if self.internship_vectors is None:
            raise ValueError("Model not trained")

        profile_terms = list(skills or []) + list(interests or [])
        user_profile = ' '.join(profile_terms).lower()
        user_vector = self.vectorizer.transform([user_profile])

        similarities = cosine_similarity(user_vector, self.internship_vectors).flatten()

        # Work on a copy so repeated calls never mutate the loaded data.
        results = self.internships_df.copy()
        results['match_score'] = similarities
        results['match_percentage'] = (similarities * 100).round(2)

        results = self._filter_by_location(results, location)

        top_results = results.nlargest(top_n, 'match_score')

        return top_results[self.RESULT_COLUMNS]


def main():
    """Run the interactive command-line recommendation session.

    Prompts for skills, interests, an optional location and the number of
    recommendations (clamped to 3-5), then fetches postings, trains the
    recommender and prints the best matches. Any error raised while fetching
    or recommending is reported on stdout instead of propagating.
    """
    print("\n" + "="*60)
    print("INTERNSHIP RECOMMENDATION SYSTEM")
    print("="*60 + "\n")

    # Get user input
    print("Enter your skills (comma-separated):")
    print("Example: python, java, data analysis, marketing")
    skills_input = input("> ").strip()
    # `or` also covers input made only of separators (e.g. ", ,"), which
    # would otherwise produce an empty profile.
    skills = [s.strip() for s in skills_input.split(',') if s.strip()] or ['python']

    print("\nEnter your interests (comma-separated):")
    print("Example: software development, AI, finance")
    interests_input = input("> ").strip()
    interests = [i.strip() for i in interests_input.split(',') if i.strip()] or ['technology']

    print("\nEnter preferred location (Indian cities):")
    print("Example: Bangalore, Mumbai, Delhi, Hyderabad, Pune, Chennai")
    print("Or type 'Remote' or leave blank for all locations")
    location = input("> ").strip() or None

    print("\nHow many recommendations? (3-5)")
    num_input = input("> ").strip()
    try:
        # Clamp to the advertised 3-5 range.
        num_recs = max(3, min(int(num_input), 5))
    except ValueError:
        # Blank or non-numeric answer: fall back to the maximum.
        num_recs = 5

    # Fetch and process
    print("\nFetching internships...")
    recommender = InternshipRecommender()

    try:
        recommender.fetch_and_load_data()
        recommender.train()

        recommendations = recommender.recommend(
            skills=skills,
            interests=interests,
            top_n=num_recs,
            location=location
        )

        if recommendations.empty:
            print("\nNo matching internships found. Try different keywords or remove location filter.\n")
            return

        # Display results
        print("\n" + "="*60)
        print(f"TOP {len(recommendations)} INTERNSHIP RECOMMENDATIONS")
        print("="*60 + "\n")

        for idx, (_, row) in enumerate(recommendations.iterrows(), 1):
            print(f"{idx}. {row['title']}")
            print(f"   Company: {row['company']}")
            print(f"   Location: {row['location']}")
            print(f"   Match: {row['match_percentage']}%")
            print(f"   Apply: {row['url']}")
            print()

        print("="*60 + "\n")

    except Exception as e:
        # Top-level boundary of the CLI: show the problem rather than a traceback.
        print(f"\nError: {e}\n")


# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


INTERNSHIP RECOMMENDATION SYSTEM

Enter your skills (comma-separated):
Example: python, java, data analysis, marketing
