In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re # For basic text cleaning

# Reached only when every import above succeeded.
print("Libraries imported successfully.")

# Sample Student Data: one entry per student. `skills` holds space-separated
# keywords; `interests` holds comma-separated phrases.
student_data = dict(
    student_id=[1, 2, 3, 4, 5],
    skills=[
        'python data-analysis machine-learning pandas numpy tensorflow',  # Student 1: ML focused
        'javascript react html css node.js mongodb',  # Student 2: Web Dev focused
        'python flask sql postgresql docker aws',  # Student 3: Backend/DevOps focused
        'java spring-boot sql microservices hibernate',  # Student 4: Java Backend focused
        'data-analysis sql tableau powerbi communication python',  # Student 5: Data Analyst focused
    ],
    interests=[
        'artificial intelligence, big data',
        'frontend development, user experience',
        'cloud computing, api development',
        'enterprise applications, software architecture',
        'business intelligence, data visualization',
    ],
)
df_students = pd.DataFrame(data=student_data)

# Sample Opportunity Data (Internships/Projects): one entry per opportunity.
# The trailing comments on `required_skills` name the student whose skills
# overlap the most with that listing.
opportunity_data = dict(
    opportunity_id=[101, 102, 103, 104, 105, 106, 107],
    title=[
        'Machine Learning Intern',
        'Frontend Developer Intern',
        'Cloud DevOps Engineer Project',
        'Java Backend Developer Intern',
        'Data Analyst Project',
        'Full Stack Developer Intern (MERN)',
        'AI Research Assistant',
    ],
    description=[
        'Work on developing and deploying machine learning models using Python and TensorFlow.',
        'Build responsive user interfaces using React, HTML, and CSS.',
        'Manage cloud infrastructure on AWS, implement CI/CD pipelines using Docker.',
        'Develop backend services using Java, Spring Boot, and SQL databases.',
        'Analyze datasets, create dashboards using Tableau/PowerBI, and present findings.',
        'Develop end-to-end web applications using MongoDB, Express, React, Node.js.',
        'Assist research team in exploring new AI algorithms, requires strong Python and ML framework knowledge.',
    ],
    required_skills=[
        'python machine-learning tensorflow pandas scikit-learn numpy',  # Overlaps with Student 1
        'react javascript html css git node.js',  # Overlaps with Student 2
        'aws docker kubernetes python linux sql ci-cd',  # Overlaps with Student 3
        'java spring-boot sql hibernate rest-api maven',  # Overlaps with Student 4
        'sql data-analysis tableau powerbi excel python communication',  # Overlaps with Student 5
        'mongodb express react node.js javascript html css git api',  # Overlaps with Student 2
        'python machine-learning deep-learning tensorflow pytorch research',  # Overlaps with Student 1
    ],
)
df_opportunities = pd.DataFrame(data=opportunity_data)

# Preview the first rows of each table (head() shows at most five rows).
print("\n--- Sample Student Data ---", df_students.head(), sep="\n")
print("\n--- Sample Opportunity Data ---", df_opportunities.head(), sep="\n")


def clean_text(text):
    """Normalize a free-text skills string before vectorization.

    The text is lowercased, every character other than ``a-z``, ``0-9``,
    whitespace and hyphen is removed (so ``node.js`` becomes ``nodejs``),
    and runs of whitespace are collapsed to single spaces with the ends
    trimmed.

    Args:
        text: The raw string. Any non-string value (for example ``None`` or
            the float ``NaN`` that pandas uses for a missing cell) is treated
            as empty instead of raising.

    Returns:
        str: The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise fail with AttributeError on .lower().
        return ''
    text = text.lower()
    # Hyphens are kept so compound skills such as 'machine-learning' survive.
    text = re.sub(r'[^a-z0-9\s\-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Normalize the raw skill strings on both sides before vectorizing.
df_students['cleaned_skills'] = df_students['skills'].apply(clean_text)
df_opportunities['cleaned_required_skills'] = df_opportunities['required_skills'].apply(clean_text)

# Treat each whitespace-delimited keyword as exactly one feature.
# TfidfVectorizer's default token_pattern splits on hyphens and drops
# one-character tokens, which would turn 'machine-learning' into
# 'machine' + 'learning' (and make it partially match 'deep-learning'),
# undoing the hyphen preservation done during cleaning.
# stop_words=None because skills are specific keywords, not prose.
tfidf_vectorizer = TfidfVectorizer(stop_words=None, token_pattern=r'\S+')

# Fit on students and opportunities together so both sides are projected
# into the same vocabulary with the same IDF weights.
combined_skills = pd.concat([df_students['cleaned_skills'], df_opportunities['cleaned_required_skills']], ignore_index=True)
tfidf_vectorizer.fit(combined_skills)

# Row i of each matrix corresponds to row i of the source DataFrame.
student_skill_vectors = tfidf_vectorizer.transform(df_students['cleaned_skills'])
opportunity_skill_vectors = tfidf_vectorizer.transform(df_opportunities['cleaned_required_skills'])

print(f"\nShape of Student Skill Vectors: {student_skill_vectors.shape}")
print(f"Shape of Opportunity Skill Vectors: {opportunity_skill_vectors.shape}")
print(f"Number of unique skills (features) identified: {len(tfidf_vectorizer.get_feature_names_out())}")


def get_recommendations(student_index, student_vectors, opportunity_vectors, opportunity_df, top_n=5):
    """Rank opportunities for one student by cosine similarity of skill vectors.

    Args:
        student_index (int): Zero-based row position of the student in
            ``student_vectors``. Negative values are rejected rather than
            being interpreted as "count from the end".
        student_vectors (sparse matrix): TF-IDF vectors, one row per student.
        opportunity_vectors (sparse matrix): TF-IDF vectors, one row per
            opportunity, in the same row order as ``opportunity_df``.
        opportunity_df (pd.DataFrame): Opportunity details. It is not
            modified; the result is built on a copy.
        top_n (int): Maximum number of recommendations to return.

    Returns:
        pd.DataFrame: The ``top_n`` rows of ``opportunity_df`` with an added
        ``similarity_score`` column, sorted by that score in descending
        order. Opportunities with equal scores keep their original relative
        order. An empty DataFrame is returned (after printing an error) when
        ``student_index`` is out of bounds.
    """
    num_students = student_vectors.shape[0]
    if not 0 <= student_index < num_students:
        print(f"Error: student_index {student_index} is out of bounds.")
        return pd.DataFrame()

    # Slice (rather than index) so the result is always a 1 x n_features
    # matrix, which is the 2-D shape cosine_similarity expects.
    student_vector = student_vectors[student_index:student_index + 1]

    # One similarity value per opportunity, flattened to a 1-D array.
    cosine_similarities = cosine_similarity(student_vector, opportunity_vectors).flatten()

    recommendation_df = opportunity_df.copy()
    recommendation_df['similarity_score'] = cosine_similarities

    # A stable sort keeps the ordering of tied scores deterministic.
    recommendation_df = recommendation_df.sort_values(
        by='similarity_score', ascending=False, kind='stable'
    )

    return recommendation_df.head(top_n)


# Student IDs to demonstrate, and how many opportunities to show for each.
student_ids_to_recommend = (1, 2)
top_n_to_show = 3

for student_id_to_recommend in student_ids_to_recommend:
    print(f"\n--- Recommendations for Student ID: {student_id_to_recommend} ---")

    # Resolve the ID to a row *position*. The skill-vector matrix is addressed
    # by position, which matches the DataFrame's index labels only while the
    # DataFrame still has its default 0..n-1 index.
    matching_positions = np.flatnonzero(
        df_students['student_id'].to_numpy() == student_id_to_recommend
    )
    if matching_positions.size == 0:
        # Report and move on instead of failing with IndexError.
        print(f"Student ID {student_id_to_recommend} not found.")
        continue
    student_index = int(matching_positions[0])

    print(f"Student Skills: {df_students['skills'].iloc[student_index]}")

    recommendations = get_recommendations(
        student_index,
        student_skill_vectors,
        opportunity_skill_vectors,
        df_opportunities,
        top_n=top_n_to_show,
    )

    if not recommendations.empty:
        print(f"\nTop {top_n_to_show} Recommended Opportunities:")
        print(recommendations[['opportunity_id', 'title', 'required_skills', 'similarity_score']])
    else:
        print("Could not generate recommendations.")



# Follow-up ideas for evolving this prototype, printed one per line.
next_steps = (
    "1. Use Real Data: Replace sample data with actual student profiles and opportunity listings.",
    "2. Advanced NLP: Integrate Gemini API or more sophisticated NLP (e.g., spaCy for NER) for better skill extraction from descriptions or resumes.",
    "3. Include More Features: Add interests, project descriptions, location preferences, etc., into the matching logic (might require different modeling techniques).",
    "4. Hybrid Models: Combine content-based filtering with collaborative filtering (if interaction data like applications/clicks becomes available).",
    "5. Evaluation: Implement metrics to evaluate recommendation quality (e.g., precision@k, recall@k) using a held-out test set or user feedback.",
    "6. Fine-tuning: Experiment with TF-IDF parameters (min_df, max_df, ngrams) or try other vectorization methods (e.g., Word2Vec, Sentence-BERT).",
    "7. Deployment: Integrate this logic into a backend framework like FastAPI (as mentioned in your plan).",
)
print("\n--- Potential Next Steps ---")
print("\n".join(next_steps))

Libraries imported successfully.

--- Sample Student Data ---
   student_id                                             skills  \
0           1  python data-analysis machine-learning pandas n...   
1           2          javascript react html css node.js mongodb   
2           3             python flask sql postgresql docker aws   
3           4       java spring-boot sql microservices hibernate   
4           5  data-analysis sql tableau powerbi communicatio...   

                                        interests  
0               artificial intelligence, big data  
1           frontend development, user experience  
2                cloud computing, api development  
3  enterprise applications, software architecture  
4       business intelligence, data visualization  

--- Sample Opportunity Data ---
   opportunity_id                          title  \
0             101        Machine Learning Intern   
1             102      Frontend Developer Intern   
2             103  Cloud Dev