In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [23]:
# Students
# Toy student profiles. Each record is one student; set-valued fields
# (Skills, Location_Preferences, Sector_Interests) hold every value that
# applies to the student, scalar fields hold a single value.
students_data = [
    dict(
        Student_ID='S1',
        Name='Alice',
        Skills={'Python', 'ML', 'SQL'},
        Qualification='B.Tech',
        Location_Preferences={'Bangalore', 'Delhi'},
        Sector_Interests={'AI', 'Data Science'},
        Social_Category='General',
        Past_Internship_Participation=False,
        Preferred_Job_Type='Full-time',
        Experience=0,
    ),
    dict(
        Student_ID='S2',
        Name='Bob',
        Skills={'Java', 'SQL'},
        Qualification='B.Sc',
        Location_Preferences={'Mumbai'},
        Sector_Interests={'Web', 'Software'},
        Social_Category='SC',
        Past_Internship_Participation=True,
        Preferred_Job_Type='Part-time',
        Experience=1,
    ),
]

# One row per student, one column per profile field (in the order given above).
students_df = pd.DataFrame(students_data)

# Internships
# Toy internship postings. Each record is one posting; set-valued fields
# (Cities, States, Skills_Required, Qualification_Required,
# Diversity_Preferences, Tags) list every accepted value for that posting.
internships_data = [
    dict(
        Job_ID='J1',
        Job_Title='AI Intern',
        Job_Type='Full-time',
        Company_Name='TechCorp',
        Sector='AI',
        Posted_Date='2025-09-01',
        Cities={'Bangalore'},
        States={'Karnataka'},
        Skills_Required={'Python', 'ML'},
        Qualification_Required={'B.Tech', 'M.Tech'},
        Experience_Required=0,
        Stipend=20000,
        Duration='3 months',
        Number_of_Openings=2,
        Actively_Hiring=True,
        Description='Work on AI projects',
        Links='https://techcorp.com/ai-internship',
        Diversity_Preferences={'SC', 'ST', 'OBC'},
        Past_Participation=False,
        Tags={'AI', 'ML'},
    ),
    dict(
        Job_ID='J2',
        Job_Title='Java Developer Intern',
        Job_Type='Part-time',
        Company_Name='WebSolutions',
        Sector='Web',
        Posted_Date='2025-09-05',
        Cities={'Mumbai'},
        States={'Maharashtra'},
        Skills_Required={'Java', 'SQL'},
        Qualification_Required={'B.Sc', 'B.Tech'},
        Experience_Required=0,
        Stipend=15000,
        Duration='2 months',
        Number_of_Openings=1,
        Actively_Hiring=True,
        Description='Web development projects',
        Links='https://websolutions.com/java-internship',
        Diversity_Preferences={'General', 'SC'},
        Past_Participation=False,
        Tags={'Java', 'Web'},
    ),
    dict(
        Job_ID='J3',
        Job_Title='Data Science Intern',
        Job_Type='Full-time',
        Company_Name='DataCorp',
        Sector='Data Science',
        Posted_Date='2025-09-10',
        Cities={'Delhi'},
        States={'Delhi'},
        Skills_Required={'Python', 'SQL'},
        Qualification_Required={'B.Tech', 'M.Tech'},
        Experience_Required=0,
        Stipend=18000,
        Duration='4 months',
        Number_of_Openings=3,
        Actively_Hiring=True,
        Description='Data analysis and ML projects',
        Links='https://datacorp.com/ds-internship',
        Diversity_Preferences={'General', 'OBC'},
        Past_Participation=False,
        Tags={'Data', 'ML'},
    ),
]

# One row per posting, one column per posting field (in the order given above).
internships_df = pd.DataFrame(internships_data)


In [24]:
# Build every (student, internship) candidate pair: one row per combination,
# carrying all student columns followed by all internship columns.
# A native cross join is used instead of merging on a dummy 'key' column so
# that students_df and internships_df are not modified as a side effect.
pair_df = pd.merge(students_df, internships_df, how='cross')


def compute_features(row):
    """Derive the matching features for one (student, internship) pair.

    Parameters
    ----------
    row : mapping-like (e.g. a row of ``pair_df``)
        Must provide the student fields 'Skills', 'Location_Preferences',
        'Sector_Interests', 'Qualification', 'Past_Internship_Participation'
        and 'Social_Category', plus the internship fields 'Skills_Required',
        'Cities', 'Sector', 'Qualification_Required' and
        'Diversity_Preferences'. 'Skills', 'Location_Preferences' and
        'Sector_Interests' must be sets.

    Returns
    -------
    pd.Series
        Six integer features, in this order:
        'Skill_overlap' (number of required skills the student has),
        'Location_match', 'Sector_match', 'Qualification_match',
        'Past_participation' and 'Diversity_match' (each 0 or 1).
    """
    shared_skills = row['Skills'].intersection(row['Skills_Required'])

    features = {
        # Count of skills common to the student and the posting.
        'Skill_overlap': len(shared_skills),
        # 1 when at least one preferred location is among the posting's cities.
        'Location_match': int(not row['Location_Preferences'].isdisjoint(row['Cities'])),
        # 1 when the posting's sector is one of the student's interests.
        'Sector_match': int(not row['Sector_Interests'].isdisjoint({row['Sector']})),
        # 1 when the student's qualification is accepted by the posting.
        'Qualification_match': int(row['Qualification'] in row['Qualification_Required']),
        # Note the inversion: 1 means the student has NOT done an internship before.
        'Past_participation': 0 if row['Past_Internship_Participation'] else 1,
        # 1 when the student's social category is listed by the posting.
        'Diversity_match': int(row['Social_Category'] in row['Diversity_Preferences']),
    }
    return pd.Series(features)

# Compute the feature vector for every candidate pair (one call per row),
# then append the resulting feature columns to the right of the pair table.
features_df = pair_df.apply(compute_features, axis='columns')
pair_df = pd.concat((pair_df, features_df), axis='columns')


In [25]:
# Create Labels (for training, example)
# Demonstration-only target: a pair counts as a match (1) when the student has
# at least one required skill AND holds an accepted qualification, else 0.
# Because the label is computed directly from two of the input features, the
# scores below illustrate the pipeline rather than real predictive quality.
has_shared_skill = pair_df['Skill_overlap'].gt(0)
is_qualified = pair_df['Qualification_match'].eq(1)
pair_df['Label'] = (has_shared_skill & is_qualified).astype(int)

# Train Model
feature_columns = [
    'Skill_overlap',
    'Location_match',
    'Sector_match',
    'Qualification_match',
    'Past_participation',
    'Diversity_match',
]
X = pair_df[feature_columns]
y = pair_df['Label']

# Standardise the features, then fit the classifier on the scaled matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

model = LogisticRegression().fit(X_scaled, y)

# Predict Match Probability
# Column 1 of predict_proba is the probability of the positive class (Label == 1).
# The model is scored on the same rows it was fitted on.
match_probability = model.predict_proba(X_scaled)[:, 1]
pair_df['Predicted_Probability'] = match_probability



In [26]:
# Recommend Top 5 Internships per Student
# Order the pairs by student, then by descending predicted probability, and
# keep the first TOP_N rows of each student.
#
# This replaces groupby('Student_ID').apply(lambda x: x.nlargest(...)): that
# form lets apply() operate on the grouping column, which recent pandas
# releases deprecate (with a warning) in favour of excluding the grouping
# column from each group. Once excluded, 'Student_ID' would be missing from
# the result and the column selection below would fail. Sorting and taking
# groupby().head() keeps every column and yields the same rows and order.
TOP_N = 5

top_recommendations = (
    pair_df
    .sort_values(['Student_ID', 'Predicted_Probability'], ascending=[True, False])
    .groupby('Student_ID')
    .head(TOP_N)
    .reset_index(drop=True)
)

# Display student, internship, probability
print(top_recommendations[['Student_ID', 'Job_ID', 'Job_Title', 'Predicted_Probability']])


  Student_ID Job_ID              Job_Title  Predicted_Probability
0         S1     J3    Data Science Intern               0.969333
1         S1     J1              AI Intern               0.948180
2         S1     J2  Java Developer Intern               0.824289
3         S2     J2  Java Developer Intern               0.918407
4         S2     J3    Data Science Intern               0.173297
5         S2     J1              AI Intern               0.166478


  top_recommendations = pair_df.groupby('Student_ID').apply(
