In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import joblib
from collections import Counter

# Define the clean_skills function before loading joblib
def clean_skills(skill_text):
    """Split a raw skills cell into a de-duplicated list of skill strings.

    Each line of the cell is handled separately: a leading ``label:`` prefix
    (e.g. "Programming language:") is removed and the remainder is split on
    commas. Slash-separated groups such as ``Java/Python/C++`` are kept
    together as a single entry.

    Args:
        skill_text: Raw cell value; it is converted with ``str()`` before
            parsing. Missing values (``None``/``NaN``) are accepted.

    Returns:
        list[str]: Unique, whitespace-stripped skills in first-seen order,
        or an empty list for a missing value.
    """
    if pd.isna(skill_text):
        return []

    skills = []
    for line in str(skill_text).split('\n'):
        # Drop everything up to and including the first colon on the line
        # (the label); lines without a colon are left untouched.
        line = re.sub(r'^[^:]+:', '', line).strip()
        # Only commas separate entries, so "Java/Python" stays one group.
        for skill_group in line.split(','):
            skill_group = skill_group.strip()
            if skill_group:
                skills.append(skill_group)

    # dict.fromkeys removes duplicates while keeping insertion order.
    # list(set(...)) would return a different order from run to run because
    # string hashing is randomized, making downstream results unrepeatable.
    return list(dict.fromkeys(skills))

# ---- Data loading and feature preparation ----
# Column labels are whitespace-stripped so that lookups such as
# 'SELECTED NO. OF STUDENTS' work even when the sheet header is padded.
df = pd.read_excel('PlacementDataset.xlsx', sheet_name='CLG DRIVES')
df.columns = df.columns.str.strip()

# One list of skills per row, plus a space-joined text form for the vectorizer
df['cleaned_skills'] = df['skills'].apply(clean_skills)
df['skills_text'] = df['cleaned_skills'].apply(' '.join)

# ---- Target variable ----
# A row counts as successful when its selected/participants ratio is
# strictly above the median ratio over all rows.
selection_rate = df['SELECTED NO. OF STUDENTS'] / df['NO.OF PARTICIPANTS']
median_selection_rate = selection_rate.median()
df['is_successful'] = selection_rate > median_selection_rate

# ---- Model training ----
# Bag-of-words counts limited to 150 features, fed to a multinomial Naive Bayes
vectorizer = CountVectorizer(max_features=150)
X = vectorizer.fit_transform(df['skills_text'])
y = df['is_successful']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB().fit(X_train, y_train)

# ---- Evaluation ----
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

# ---- Persist everything needed to score new inputs ----
model_package = {
    'model': model,
    'vectorizer': vectorizer,
    'median_selection_rate': median_selection_rate,
}
joblib.dump(model_package, 'naive_bayes_predictor.pkl')

# ---- Reload from disk so the code below uses the saved artifacts ----
model_assets = joblib.load('naive_bayes_predictor.pkl')
model, vectorizer, median_selection_rate = (
    model_assets[key] for key in ('model', 'vectorizer', 'median_selection_rate')
)

def get_top_skills(df_filtered=None):
    """Return up to ten of the most frequent skills among successful rows.

    Args:
        df_filtered: Optional DataFrame to analyse instead of the module-level
            ``df``. It must provide a boolean ``is_successful`` column and a
            ``cleaned_skills`` column holding a list of skills per row.

    Returns:
        list[str]: Skill names ordered from most to least frequent.
    """
    source = df if df_filtered is None else df_filtered
    successful_rows = source[source['is_successful']]

    # Tally row by row; counting order matches a single flattened list
    tally = Counter()
    for row_skills in successful_rows['cleaned_skills']:
        tally.update(row_skills)

    return [name for name, _ in tally.most_common(10)]

def get_skills_gap(user_skills, required_skills):
    """Return the required skills (or skill groups) the user does not cover.

    A requirement written as a slash-separated group (e.g. ``Java/Python``)
    counts as satisfied when the user has at least one member of the group.
    Comparison ignores case and surrounding whitespace on both sides, and a
    user skill that is itself a slash-separated group contributes each of its
    members.

    Args:
        user_skills: Iterable of skill strings the user has.
        required_skills: Iterable of required skill strings.

    Returns:
        list[str]: The entries of ``required_skills`` that are not covered,
        in their original order and spelling.
    """
    # Normalize the user's skills once into a set: lowercased, stripped and
    # expanded on '/', so " python " or "Java/Python" match as expected.
    known = {
        part.strip()
        for skill in user_skills
        for part in skill.lower().split('/')
        if part.strip()
    }

    missing = []
    for req_skill in required_skills:
        alternatives = {
            part.strip().lower()
            for part in req_skill.split('/')
            if part.strip()
        }
        # The requirement is unmet only when none of its alternatives is known
        if known.isdisjoint(alternatives):
            missing.append(req_skill)
    return missing

def predict_job_readiness(user_skills, company=None, branch=None):
    """Give a YES/NO readiness verdict for a set of skills, with details.

    The placement records in the module-level ``df`` are optionally narrowed
    to one company and/or branch, the most common skills of the successful
    rows are collected, and the verdict is YES only when the model probability
    exceeds ``median_selection_rate`` and none of those skills is missing.

    Args:
        user_skills: List of skill strings the user has.
        company: Optional company name, compared with ``COMPANY NAME``
            ignoring case and surrounding whitespace.
        branch: Optional text that must occur literally (case-sensitive) in
            ``ELIGIBLE BRANCH``.

    Returns:
        dict: ``{"error": "No matching records found"}`` when no record
        matches the filters. Otherwise a dict with the keys ``verdict``
        ("YES"/"NO"), ``probability``, ``threshold``, ``missing_skills``,
        ``top_required_skills`` and ``explanation``.
    """
    # 1. Filter data. Boolean indexing returns new frames and nothing below
    #    mutates the data, so no defensive copy of df is needed.
    target_df = df
    if company:
        company_key = company.strip().upper()
        target_df = target_df[target_df['COMPANY NAME'].str.strip().str.upper() == company_key]
    if branch:
        # regex=False: the branch is matched as plain text. As a regex, a name
        # such as "C++" raises re.error and "(IT)" or "E&TC." match wrongly.
        target_df = target_df[target_df['ELIGIBLE BRANCH'].str.contains(branch, na=False, regex=False)]

    if len(target_df) == 0:
        return {"error": "No matching records found"}

    # 2. Get requirements
    # NOTE(review): if the filtered rows contain no successful drive,
    # top_skills is empty, nothing can be "missing", and the verdict then
    # rests on the probability alone.
    top_skills = get_top_skills(target_df)
    missing_skills = get_skills_gap(user_skills, top_skills)

    # 3. Strict Yes/No decision
    user_features = vectorizer.transform([' '.join(user_skills)])
    # Column 1 of predict_proba is read as the "successful" class.
    # NOTE(review): assumes both classes were present during training.
    proba = model.predict_proba(user_features)[0][1]
    # NOTE(review): proba is a model class probability while the threshold is
    # a median selected/participants ratio; confirm this comparison is intended.
    is_ready = bool(proba > median_selection_rate) and not missing_skills

    # 4. Prepare output
    result = {
        'verdict': "YES" if is_ready else "NO",
        'probability': float(proba),
        'threshold': float(median_selection_rate),
        'missing_skills': missing_skills,
        'top_required_skills': top_skills
    }

    # 5. Generate explanation
    if is_ready:
        result['explanation'] = "Fully meets all skill requirements with high confidence"
    else:
        reasons = []
        if proba <= median_selection_rate:
            reasons.append(f"Probability ({proba:.0%}) below threshold ({median_selection_rate:.0%})")
        if missing_skills:
            reasons.append(f"Missing {len(missing_skills)} critical skills")
        result['explanation'] = " | ".join(reasons)

    return result

# Main execution
if __name__ == "__main__":
    # Sample run: two skills checked against a single company's records
    user_skills = ['Python', 'SQL']
    result = predict_job_readiness(user_skills, company="MEDIA.NET")

    if 'error' not in result:
        # Verdict first, then the reason behind it
        print(f"\nJOB READY? {result['verdict']}")
        print(f"🔍 Why: {result['explanation']}")

        if result['verdict'] == "NO":
            # Both lists are printed the same way: heading, then numbered items
            sections = (
                ("\n🔴 Missing Skills:", 'missing_skills'),
                ("\n💡 Top Required Skills:", 'top_required_skills'),
            )
            for heading, result_key in sections:
                print(heading)
                for rank, skill in enumerate(result[result_key], 1):
                    print(f"{rank}. {skill}")
    else:
        print(f"⚠️ {result['error']}")


Training Accuracy: 0.76
Test Accuracy: 0.45

JOB READY? YES
🔍 Why: Fully meets all skill requirements with high confidence


In [60]:
import pandas as pd

# Read the workbook without a sheet_name, i.e. whichever sheet pandas picks by default
excel_path = "PlacementDataset.xlsx"
df = pd.read_excel(excel_path)

# Show the header labels exactly as pandas read them
print(f"🔍 Actual column names in Excel: {df.columns}")

🔍 Actual column names in Excel: Index(['SR NO.', 'COMPANY NAME', 'CATEGORY', 'JOB ROLE', 'PACKAGE',
       'ELIGIBLE BRANCH', 'NO.OF REGISTRATION', 'NO.OF PARTICIPANTS', 'skills',
       ' SELECTED NO. OF STUDENTS'],
      dtype='object')


In [62]:
import pandas as pd
import json

# Export the placement sheet as a JSON array of records.
# NOTE(review): this rebinds the notebook-global `df`; functions defined in
# earlier cells that read `df` (e.g. predict_job_readiness) will see this
# frame if they are called after this cell runs.
excel_path = "PlacementDataset.xlsx"
df = pd.read_excel(excel_path)

# Rename the two columns to the keys used in the exported JSON
df = df.rename(columns={"COMPANY NAME": "COMPANY_NAME", "skills": "CLEANED_SKILLS"})

# Turn each comma-separated skills string into a list. Every entry is
# stripped and empty entries are dropped, so "Java, Python" no longer yields
# " Python". Non-string cells (e.g. NaN) become an empty list.
df["CLEANED_SKILLS"] = df["CLEANED_SKILLS"].apply(
    lambda x: [part.strip() for part in x.split(",") if part.strip()]
    if isinstance(x, str)
    else []
)

# Save to JSON
# NOTE(review): absolute, machine-specific path; consider making it
# configurable or relative to the project root.
json_path = "/Users/sayyednigar/PLacement-TE/backend/placement_prediction.json"
df.to_json(json_path, orient="records", indent=4)

print(f"✅ JSON file saved at {json_path}")


✅ JSON file saved at /Users/sayyednigar/PLacement-TE/backend/placement_prediction.json
