In [None]:
!pip install pandas nltk




In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("/content/processed_candidate_data.csv")

# Text Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Remove special characters
    text = re.sub(r'\b(present|resume|worded)\b', '', text)  # Remove redundant words
    return text.strip()

df['Experience'] = df['Experience'].apply(clean_text)
df['Education'] = df['Education'].apply(clean_text)
df['Skills'] = df['Skills'].apply(clean_text)

# Feature Engineering
## TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)
experience_tfidf = tfidf.fit_transform(df['Experience']).toarray()
skills_tfidf = tfidf.fit_transform(df['Skills']).toarray()

## BERT Sentence Embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
df['Experience_Embeddings'] = df['Experience'].apply(lambda x: model.encode(x))
df['Skills_Embeddings'] = df['Skills'].apply(lambda x: model.encode(x))

# Encoding Categorical Variables
encoder = OneHotEncoder()
education_encoded = encoder.fit_transform(df[['Highest_Education']]).toarray()

df['Education_Encoded'] = list(education_encoded)

# Compute Cosine Similarity
def get_similarity(emb1, emb2):
    return cosine_similarity([emb1], [emb2])[0][0]

similarity_scores = [get_similarity(exp, skill) for exp, skill in zip(df['Experience_Embeddings'], df['Skills_Embeddings'])]
df['Experience_Skills_Similarity'] = similarity_scores

# Save processed data
df.to_csv("processed_dataset1.csv", index=False)
print("Preprocessing complete! File saved as processed_dataset1.csv")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Preprocessing complete! File saved as processed_dataset1.csv


In [None]:
encoder.categories_


[array(['MBA', 'MD', 'MFA', 'MS', 'Other', 'PhD'], dtype=object)]

In [None]:
tfidf.get_feature_names_out()


array(['3d', 'adobe', 'ads', 'analysis', 'astrophysics', 'autocad',
       'blogging', 'branding', 'cad', 'care', 'civil', 'cloud', 'color',
       'commercial', 'composition', 'computational', 'computing',
       'content', 'copywriting', 'creation', 'cryptography',
       'cybersecurity', 'data', 'deep', 'design', 'development',
       'digital', 'distributed', 'editing', 'emergency', 'engineering',
       'ethical', 'event', 'excel', 'figma', 'financial', 'firewalls',
       'google', 'hacking', 'image', 'investment', 'java', 'javascript',
       'learning', 'lightroom', 'linux', 'machine', 'management',
       'manufacturing', 'marketing', 'mathematical', 'matlab', 'media',
       'medical', 'medicine', 'modeling', 'network', 'optimization',
       'patient', 'photo', 'photography', 'photoshop', 'portrait',
       'probability', 'processes', 'project', 'python', 'pytorch',
       'react', 'research', 'retouching', 'risk', 'science', 'security',
       'seo', 'sketch', 'social', 'so

In [None]:
df[['Experience', 'Skills', 'Experience_Skills_Similarity']].head()


Unnamed: 0,Experience,Skills,Experience_Skills_Similarity
0,senior software engineer at google 2018present...,python c java machine learning distributed sys...,0.19013
1,data scientist at facebook 2019present machine...,data analysis deep learning tensorflow pytorch...,0.27128
2,marketing manager at tesla 2017present digital...,seo digital marketing google ads content strat...,0.283434
3,financial analyst at goldman sachs 2018present...,financial modeling investment analysis risk ma...,0.290488
4,cybersecurity engineer at cisco 2019present se...,cybersecurity network security ethical hacking...,0.418636


In [None]:
import pandas as pd

# Define employer dataset
employer_data = [
    {
        "Job_Title": "Senior WordPress Developer",
        "Company_Name": None,
        "Required_Skills": ["WordPress", "PHP", "HTML", "CSS", "JavaScript", "MySQL", "Git"],
        "Experience_Level": "1+ years",
        "Location": "Greater Noida",
        "Salary_Range": "₹5,00,000 - ₹6,00,000/year",
        "No_of_Openings": 1
    },
    {
        "Job_Title": "Graphic Designer",
        "Company_Name": None,
        "Required_Skills": ["Figma", "Adobe Illustrator", "Photoshop", "InDesign"],
        "Experience_Level": "1+ years",
        "Location": "Noida",
        "Salary_Range": "₹2,00,000 - ₹3,00,000/year",
        "No_of_Openings": 5
    },
    {
        "Job_Title": "Cinematographer",
        "Company_Name": "Bluntly Streaming",
        "Required_Skills": ["Video Editing", "Adobe Premiere Pro", "Photography"],
        "Experience_Level": "0-1 year",
        "Location": None,
        "Salary_Range": "₹2,00,000 - ₹3,00,000/year",
        "No_of_Openings": 2
    },
    {
        "Job_Title": "Financial Sales Officer",
        "Company_Name": None,
        "Required_Skills": ["Banking", "Sales", "Customer Service", "Communication"],
        "Experience_Level": "0-1 year",
        "Location": "Hyderabad",
        "Salary_Range": "₹4,00,000 - ₹4,30,000/year",
        "No_of_Openings": 20
    },
    {
        "Job_Title": "Associate Content Writer",
        "Company_Name": None,
        "Required_Skills": ["Blogging", "SEO", "Digital Marketing", "Social Media"],
        "Experience_Level": None,
        "Location": None,
        "Salary_Range": "₹2,00,000 - ₹4,00,000/year",
        "No_of_Openings": 1
    }
]

# Create DataFrame
df_employers = pd.DataFrame(employer_data)

# Save to CSV
df_employers.to_csv("employer_profiles.csv", index=False)

# Display DataFrame
print(df_employers)


                    Job_Title       Company_Name  \
0  Senior WordPress Developer               None   
1            Graphic Designer               None   
2             Cinematographer  Bluntly Streaming   
3     Financial Sales Officer               None   
4    Associate Content Writer               None   

                                     Required_Skills Experience_Level  \
0  [WordPress, PHP, HTML, CSS, JavaScript, MySQL,...         1+ years   
1    [Figma, Adobe Illustrator, Photoshop, InDesign]         1+ years   
2   [Video Editing, Adobe Premiere Pro, Photography]         0-1 year   
3  [Banking, Sales, Customer Service, Communication]         0-1 year   
4   [Blogging, SEO, Digital Marketing, Social Media]             None   

        Location                Salary_Range  No_of_Openings  
0  Greater Noida  ₹5,00,000 - ₹6,00,000/year               1  
1          Noida  ₹2,00,000 - ₹3,00,000/year               5  
2           None  ₹2,00,000 - ₹3,00,000/year               

In [None]:
import pandas as pd

# Load the employer dataset
df = pd.read_csv("employer_profiles.csv")

# Display the dataset
print(df.head())


                    Job_Title       Company_Name  \
0  Senior WordPress Developer                NaN   
1            Graphic Designer                NaN   
2             Cinematographer  Bluntly Streaming   
3     Financial Sales Officer                NaN   
4    Associate Content Writer                NaN   

                                     Required_Skills Experience_Level  \
0  ['WordPress', 'PHP', 'HTML', 'CSS', 'JavaScrip...         1+ years   
1  ['Figma', 'Adobe Illustrator', 'Photoshop', 'I...         1+ years   
2  ['Video Editing', 'Adobe Premiere Pro', 'Photo...         0-1 year   
3  ['Banking', 'Sales', 'Customer Service', 'Comm...         0-1 year   
4  ['Blogging', 'SEO', 'Digital Marketing', 'Soci...              NaN   

        Location                Salary_Range  No_of_Openings  
0  Greater Noida  ₹5,00,000 - ₹6,00,000/year               1  
1          Noida  ₹2,00,000 - ₹3,00,000/year               5  
2            NaN  ₹2,00,000 - ₹3,00,000/year               

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=100)

# Use df_employers instead of df
job_desc_vectors = tfidf.fit_transform(df_employers['Job_Title'] + " " + df_employers['Required_Skills'].astype(str))

# Convert to array
job_desc_vectors = job_desc_vectors.toarray()

# Print TF-IDF feature shape
print("TF-IDF Matrix Shape:", job_desc_vectors.shape)

TF-IDF Matrix Shape: (5, 38)


In [None]:
!pip install --upgrade torch torchvision torchaudio
!pip install --upgrade transformers sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for job descriptions
df['Job_Description_Embeddings'] = df['Job_Title'].apply(lambda x: model.encode(x))

# Display embeddings for the first job
print(df['Job_Description_Embeddings'].iloc[0])


[-2.92310584e-02  1.95619892e-02  2.00619362e-02 -9.11372527e-03
  8.54548346e-03 -5.70700355e-02 -4.87989448e-02 -1.05980167e-03
 -7.70015121e-02  4.24553007e-02  7.65774678e-03  1.14710487e-01
 -3.83466203e-03 -1.89601928e-02  9.20688082e-03  1.08957373e-01
 -7.93884769e-02 -1.92994699e-02 -1.03915492e-02 -8.79531428e-02
 -6.70653433e-02 -2.11091209e-02 -3.48315053e-02 -9.93930083e-03
  2.68426929e-02  2.30772560e-03 -6.80900440e-02  1.88400671e-02
  5.73703460e-02 -1.89764127e-02 -1.01767518e-02  2.10754108e-03
  1.02032080e-01  4.12834808e-02 -3.24291512e-02  7.39795864e-02
  5.50418422e-02 -4.88193221e-02  4.06972580e-02  1.11442842e-02
 -1.16344139e-01 -6.63891854e-03 -6.86085075e-02 -1.38986679e-02
  1.95646062e-02 -1.15637578e-01 -8.24642275e-03 -7.63947219e-02
 -1.96798444e-02 -1.56253763e-02  3.73830236e-02 -1.29071638e-01
  5.29656000e-03 -4.81227189e-02 -6.72207847e-02 -3.18651237e-02
  2.52709500e-02 -9.67997685e-03  6.72248658e-03 -9.44394469e-02
 -6.91376105e-02  1.63962

In [None]:
import numpy as np

# Convert embeddings into a NumPy array and save
np.save("tfidf_employer_vectors.npy", job_desc_vectors)
df.to_csv("vectorized_employer_profiles.csv", index=False)


In [None]:
import pandas as pd
import numpy as np

# Load processed candidate dataset
df_candidates = pd.read_csv("processed_dataset1.csv")

# Check column names
print(df_candidates.columns)


Index(['ID', 'Experience', 'Education', 'Skills', 'Total_Experience_Years',
       'Highest_Education', 'Number_of_Skills', 'Experience_Embeddings',
       'Skills_Embeddings', 'Education_Encoded',
       'Experience_Skills_Similarity'],
      dtype='object')


In [None]:
import numpy as np

# Convert stored embeddings from space-separated string to NumPy arrays
def fix_embedding_format(embedding_str):
    try:
        return np.array([float(num) for num in embedding_str.strip('[]').split()])
    except:
        return np.nan  # Handle errors gracefully

df_candidates["Experience_Embeddings"] = df_candidates["Experience_Embeddings"].apply(fix_embedding_format)
df_candidates["Skills_Embeddings"] = df_candidates["Skills_Embeddings"].apply(fix_embedding_format)



In [None]:
print(df_candidates["Experience_Embeddings"].head(3))
print(df_candidates["Skills_Embeddings"].head(3))


0    [-0.0845896527, 0.0413818918, 0.0990628302, -0...
1    [-0.0711975247, 0.0238496922, -0.0124800066, 0...
2    [-0.0551201664, -0.0644753352, -0.0379419476, ...
Name: Experience_Embeddings, dtype: object
0    [-0.0835608169, -0.01966618, -0.0136221759, 0....
1    [-0.0493820086, -0.0978562236, 0.0182174351, 0...
2    [-0.0556748062, 0.0138439676, 0.00485280808, -...
Name: Skills_Embeddings, dtype: object


In [None]:
experience_vectors = np.vstack(df_candidates["Experience_Embeddings"].dropna().values)
skills_vectors = np.vstack(df_candidates["Skills_Embeddings"].dropna().values)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load processed candidate dataset
df_candidates = pd.read_csv("processed_dataset1.csv")

# Convert candidate embeddings back to NumPy arrays
df_candidates["Experience_Embeddings"] = df_candidates["Experience_Embeddings"].apply(lambda x: np.array([float(num) for num in x.strip('[]').split()]) if isinstance(x, str) else np.nan)
df_candidates["Skills_Embeddings"] = df_candidates["Skills_Embeddings"].apply(lambda x: np.array([float(num) for num in x.strip('[]').split()]) if isinstance(x, str) else np.nan)

# Load employer job dataset (replace with actual path)
df_jobs = pd.read_csv("/content/vectorized_employer_profiles.csv")

# Convert job description embeddings back to NumPy arrays
df_jobs["Job_Description_Embeddings"] = df_jobs["Job_Description_Embeddings"].apply(lambda x: np.array([float(num) for num in x.strip('[]').split()]) if isinstance(x, str) else np.nan)


In [None]:
# Function to compute similarity between candidate and job
def get_similarity(cand_emb, job_emb):
    if cand_emb is not None and job_emb is not None:
        return cosine_similarity([cand_emb], [job_emb])[0][0]
    return 0  # If any embedding is missing, return 0 similarity

# Create a list to store matches
matches = []

# Iterate over job postings
for job_idx, job_row in df_jobs.iterrows():
    job_embedding = job_row["Job_Description_Embeddings"]

    for cand_idx, cand_row in df_candidates.iterrows():
        exp_similarity = get_similarity(cand_row["Experience_Embeddings"], job_embedding)
        skills_similarity = get_similarity(cand_row["Skills_Embeddings"], job_embedding)

        # Final score (weighted combination)
        final_score = (0.6 * exp_similarity) + (0.4 * skills_similarity)  # Adjust weights if needed

        matches.append({
            "Job_Title": job_row["Job_Title"],
            "Candidate_ID": cand_row["ID"],  # Replace with actual identifier
            "Experience_Similarity": exp_similarity,
            "Skills_Similarity": skills_similarity,
            "Final_Score": final_score
        })

# Convert to DataFrame
df_matches = pd.DataFrame(matches)

# Sort candidates for each job by highest similarity score
df_matches = df_matches.sort_values(by=["Job_Title", "Final_Score"], ascending=[True, False])

# Save the results
df_matches.to_csv("candidate_job_matches.csv", index=False)
print("✅ Candidate-job matching completed! Results saved as candidate_job_matches.csv")


✅ Candidate-job matching completed! Results saved as candidate_job_matches.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load datasets
candidates_df = pd.read_csv("processed_dataset1.csv")
job_matches_df = pd.read_csv("candidate_job_matches.csv")
jobs_df = pd.read_csv("vectorized_employer_profiles.csv")

# Function to calculate Education Match
def education_match(candidate_edu, job_edu):
    education_levels = {"High School": 1, "Associate": 2, "Bachelor": 3, "Master": 4, "PhD": 5}
    candidate_level = education_levels.get(str(candidate_edu).strip(), 0)
    job_level = education_levels.get(str(job_edu).strip(), 0)
    return 1 if candidate_level >= job_level else 0

# Function to calculate Work Location Match
def location_match(candidate_location, job_location):
    if pd.isna(candidate_location) or pd.isna(job_location):
        return 0  # No match if any value is missing
    return 1 if str(candidate_location).strip().lower() == str(job_location).strip().lower() else 0

# Function to calculate Salary Match
def salary_match(candidate_salary, job_salary_range):
    if pd.isna(candidate_salary) or pd.isna(job_salary_range):
        return 0  # No match if salary info is missing
    try:
        job_salary_range = job_salary_range.replace("₹", "").replace(",", "").split(" - ")
        job_min_salary, job_max_salary = map(int, job_salary_range)
        return 1 if job_min_salary <= int(candidate_salary) <= job_max_salary else 0
    except ValueError:
        return 0  # If parsing fails, assume no match

# Normalize scores
def normalize_scores(df, columns):
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Merge job and candidate data
final_matches = job_matches_df.merge(candidates_df, left_on="Candidate_ID", right_on="ID", how="left")
final_matches = final_matches.merge(jobs_df, on="Job_Title", how="left")

# Compute additional matching parameters
final_matches["Education_Match"] = final_matches.apply(
    lambda row: education_match(row["Highest_Education"], "Bachelor"), axis=1)  # Placeholder job edu level

final_matches["Work_Location_Match"] = final_matches.apply(
    lambda row: location_match("Delhi", row["Location"]), axis=1)  # Placeholder candidate location

final_matches["Salary_Match"] = final_matches.apply(
    lambda row: salary_match(500000, row["Salary_Range"]), axis=1)  # Placeholder salary

# Calculate Final Weighted Score
weights = {
    "Experience_Similarity": 0.3,
    "Skills_Similarity": 0.3,
    "Education_Match": 0.2,
    "Work_Location_Match": 0.1,
    "Salary_Match": 0.1
}

final_matches["Final_Score"] = (
    final_matches["Experience_Similarity"] * weights["Experience_Similarity"] +
    final_matches["Skills_Similarity"] * weights["Skills_Similarity"] +
    final_matches["Education_Match"] * weights["Education_Match"] +
    final_matches["Work_Location_Match"] * weights["Work_Location_Match"] +
    final_matches["Salary_Match"] * weights["Salary_Match"]
)

# Normalize Final Score
final_matches = normalize_scores(final_matches, ["Final_Score"])

# Save the updated matches
def save_results(df, filename):
    df.to_csv(f"{filename}", index=False)

save_results(final_matches, "final_candidate_job_matches.csv")

# Display first few rows
final_matches.head()



Unnamed: 0,Job_Title,Candidate_ID,Experience_Similarity,Skills_Similarity,Final_Score,ID,Experience,Education,Skills,Total_Experience_Years,...,Company_Name,Required_Skills,Experience_Level,Location,Salary_Range,No_of_Openings,Job_Description_Embeddings,Education_Match,Work_Location_Match,Salary_Match
0,Associate Content Writer,PERSON8,0.504431,0.399612,0.748102,PERSON8,lead content writer at twitter 2019present seo...,ma english new media institute ba communic...,content creation blogging seo optimization soc...,10,...,,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,0
1,Associate Content Writer,PERSON3,0.319047,0.052286,0.249021,PERSON3,marketing manager at tesla 2017present digital...,mba harvard ba marketing university of penns...,seo digital marketing google ads content strat...,11,...,,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,0
2,Associate Content Writer,PERSON11,0.22412,0.192845,0.291772,PERSON11,photographer at adobe 2019present junior photo...,bfa photography university certified profes...,photo editing image retouching visual storytel...,9,...,,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,0
3,Associate Content Writer,PERSON2,0.314788,0.013474,0.833252,PERSON2,data scientist at facebook 2019present machine...,phd data science berkeley ms ai carnegie mel...,data analysis deep learning tensorflow pytorch...,10,...,,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,1,0,0
4,Associate Content Writer,PERSON10,0.15352,0.213025,0.244535,PERSON10,photographer at coachedcom 2020present freelan...,bfa photography university cpp certificatio...,studio photography portrait photography editin...,10,...,,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load datasets
candidates_df = pd.read_csv("processed_dataset1.csv")
job_matches_df = pd.read_csv("candidate_job_matches.csv")
jobs_df = pd.read_csv("vectorized_employer_profiles.csv")

# Function to calculate Education Match
def education_match(candidate_edu, job_edu):
    education_levels = {"High School": 1, "Associate": 2, "Bachelor": 3, "Master": 4, "PhD": 5}
    candidate_level = education_levels.get(candidate_edu, 0)
    job_level = education_levels.get(job_edu, 0)
    return 1 if candidate_level >= job_level else 0

# Function to calculate Work Location Match
def location_match(candidate_location, job_location):
    if isinstance(candidate_location, str) and isinstance(job_location, str):
        return 1 if candidate_location.strip().lower() == job_location.strip().lower() else 0
    return 0

# Function to calculate Salary Match
def salary_match(candidate_salary, job_salary_range):
    try:
        job_salary_range = str(job_salary_range).replace("₹", "").replace(",", "").split(" - ")
        job_min_salary, job_max_salary = map(int, job_salary_range)
        return 1 if job_min_salary <= candidate_salary <= job_max_salary else 0
    except:
        return 0

# Normalize scores
def normalize_scores(df, columns):
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Merge job and candidate data
final_matches = job_matches_df.merge(candidates_df, left_on="Candidate_ID", right_on="ID")
final_matches = final_matches.merge(jobs_df, on="Job_Title")

# Compute additional matching parameters
final_matches["Education_Match"] = final_matches.apply(lambda row: education_match(row["Highest_Education"], "Bachelor"), axis=1)  # Placeholder job education level
final_matches["Work_Location_Match"] = final_matches.apply(lambda row: location_match(row.get("Location", ""), "Delhi"), axis=1)  # Placeholder candidate location
final_matches["Salary_Match"] = final_matches.apply(lambda row: salary_match(500000, row.get("Salary_Range", "")), axis=1)  # Placeholder salary

# Define Weights
weights = {
    "Experience_Similarity": 0.3,
    "Skills_Similarity": 0.3,
    "Education_Match": 0.2,
    "Work_Location_Match": 0.1,
    "Salary_Match": 0.1,
}

# Assign Weights Column
final_matches["Weighted_Experience"] = final_matches["Experience_Similarity"] * weights["Experience_Similarity"]
final_matches["Weighted_Skills"] = final_matches["Skills_Similarity"] * weights["Skills_Similarity"]
final_matches["Weighted_Education"] = final_matches["Education_Match"] * weights["Education_Match"]
final_matches["Weighted_Location"] = final_matches["Work_Location_Match"] * weights["Work_Location_Match"]
final_matches["Weighted_Salary"] = final_matches["Salary_Match"] * weights["Salary_Match"]

# Calculate Final Score
final_matches["Final_Score"] = (
    final_matches["Weighted_Experience"] +
    final_matches["Weighted_Skills"] +
    final_matches["Weighted_Education"] +
    final_matches["Weighted_Location"] +
    final_matches["Weighted_Salary"]
)

# Normalize Final Score
final_matches = normalize_scores(final_matches, ["Final_Score"])

# Compute Fit Percentage
final_matches["Fit_Percentage"] = final_matches["Final_Score"] * 100

# Rank Candidates for Each Job
final_matches["Rank"] = final_matches.groupby("Job_Title")["Final_Score"].rank(ascending=False, method='dense')

# Save the updated matches
def save_results(df, filename):
    df.to_csv(f"{filename}", index=False)

save_results(final_matches, "ranked_candidate_job_matches.csv")

# Display first few rows
final_matches.head()


Unnamed: 0,Job_Title,Candidate_ID,Experience_Similarity,Skills_Similarity,Final_Score,ID,Experience,Education,Skills,Total_Experience_Years,...,Education_Match,Work_Location_Match,Salary_Match,Weighted_Experience,Weighted_Skills,Weighted_Education,Weighted_Location,Weighted_Salary,Fit_Percentage,Rank
0,Associate Content Writer,PERSON8,0.504431,0.399612,0.748102,PERSON8,lead content writer at twitter 2019present seo...,ma english new media institute ba communic...,content creation blogging seo optimization soc...,10,...,0,0,0,0.151329,0.119883,0.0,0.0,0.0,74.810201,2.0
1,Associate Content Writer,PERSON3,0.319047,0.052286,0.249021,PERSON3,marketing manager at tesla 2017present digital...,mba harvard ba marketing university of penns...,seo digital marketing google ads content strat...,11,...,0,0,0,0.095714,0.015686,0.0,0.0,0.0,24.902107,6.0
2,Associate Content Writer,PERSON11,0.22412,0.192845,0.291772,PERSON11,photographer at adobe 2019present junior photo...,bfa photography university certified profes...,photo editing image retouching visual storytel...,9,...,0,0,0,0.067236,0.057853,0.0,0.0,0.0,29.177229,5.0
3,Associate Content Writer,PERSON2,0.314788,0.013474,0.833252,PERSON2,data scientist at facebook 2019present machine...,phd data science berkeley ms ai carnegie mel...,data analysis deep learning tensorflow pytorch...,10,...,1,0,0,0.094437,0.004042,0.2,0.0,0.0,83.325163,1.0
4,Associate Content Writer,PERSON10,0.15352,0.213025,0.244535,PERSON10,photographer at coachedcom 2020present freelan...,bfa photography university cpp certificatio...,studio photography portrait photography editin...,10,...,0,0,0,0.046056,0.063907,0.0,0.0,0.0,24.453513,7.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load datasets
candidates_df = pd.read_csv("processed_dataset1.csv")
job_matches_df = pd.read_csv("candidate_job_matches.csv")
jobs_df = pd.read_csv("vectorized_employer_profiles.csv")

# Function to calculate Education Match
def education_match(candidate_edu, job_edu):
    education_levels = {"High School": 1, "Associate": 2, "Bachelor": 3, "Master": 4, "PhD": 5}
    candidate_level = education_levels.get(candidate_edu, 0)
    job_level = education_levels.get(job_edu, 0)
    return 1 if candidate_level >= job_level else 0

# Function to calculate Work Location Match
def location_match(candidate_location, job_location):
    if isinstance(candidate_location, str) and isinstance(job_location, str):
        return 1 if candidate_location.strip().lower() == job_location.strip().lower() else 0
    return 0

# Normalize scores
def normalize_scores(df, columns):
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Merge job and candidate data
final_matches = job_matches_df.merge(candidates_df, left_on="Candidate_ID", right_on="ID")
final_matches = final_matches.merge(jobs_df, on="Job_Title")

# Compute additional matching parameters
final_matches["Education_Match"] = final_matches.apply(lambda row: education_match(row["Highest_Education"], "Bachelor"), axis=1)  # Placeholder
final_matches["Work_Location_Match"] = final_matches.apply(lambda row: location_match(row["Location"], "Delhi"), axis=1)  # Placeholder

# Normalize similarity scores before weighting
final_matches = normalize_scores(final_matches, ["Experience_Similarity", "Skills_Similarity"])

# Define dynamic weights
weights = {
    "Experience_Similarity": 0.35,
    "Skills_Similarity": 0.35,
    "Education_Match": 0.15,
    "Work_Location_Match": 0.15
}

# Calculate Final Weighted Score
final_matches["Final_Score"] = (
    final_matches["Experience_Similarity"] * weights["Experience_Similarity"] +
    final_matches["Skills_Similarity"] * weights["Skills_Similarity"] +
    final_matches["Education_Match"] * weights["Education_Match"] +
    final_matches["Work_Location_Match"] * weights["Work_Location_Match"]
)

# Normalize Final Score
final_matches = normalize_scores(final_matches, ["Final_Score"])

# Calculate Fit Percentage
final_matches["Fit_Percentage"] = (final_matches["Final_Score"] * 100).round(2)

# Rank candidates per job
def rank_candidates(df):
    return df.sort_values(by=["Final_Score"], ascending=False).reset_index(drop=True)

final_matches = final_matches.groupby("Job_Title", group_keys=False).apply(rank_candidates)
final_matches["Rank"] = final_matches.groupby("Job_Title")["Final_Score"].rank(ascending=False, method="dense")

# Save the updated matches
def save_results(df, filename):
    df.to_csv(f"{filename}", index=False)

save_results(final_matches, "Take3.csv")

# Display first few rows
final_matches.head()



  final_matches = final_matches.groupby("Job_Title", group_keys=False).apply(rank_candidates)


Unnamed: 0,Job_Title,Candidate_ID,Experience_Similarity,Skills_Similarity,Final_Score,ID,Experience,Education,Skills,Total_Experience_Years,...,Required_Skills,Experience_Level,Location,Salary_Range,No_of_Openings,Job_Description_Embeddings,Education_Match,Work_Location_Match,Fit_Percentage,Rank
0,Associate Content Writer,PERSON8,0.949925,0.868208,1.0,PERSON8,lead content writer at twitter 2019present seo...,ma english new media institute ba communic...,content creation blogging seo optimization soc...,10,...,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,100.0,1.0
1,Associate Content Writer,PERSON2,0.565632,0.200606,0.580462,PERSON2,data scientist at facebook 2019present machine...,phd data science berkeley ms ai carnegie mel...,data analysis deep learning tensorflow pytorch...,10,...,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,1,0,58.05,2.0
2,Associate Content Writer,PERSON11,0.381901,0.510724,0.377072,PERSON11,photographer at adobe 2019present junior photo...,bfa photography university certified profes...,photo editing image retouching visual storytel...,9,...,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,37.71,3.0
3,Associate Content Writer,PERSON3,0.574261,0.26771,0.342978,PERSON3,marketing manager at tesla 2017present digital...,mba harvard ba marketing university of penns...,seo digital marketing google ads content strat...,11,...,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,0,0,34.3,4.0
4,Associate Content Writer,PERSON7,0.237678,0.159299,0.331925,PERSON7,research scientist at nasa 2019present astroph...,phd astrophysics caltech ms physics mit bs a...,astrophysics computational modeling data scien...,10,...,"['Blogging', 'SEO', 'Digital Marketing', 'Soci...",,,"₹2,00,000 - ₹4,00,000/year",1,[-4.72556502e-02 -9.65758786e-02 -4.35541011e-...,1,0,33.19,5.0


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd

# Load the data (adjust the path if necessary)
df = pd.read_csv("/content/Take3.csv")

# Set page title
st.title("Candidate Recommendations")

# Sidebar for filtering
st.sidebar.header("Filters")

# Filter by job title
job_titles = df["Job_Title"].unique()
selected_job_title = st.sidebar.selectbox("Select Job Title:", job_titles)

# Filter by minimum fit percentage
min_fit_percentage = st.sidebar.slider("Minimum Fit Percentage:", 0, 100, 0)

# Filter data based on selected options
filtered_df = df[
    (df["Job_Title"] == selected_job_title) & (df["Fit_Percentage"] >= min_fit_percentage)
]

# Display employers and ranked candidates
st.subheader("Recommended Candidates:")

# Group the filtered DataFrame by Job_Title
for job_title, group in filtered_df.groupby("Job_Title"):
    st.markdown(f"### Job Title: {job_title}")

    # Prepare a smaller DataFrame for display
    display_df = group[["Rank", "Candidate_ID", "Fit_Percentage"]].copy()

    # Format the Fit_Percentage as a percentage with 2 decimals
    display_df["Fit_Percentage"] = display_df["Fit_Percentage"].apply(lambda x: f"{x:.2f}%")

    # Display as a table
    st.table(display_df)


In [None]:
from pyngrok import ngrok
import time

# 1) Start ngrok tunnel on port 8501
public_url = ngrok.connect(8501)
print("Public URL:", public_url)

# 2) Run Streamlit in the background
!streamlit run app.py &

# 3) Wait a few seconds for Streamlit to spin up
time.sleep(5)

print("If the URL above isn't working right away, wait a bit and then try refreshing.")
