In [1]:
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Simulate a larger dataset (500 rows) based on the 14 rows provided
np.random.seed(42)
categories = ["legal", "engineering", "marketing", "finance", "design", "writing", "programming"]
skills_dict = {
    "legal": ["Tax Law", "Intellectual Property", "Contract Law", "Corporate Law", "Employment Law"],
    "engineering": ["CAD Modeling", "Mechanical Design", "Civil Engineering", "Automation", "Electrical Systems"],
    "marketing": ["Social Media Marketing", "Google Ads", "Email Marketing", "SEO", "Brand Strategy"],
    "finance": ["Accounting", "Investment Analysis", "Tax Compliance", "Financial Modeling"],
    "design": ["Graphic Design", "UI/UX Design", "3D Modeling", "Illustration", "Logo Design"],
    "writing": ["Copywriting", "Scriptwriting", "Ghostwriting", "SEO Writing"],
    "programming": ["Python", "Machine Learning", "Java", "React", "Django"]
}

# Generate project descriptions
def generate_project(category, skills):
    industry = random.choice(["e-commerce", "tech startup", "legal firm", "manufacturing firm", "corporate client", "startup", "publication", "online store"])
    skill = random.choice(skills.split(", "))
    if category == "legal":
        return f"Handled a {skill}-specific case for a {industry}."
    elif category == "engineering":
        return f"Designed a {skill}-oriented prototype for a {industry}."
    elif category == "marketing":
        return f"Developed a {skill}-driven campaign for an {industry}."
    elif category == "finance":
        return f"Provided {skill}-based consultancy for a {industry}."
    elif category == "design":
        return f"Created a {skill}-based design concept for a {industry}."
    elif category == "writing":
        return f"Wrote a {skill}-focused series of articles for a {industry}."
    else:  # programming
        return f"Built a project using {skill} for a {industry}."

# Create the dataset
data = []
for i in range(1, 501):
    category = random.choice(categories)
    skills = ", ".join(random.sample(skills_dict[category], random.randint(2, 4)))
    data.append({
        "freelancer_id": f"FR{i:06d}",
        "skills": skills,
        "main_category": category,
        "experience_years": random.randint(5, 30),
        "hourly_rate": round(random.uniform(15, 300), 2),
        "rating": round(random.uniform(3.5, 5.0), 1),
        "jobs_completed": random.randint(10, 500),
        "success_rate": round(random.uniform(0.7, 0.98), 2),
        "response_time_hours": round(random.uniform(5, 50), 1),
        "on_time_delivery_rate": round(random.uniform(0.7, 0.95), 2),
        "average_client_reviews": round(random.uniform(50, 900), 2),
        "availability": random.randint(10, 600),
        "revision_rate": round(random.uniform(0.1, 3.0), 1),
        "projects": generate_project(category, skills)
    })

df = pd.DataFrame(data)
print("Dataset Shape:", df.shape)
df.head()

  _torch_pytree._register_pytree_node(


Dataset Shape: (500, 14)


Unnamed: 0,freelancer_id,skills,main_category,experience_years,hourly_rate,rating,jobs_completed,success_rate,response_time_hours,on_time_delivery_rate,average_client_reviews,availability,revision_rate,projects
0,FR000001,"Google Ads, Social Media Marketing",marketing,13,274.72,3.6,222,0.76,26.2,0.88,214.06,59,2.7,Developed a Google Ads-driven campaign for an ...
1,FR000002,"Tax Compliance, Investment Analysis",finance,7,120.07,4.4,14,0.73,5.2,0.74,871.91,587,0.2,Provided Investment Analysis-based consultancy...
2,FR000003,"Contract Law, Tax Law, Intellectual Property, ...",legal,9,40.98,3.8,10,0.94,45.1,0.82,715.7,57,1.4,Handled a Contract Law-specific case for a onl...
3,FR000004,"CAD Modeling, Automation, Mechanical Design, E...",engineering,25,32.12,3.7,93,0.73,33.3,0.9,744.29,268,2.4,Designed a CAD Modeling-oriented prototype for...
4,FR000005,"UI/UX Design, Logo Design, 3D Modeling",design,28,148.02,4.9,444,0.81,38.4,0.94,524.65,561,1.1,Created a 3D Modeling-based design concept for...


In [2]:
# Combine text fields into a single profile text
df["profile_text"] = df.apply(lambda row: f"Skills: {row['skills']}; Category: {row['main_category']}; Projects: {row['projects']}", axis=1)

# Simulate client projects by modifying the projects column
client_projects = []
ground_truth = []
for idx, row in df.iterrows():
    project = row["projects"]
    # Slightly modify the project description
    modified_project = project.replace("Handled", "Looking for a freelancer to handle").replace("Designed", "Need a prototype with").replace("Developed", "Require").replace("Provided", "Seeking").replace("Created", "Require").replace("Wrote", "Require").replace("Built", "Seeking")
    client_projects.append(modified_project)
    ground_truth.append(idx)  # Ground truth: the freelancer who did this project

# Generate embeddings using Sentence-BERT
model = SentenceTransformer('all-MiniLM-L6-v2')
freelancer_embeddings = model.encode(df["profile_text"].tolist(), show_progress_bar=True)
client_embeddings = model.encode(client_projects, show_progress_bar=True)

print("Freelancer Embeddings Shape:", freelancer_embeddings.shape)
print("Client Embeddings Shape:", client_embeddings.shape)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Freelancer Embeddings Shape: (500, 384)
Client Embeddings Shape: (500, 384)


In [3]:
# Build FAISS index
d = freelancer_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 50)  # 50 clusters for IVF
index.train(freelancer_embeddings)
index.add(freelancer_embeddings)
index.nprobe = 10  # Number of clusters to search

# Search for top-k matches
k = 10
distances, indices = index.search(client_embeddings, k)

# Calculate baseline FAISS performance
faiss_precision_at_5 = []
faiss_mrr = 0.0
for i, idx in enumerate(indices):
    true_idx = i
    recommended = idx.tolist()
    # Precision@5
    precision = 1.0 if true_idx in recommended[:5] else 0.0
    faiss_precision_at_5.append(precision)
    # MRR
    rank = np.where(idx == true_idx)[0]
    if len(rank) > 0:
        faiss_mrr += 1.0 / (rank[0] + 1)

faiss_precision_at_5 = np.mean(faiss_precision_at_5)
faiss_mrr /= len(indices)
print(f"FAISS Baseline - Precision@5: {faiss_precision_at_5:.3f}")
print(f"FAISS Baseline - MRR: {faiss_mrr:.3f}")

FAISS Baseline - Precision@5: 0.958
FAISS Baseline - MRR: 0.620


In [6]:
# Create a dataset of (client_project, freelancer) pairs
X_pairs = []
y_pairs = []
numerical_features = ["experience_years", "hourly_rate", "rating", "jobs_completed", "success_rate", "response_time_hours", "on_time_delivery_rate", "average_client_reviews", "availability", "revision_rate"]
categorical_features = ["main_category"]

for client_idx in range(len(client_projects)):
    for freelancer_idx in range(len(df)):
        # Features: Concatenate client embedding, freelancer embedding, and freelancer numerical features
        client_emb = client_embeddings[client_idx]
        freelancer_emb = freelancer_embeddings[freelancer_idx]
        numerical_vals = df[numerical_features].iloc[freelancer_idx].values
        categorical_vals = df[categorical_features].iloc[freelancer_idx].values
        pair_features = np.concatenate([client_emb, freelancer_emb, numerical_vals])
        X_pairs.append(pair_features)
        # Label: 1 if this freelancer is the ground truth match, 0 otherwise
        y_pairs.append(1 if freelancer_idx == ground_truth[client_idx] else 0)

X_pairs = np.array(X_pairs)
y_pairs = np.array(y_pairs)

# Handle class imbalance (many more negatives than positives)
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_pairs, y_pairs = smote.fit_resample(X_pairs, y_pairs)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_pairs, y_pairs, test_size=0.2, random_state=42)

# Preprocess numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(X_train[:, -len(numerical_features):])
X_test_numerical = scaler.transform(X_test[:, -len(numerical_features):])
X_train[:, -len(numerical_features):] = X_train_numerical
X_test[:, -len(numerical_features):] = X_test_numerical

print("Training Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)

Training Data Shape: (399200, 778)
Test Data Shape: (99800, 778)


In [5]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Dictionary to store results
results = {}

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
results["Logistic Regression"] = {
    "accuracy": accuracy_score(y_test, lr_pred),
    "f1": f1_score(y_test, lr_pred),
    "precision": precision_score(y_test, lr_pred),
    "recall": recall_score(y_test, lr_pred),
    "confusion_matrix": confusion_matrix(y_test, lr_pred)
}



In [None]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
results["Random Forest"] = {
    "accuracy": accuracy_score(y_test, rf_pred),
    "f1": f1_score(y_test, rf_pred),
    "precision": precision_score(y_test, rf_pred),
    "recall": recall_score(y_test, rf_pred),
    "confusion_matrix": confusion_matrix(y_test, rf_pred)
}

# SVM
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
results["SVM"] = {
    "accuracy": accuracy_score(y_test, svm_pred),
    "f1": f1_score(y_test, svm_pred),
    "precision": precision_score(y_test, svm_pred),
    "recall": recall_score(y_test, svm_pred),
    "confusion_matrix": confusion_matrix(y_test, svm_pred)
}

# Neural Network
nn_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=0)
nn_pred = (nn_model.predict(X_test) > 0.5).astype(int).flatten()
results["Neural Network"] = {
    "accuracy": accuracy_score(y_test, nn_pred),
    "f1": f1_score(y_test, nn_pred),
    "precision": precision_score(y_test, nn_pred),
    "recall": recall_score(y_test, nn_pred),
    "confusion_matrix": confusion_matrix(y_test, nn_pred)
}

# Print results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    print(f"Accuracy: {metrics['accuracy']:.3f}")
    print(f"F1 Score: {metrics['f1']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")