In [None]:
!pip install datasets
!pip install gensim
# !pip install transformers[torch] accelerate datasets -U
# !pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Load the labelled training set and the separate hold-out evaluation set.
df_train = pd.read_csv("final_gpt_match_output_clean_res_20240727213717.csv")
df_test = pd.read_csv("holdout_gpt_match_output_checkpoint_40_20240727235744.csv")

# df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Raw annotation -> binary target (1 = good fit, 0 = everything else).
label_mapping = {
    'poor fit': 0,
    'good fit': 1,
    'Good Fit': 1,
    'No Fit': 0,
    'Potential Fit': 0
}

# Lookup is done on trimmed, lower-cased labels so that spelling variants
# ('Good Fit', 'good fit ', 'POOR FIT', ...) all resolve to the same class
# instead of silently becoming NaN.
normalized_label_mapping = {raw.strip().lower(): target for raw, target in label_mapping.items()}


def _encode_labels(frame, frame_name):
    """Return a copy of `frame` whose 'label' column holds integer class ids.

    Args:
        frame: DataFrame with a raw 'label' column.
        frame_name: Human-readable name used in the notice printed when rows
            are discarded.

    Returns:
        A new DataFrame containing only the rows whose label could be mapped
        through `normalized_label_mapping`, with 'label' cast to int.
    """
    encoded = frame['label'].map(
        lambda raw: normalized_label_mapping.get(raw.strip().lower(), np.nan)
        if isinstance(raw, str) else np.nan
    )
    unmapped = encoded.isna()
    if unmapped.any():
        # Surface discarded rows rather than letting NaN labels vanish inside
        # groupby (train) or crash the metrics call (test).
        unknown_values = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        print(f"{frame_name}: dropping {int(unmapped.sum())} row(s) with unrecognised labels: {unknown_values}")
    return frame.loc[~unmapped].assign(label=encoded[~unmapped].astype(int))


df_train = _encode_labels(df_train, "train")
df_test = _encode_labels(df_test, "test")

# Balance the training set by undersampling every class to the size of the
# rarest one. Sampling each group explicitly and concatenating keeps the
# 'label' column on every pandas version; groupby.apply on the grouping column
# is deprecated in recent pandas releases.
min_class_size = df_train['label'].value_counts().min()
df_train = pd.concat(
    [group.sample(min_class_size, random_state=42) for _, group in df_train.groupby('label')]
).reset_index(drop=True)

In [None]:
# Tokenize the text data
def tokenize(text):
    """Split `text` into whitespace-delimited tokens.

    Args:
        text: The raw string to tokenize. Any non-string value (for example
            ``None``, or the float ``NaN`` that pandas produces for an empty
            CSV cell) is treated as missing text.

    Returns:
        list[str]: The tokens in their original order; ``[]`` when `text` is
        empty, whitespace-only, or not a string.
    """
    # Missing cells arrive as float NaN, which has no .split(); return an empty
    # token list so `.apply(tokenize)` cannot raise on incomplete rows.
    if not isinstance(text, str):
        return []
    return text.split()

# Add token-list columns for the resume skills and the job description to
# both the training and the hold-out frame.
for frame in (df_train, df_test):
    frame['resume_tokens'] = frame['resume_skills'].apply(tokenize)
    frame['jd_tokens'] = frame['job_desc'].apply(tokenize)

In [None]:
# Build the Word2Vec corpus from the training split only: every resume and
# every job description contributes one token list ("sentence").
all_tokens = df_train['resume_tokens'].tolist() + df_train['jd_tokens'].tolist()

# Train a Word2Vec model.
# A fixed seed plus a single worker thread makes the embeddings repeatable
# between runs; with several workers, thread scheduling changes the order of
# updates and therefore the resulting vectors and every metric built on them.
# (Reproducibility across interpreter launches additionally needs a fixed
# PYTHONHASHSEED, per the gensim documentation.)
model = Word2Vec(
    sentences=all_tokens,
    vector_size=100,
    window=7,
    min_count=1,
    workers=1,
    seed=42,
)

# Function to get the average Word2Vec vector for a text
def get_avg_word2vec(text, model):
    """Average the embeddings of the tokens in `text` that the model knows.

    Args:
        text: Iterable of tokens.
        model: Object exposing `wv` (supports `token in wv` and `wv[token]`)
            and `vector_size`.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length `model.vector_size` when none of the tokens is known.
    """
    known_vectors = [model.wv[token] for token in text if token in model.wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is in the vocabulary: fall back to an all-zero embedding.
    return np.zeros(model.vector_size)

In [None]:
# Turn every token list into its averaged Word2Vec embedding.
def _embed_documents(token_lists):
    """Return an array holding one averaged embedding per token list."""
    return np.array([get_avg_word2vec(tokens, model) for tokens in token_lists])


X_train_resume = _embed_documents(df_train['resume_tokens'])
X_train_jd = _embed_documents(df_train['jd_tokens'])

X_test_resume = _embed_documents(df_test['resume_tokens'])
X_test_jd = _embed_documents(df_test['jd_tokens'])

# Place the resume embedding and the job-description embedding side by side
# so each row describes one (resume, job) pair.
X_train_combined = np.hstack([X_train_resume, X_train_jd])
X_test_combined = np.hstack([X_test_resume, X_test_jd])

# Targets for the two splits.
y_train = df_train['label']
y_test = df_test['label']

In [None]:
# Sanity check: one row per training pair, and 200 columns because two
# 100-dimensional averaged embeddings (resume + job description) are stacked.
X_train_combined.shape

(360, 200)

In [None]:
# Fit a linear-kernel support vector classifier on the combined embeddings
# (fit() returns the fitted estimator, so construction and fitting chain).
svm_model = SVC(kernel='linear').fit(X_train_combined, y_train)

# Predict labels for the hold-out pairs
y_pred = svm_model.predict(X_test_combined)

# Per-class precision / recall / F1 on the hold-out set
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      0.74      0.84        35
           1       0.31      0.80      0.44         5

    accuracy                           0.75        40
   macro avg       0.64      0.77      0.64        40
weighted avg       0.88      0.75      0.79        40

