In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Download necessary nltk resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample CVs and their corresponding personality labels
cv_data = [
    ("I am a highly motivated individual with a passion for learning and exploring new ideas. I have strong communication and teamwork skills.", "Conscientiousness"),
    ("I love meeting new people and enjoy working in collaborative environments. I am always ready to take on new challenges and thrive under pressure.", "Extraversion"),
    ("I am a creative thinker with a knack for problem-solving. I enjoy thinking outside the box and experimenting with new approaches.", "Openness"),
    ("I am empathetic and always try to see things from others' perspectives. I believe in fostering positive relationships and creating harmony in any situation.", "Agreeableness"),
    ("I tend to worry a lot and can be quite sensitive to criticism. However, I am working on managing my emotions better and focusing on the positive aspects of life.", "Neuroticism")
]

# Preprocessing function
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    return ' '.join(tokens)

# Preprocess CV data
processed_data = [(preprocess_text(cv), label) for cv, label in cv_data]

# Extract features using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform([cv for cv, label in processed_data])
y = [label for cv, label in processed_data]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear SVC model
clf = LinearSVC()
clf.fit(X_train, y_train)

# Predict personality traits for test data
y_pred = clf.predict(X_test)

# Evaluate model performance
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


              precision    recall  f1-score   support

Extraversion       0.00      0.00      0.00       1.0
    Openness       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
