In [None]:
import ast
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# spaCy's `en_core_web_sm` pipeline; used further down to lemmatise each paragraph.
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Load the raw text of the saved, labelled dataset into `data`.
# The text is parsed into (paragraph, label) pairs in a later cell.
labelled_path = r"data/classifier_data/labelled_01.txt"
with open(labelled_path, encoding='utf8') as labelled_file:
    data = labelled_file.read()

In [None]:
# Parse the file text into a list of (paragraph_text, integer_label) pairs.
# The unpacking and `.strip()` below imply the file holds a Python literal made
# of (text, label) pairs whose labels are strings such as "1" or " 0 ".
# SECURITY: this used to call eval(), which executes any code embedded in the
# file. ast.literal_eval only accepts plain literals (strings, numbers, tuples,
# lists, ...) and raises ValueError/SyntaxError on anything else.
labeled_data = [
    (textstr, int(label.strip()))
    for textstr, label in ast.literal_eval(data)
]

In [None]:
# Split the parsed pairs into parallel feature / target lists.
# `labeled_data` already holds (text, int_label) tuples, so reuse it instead of
# re-evaluating the raw file text and re-parsing the labels a second and third time.
X_vals = [textstr for textstr, _ in labeled_data]  # 212 labelled paragraphs at the time of writing
y_vals = [label for _, label in labeled_data]

In [None]:
# Report the class balance of the labelled set (label 1 vs. label 0).
if not y_vals:
    # Fail with a clear message rather than a ZeroDivisionError below.
    raise ValueError("No labelled examples were loaded; cannot compute class balance.")

n_labelled = len(y_vals)
pct_with_tech = 100 * y_vals.count(1) / n_labelled
pct_without_tech = 100 * y_vals.count(0) / n_labelled
# Format with :.0f instead of round(x, 2) * 100, which leaks float artefacts
# such as 56.99999999999999 into the message.
print(f"About {pct_with_tech:.0f}% of our labeled data contain tech names, leaving about {pct_without_tech:.0f}% which do not.")

In [None]:
# Tokenize/preprocess text using spaCy: replace every token with its lemma and
# re-join the lemmas with single spaces, giving one plain string per paragraph.
# nlp.pipe streams the texts through the pipeline in batches and yields the
# docs in input order, which avoids the per-call overhead of nlp(paragraph).
processed = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(X_vals)
]

# Hold out 20% of the paragraphs for evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    processed, y_vals, test_size=0.2, random_state=10
)

In [None]:
# Learn the TF-IDF vocabulary and weights from the training split only, then
# apply the fitted vectorizer to the test split.
tfidf_vectorizer = TfidfVectorizer(
    # Vocabulary capped at 1000 terms. Open question from the original author:
    # with a corpus this small the cap may not be needed at all.
    max_features=1000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic regression with scikit-learn's default hyperparameters.
clf = LogisticRegression()
clf.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Accuracy on the held-out split: the fraction of test paragraphs whose
# predicted label matches the true label.
acc = accuracy_score(y_test, clf.predict(X_test_tfidf))
print(acc)

In [None]:
# Predicted labels for the held-out paragraphs; used for the confusion matrix below.
y_pred = clf.predict(X=X_test_tfidf)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split, drawn as an
# annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("True")
plt.show()

In [None]:
# Save the trained classifier and the fitted TF-IDF vectorizer (uncomment the lines below to write the files)
# with open("job_desc_classifier_v1.0.pkl", "wb") as model_file:
#     pickle.dump(clf, model_file)
# with open("job_desc_tfidf_vectorizerv1.0.pkl", "wb") as vect_file:
#     pickle.dump(tfidf_vectorizer, vect_file)

In [None]:
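# To re-use the saved model and vectorizer, uncomment the lines below.
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.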
# with open("job_desc_classifier_v1.0.pkl", "rb") as model_file:
#     clf = pickle.load(model_file)
# with open("job_desc_tfidf_vectorizerv1.0.pkl", "rb") as vect_file:
#     tfidf_vectorizer = pickle.load(vect_file)