In [2]:
import pandas as pd
import numpy as np
import os

In [11]:
# Directory (relative to the current working directory) that holds the CSV inputs.
DATA_PATH = os.path.join(os.getcwd(), 'data', 'successful')


def _read_successful_csv(filename):
    """Load the CSV called `filename` from DATA_PATH into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_PATH, filename))


# One DataFrame per input file; the reads happen in the order listed.
segmented_enriched_linkedin_profiles = _read_successful_csv('success_enriched_linkedin_profiles.csv')
successful_profiles = _read_successful_csv('successful_profiles.csv')
segmented_successful_profiles = _read_successful_csv('segmented_successful_profiles_gpt-3.5-turbo_4.csv')
long_company_descriptions = _read_successful_csv("Moneyball 1.1_ Success - Long company descriptions.csv")

In [21]:
# Sanity check: show the first company description row and the first
# segmented paragraph (positional slice of one row each).
print(long_company_descriptions.iloc[:1])
print(segmented_successful_profiles['paragraph'].iloc[:1])

      org_name                              org_uuid  \
0  10X Banking  a2323b6c-29b9-4750-905c-cdcbd9ce92b8   

                                    long_description  
0  10x Banking (10x), a financial services techno...  
0    Ryan Johns is known for their contribution as ...
Name: paragraph, dtype: object


In [16]:
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

In [15]:
!pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/db/af/18b551ae8d26b8731dbe5923565fdf96502bb9aca88a37f241d510c62dc2/gensim-4.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata
  Downloading gensim-4.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/ad/08/dcd19850b79f72e3717c98b2088f8a24b549b29ce66849cd6b7f44679683/smart_open-7.0.1-py3-none-any.whl.metadata
  Downloading smart_open-7.0.1-py3-none-any.whl.metadata (23 kB)
Downloading gensim-4.3.2-cp39-cp39-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.1-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m2.3 MB/s[0m eta 

In [18]:
def load_data_and_pretrained_model(dataset_path, word2vec_model_path):
    """Read the dataset CSV and the pre-trained word vectors.

    Args:
        dataset_path: Path of the CSV file passed to ``pd.read_csv``.
        word2vec_model_path: Path of a binary word2vec-format file passed to
            ``KeyedVectors.load_word2vec_format(..., binary=True)``.

    Returns:
        A ``(dataframe, keyed_vectors)`` tuple.
    """
    dataset = pd.read_csv(dataset_path)
    keyed_vectors = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
    return dataset, keyed_vectors

def document_vector(doc, model):
    """Average the embeddings of the in-vocabulary tokens of a document.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``key_to_index`` (membership test),
            ``vector_size`` and ``model[list_of_tokens]`` lookup.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.key_to_index``, or a zero vector of length
        ``model.vector_size`` when no token is in the vocabulary.
    """
    vocabulary = model.key_to_index
    known_tokens = [token for token in doc if token in vocabulary]
    if known_tokens:
        return np.mean(model[known_tokens], axis=0)
    # Nothing to average: fall back to an all-zero embedding.
    return np.zeros(model.vector_size)

def prepare_embeddings(df, model):
    """Turn the ``paragraph`` column of ``df`` into a matrix of document vectors.

    Each paragraph is tokenized on whitespace and embedded with
    ``document_vector``. Values that are not strings (e.g. NaN produced by
    ``pd.read_csv`` for empty cells) are treated as empty documents instead of
    raising ``AttributeError``, so the output keeps one row per input row.

    Args:
        df: DataFrame with a ``paragraph`` column.
        model: Embedding model forwarded to ``document_vector``; its
            ``vector_size`` is used for the shape of an empty result.

    Returns:
        A NumPy array with one document vector per row of ``df``; an empty
        ``(0, model.vector_size)`` array when ``df`` has no rows.
    """
    # Simple whitespace tokenization; non-string cells become empty token lists.
    token_lists = [
        text.split() if isinstance(text, str) else []
        for text in df['paragraph']
    ]
    if not token_lists:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, model.vector_size))
    return np.array([document_vector(tokens, model) for tokens in token_lists])

def encode_labels(df):
    """Map the ``segment`` column of ``df`` to integer codes.

    Args:
        df: Mapping/DataFrame with a ``segment`` column.

    Returns:
        ``(codes, encoder)`` where ``codes`` is the output of
        ``LabelEncoder.fit_transform`` on the column and ``encoder`` is the
        fitted ``LabelEncoder`` (needed later to decode predictions).
    """
    encoder = LabelEncoder()
    segment_codes = encoder.fit_transform(df['segment'])
    return segment_codes, encoder

def train_knn_classifier(X, y, n_neighbors=5, test_size=0.2, random_state=42):
    """Fit a k-nearest-neighbours classifier on a training split of the data.

    The data is split with ``train_test_split`` and the classifier is fitted
    on the training portion only. The held-out portion is not used or returned
    by this function.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.
        n_neighbors: Number of neighbours passed to ``KNeighborsClassifier``.
        test_size: Held-out fraction/size passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    # Only the training portion is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    return knn

def predict_level(description, model, knn_classifier, label_encoder):
    """Predict the segment label for a single free-text description.

    Args:
        description: Text to classify; tokenized on whitespace.
        model: Embedding model forwarded to ``document_vector``.
        knn_classifier: Fitted classifier exposing ``predict``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        The decoded label for the description.
    """
    tokens = description.split()
    # The classifier expects a 2-D input, so present the vector as one row.
    features = document_vector(tokens, model).reshape(1, -1)
    predicted_codes = knn_classifier.predict(features)
    return label_encoder.inverse_transform(predicted_codes)[0]

In [ ]:
# Example usage: build embeddings for the segmented profiles, train the KNN
# classifier on them, then classify one new description.
dataset_path = os.path.join(DATA_PATH, 'segmented_successful_profiles_gpt-3.5-turbo_4.csv')
# Bare file name, so it is resolved relative to the current working directory.
word2vec_model_path = 'GoogleNews-vectors-negative300.bin'  # Path to Google's pre-trained Word2Vec model
df, w2v_model = load_data_and_pretrained_model(dataset_path, word2vec_model_path)
# X: one document vector per row of df; y: integer-encoded 'segment' labels.
X = prepare_embeddings(df, w2v_model)
y, le = encode_labels(df)
knn_classifier = train_knn_classifier(X, y)

# Predict the level for a new founder description
# (placeholder text below; replace it with a real description).
new_description = "Your founder description here."
# `le` decodes the classifier's integer prediction back to the original label.
predicted_level = predict_level(new_description, w2v_model, knn_classifier, le)
print(f"Predicted level: {predicted_level}")