In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import openai
from dotenv import dotenv_values


# Read the OpenAI API key from the local .env file so the secret never
# appears in the notebook itself.
config = dotenv_values(".env")
if "API_KEY" not in config:
    # Same exception type as a plain dictionary lookup, but with a message
    # that tells the reader how to fix the problem.
    raise KeyError("API_KEY not found in .env; add a line like API_KEY=<your key>")
api_key = config["API_KEY"]

# Initialize OpenAI client with API key
client = openai.OpenAI(api_key=api_key)

# Function to embed a batch of texts using OpenAI's embedding model
def embed_text_batch(texts, batch_size=2048):
    """Embed texts with OpenAI's ``text-embedding-3-small`` model.

    Args:
        texts: A single string, or an iterable of strings, to embed.
        batch_size: Maximum number of texts sent in one API request. The
            default of 2048 is assumed to match the per-request input limit
            of the embeddings endpoint (verify against the current API docs).

    Returns:
        A list containing one embedding (a list of floats) per input text,
        in the same order as the inputs. Empty input yields an empty list
        without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is a single input; anything else is materialised so it
    # can be sliced into request-sized chunks.
    texts = [texts] if isinstance(texts, str) else list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            input=texts[start:start + batch_size],  # The texts to be embedded
            model="text-embedding-3-small"  # The model used for embedding
        )
        # Each returned item carries the position of its input; sorting on it
        # keeps outputs aligned with inputs regardless of response order.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings  # Return the embeddings

# Load the new CSV file
file_path = 'bigNewWalletData.csv'
df = pd.read_csv(file_path)

# Shuffle the rows and keep a random subset of at most 50 of them.
# Thomas: the CSV being passed in only has 200 rows.
# Jonathan: the sample size was changed from 1000 to 50.
# The size is capped at len(df) because DataFrame.sample raises when asked
# for more rows than exist (sampling without replacement).
sample_size = min(50, len(df))
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)


# Prepare the text data and labels
X = df['Address'].tolist()
y = df['Label']

# Embed the text data
embeddings = embed_text_batch(X)

# Convert embeddings to a numpy array
X_embeddings = np.array(embeddings)

# Apply PCA to reduce the dimensionality of the embeddings to 10 components
# (fewer only when the data has under 10 rows or columns, which PCA rejects).
# NOTE(review): PCA is fitted on every sampled row before the train/test
# split, so the test rows influence the projection; consider fitting on the
# training rows only.
n_components = min(10, *X_embeddings.shape)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_embeddings)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate confusion matrix. Pinning the labels to the classes the model was
# trained on keeps the matrix shape stable even when the small test split
# happens to contain (and predict) only one class.
conf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)

# Display the confusion matrix
# NOTE(review): the tick labels assume the sorted class order corresponds to
# "Private" then "Public" -- confirm against the values in the Label column.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Private", "Public"], yticklabels=["Private", "Public"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

# Create a DataFrame for the test set with original columns
df_test = df.loc[y_test.index].copy()
# Store each test row's PCA vector. X_test is row-aligned with y_test, whereas
# the first len(y_test) rows of X_pca belong to unrelated samples.
# Note this is a new 'ADDRESS' column, separate from the original 'Address'.
df_test['ADDRESS'] = [list(row) for row in X_test]
df_test['Predicted Label'] = y_pred

# Save the test results to a CSV
# Thomas: the output file name has to differ from the name of the input file.
results_csv_path = 'nw_test_results.csv'
df_test.to_csv(results_csv_path, index=False)

print(f"Test results saved to {results_csv_path}")

NameError: name 'config' is not defined

In [None]:
import joblib

# Persist the fitted classifier held in `model` (trained in the cell above)
# so it can be reloaded later without retraining.
joblib.dump(value=model, filename='WalletClassifier.pkl')

In [10]:
# Sanity check: run this cell to read the saved model back in.

# Restore the classifier from the file written by joblib.dump
loaded_model = joblib.load(filename='WalletClassifier.pkl')
loaded_model

In [19]:
# Per-class probabilities from the reloaded model for the held-out test rows
loaded_model.predict_proba(X=X_test)

array([[0.37167275, 0.62832725],
       [0.34881526, 0.65118474],
       [0.38626617, 0.61373383],
       [0.62088547, 0.37911453],
       [0.60721039, 0.39278961],
       [0.63717296, 0.36282704],
       [0.38113952, 0.61886048],
       [0.36936055, 0.63063945],
       [0.35730239, 0.64269761],
       [0.396649  , 0.603351  ]])