In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from ActivePyTools.grab_data import eval_object_columns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import chardet

In [None]:
def grab_df_data(df_path):
    """Read a CSV file into a DataFrame using an auto-detected text encoding.

    The file's raw bytes are handed to ``chardet.detect`` and the encoding it
    reports is passed to ``pd.read_csv``. The parsed frame is then run through
    ``eval_object_columns`` before being returned.

    Args:
        df_path: Path of the CSV file to load.

    Returns:
        The result of ``eval_object_columns`` applied to the parsed frame
        (presumably a DataFrame whose object columns were evaluated into
        Python objects -- confirm against ActivePyTools.grab_data).
    """
    # Binary mode: chardet needs the undecoded bytes to guess the encoding.
    with open(df_path, 'rb') as raw_file:
        raw_bytes = raw_file.read()

    detection = chardet.detect(raw_bytes)
    parsed = pd.read_csv(df_path, encoding=detection['encoding'])
    return eval_object_columns(parsed)

# Feature table that also carries the target; 'label' is cast to integer dtype.
norm_df = grab_df_data('./data/combined_norm_df.csv').astype({'label': int})
# Text-vector table, loaded through the same helper.
text_vec = grab_df_data('./data/text_vector.csv')

In [None]:
# Separate the feature columns from the integer target.
X = norm_df.drop(columns=['label'])
y = norm_df['label']

# Two successive 80/20 splits: first hold out the test set, then carve a
# validation set out of the remainder. Both splits are stratified on the label
# so every subset keeps the class ratio of the full data; the SMOTE step below
# suggests the classes are imbalanced, and an unstratified split could leave
# the held-out sets with very few minority samples.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val
)

# Oversample the training portion only, so the validation and test sets keep
# containing real samples exclusively.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Metrics collected per forest size, in the same order as the n_estimators grid.
scores = []
precisions, recalls, specificities = [], [], []

for n in np.arange(10, 1000, 10):
    # Seeded so the sweep is reproducible, like the split and SMOTE steps.
    clf = RandomForestClassifier(n_estimators=n, random_state=42)

    clf.fit(X_resampled, y_resampled)

    # cross_val_score clones the estimator and refits it on folds of the full,
    # non-resampled X / y, so this score does not depend on the fit above.
    score = cross_val_score(clf, X, y, cv=5)
    scores.append(score.mean())

    # Hold-out metrics come from the forest fitted on the SMOTE-resampled data.
    y_pred = clf.predict(X_test)
    cm1 = confusion_matrix(y_test, y_pred)
    precision1 = precision_score(y_test, y_pred)
    recall1 = recall_score(y_test, y_pred)

    # Specificity = TN / (TN + FP), read from the first row of the matrix.
    TN1 = cm1[0, 0]
    FP1 = cm1[0, 1]
    negatives1 = TN1 + FP1
    # No negatives in the test set -> specificity is undefined; record NaN
    # explicitly instead of relying on a division-by-zero warning.
    specificity1 = TN1 / negatives1 if negatives1 else float('nan')

    precisions.append(precision1)
    recalls.append(recall1)
    specificities.append(specificity1)

In [None]:
import matplotlib.pyplot as plt

# n_estimators values for the x-axis. This must match the grid used by the
# sweep loop that filled scores / precisions / recalls / specificities, since
# each list holds one entry per grid value.
rang = np.arange(10, 1000, 10)

# One figure with a line per metric.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(rang, scores, label='RF Scores')
ax.plot(rang, precisions, label='precisions')
ax.plot(rang, recalls, label='recalls')
ax.plot(rang, specificities, label='specificities')

# Title and axis labels. The x label is built from the grid itself so the
# stated range always agrees with what is actually plotted.
ax.set_title('Line Plot of scores')
ax.set_xlabel(f'n_estimators ({rang[0]} to {rang[-1]})')
ax.set_ylabel('Y values')

ax.legend()
ax.grid(True)  # Grid for easier reading of the metric values
plt.show()