In [1]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


## Loading data and making final adjustments

In [2]:
# Read the CSV into `final_df`; the path is relative to the kernel's working directory.
DATASET_PATH = 'final_dataset3.csv'
final_df = pd.read_csv(DATASET_PATH)

In [None]:
# Bare last-line expression: Jupyter renders the DataFrame's repr for a quick look at the loaded data.
final_df

In [3]:
# Columns removed from the frame before modelling.
# NOTE(review): presumably identifiers plus fields that would leak the target
# (e.g. `status`) — confirm against how `label` was derived.
columns_to_drop = [
    'permalink',
    'name',
    'status',
    'num_of_rounds',
    'funding_total_usd',
    'funding_total_usd_missing',
]

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
final_df = final_df.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# Replace every missing value in the frame with 0.
# NOTE(review): this is applied to all columns, not only numeric ones; the
# `single_category_0` feature in the saved importance tables suggests 0 ends up
# as its own category after one-hot encoding — confirm that is intended.
final_df = final_df.fillna(value=0)

In [5]:
# One-hot encode the categorical columns: each listed column is replaced by
# indicator columns named `<column>_<value>`.
categorical_columns = ['single_category', 'country_code', 'city']
final_df = pd.get_dummies(data=final_df, columns=categorical_columns)


In [None]:
# Print pandas' summary of the encoded frame to sanity-check its size and dtypes.
final_df.info()

### 80-20 split

In [None]:
# `label` is the target; every remaining column is used as a feature.
y = final_df['label']
X = final_df.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline random forest with default hyperparameters (seeded for reproducibility).
# `fit` returns the estimator itself, so construction and training are chained;
# the trailing expression keeps the estimator repr as the cell output.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rfc

In [None]:
# Score the baseline forest on the held-out 20% split.
y_pred = rfc.predict(X_test)

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
feature_importances = pd.DataFrame(
    rfc.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Lift the row limit only while printing: a global `pd.set_option` would stay in
# effect for the rest of the session and make every later DataFrame display
# dump all of its rows.
with pd.option_context('display.max_rows', None):
    print(feature_importances)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the baseline forest on the held-out test split.
test_predictions = rfc.predict(X_test)
cm = confusion_matrix(y_test, test_predictions)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Column 1 of `predict_proba` is used as the score for the positive class.
positive_scores = rfc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom above TPR = 1
plt.legend(loc="lower right")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall trade-off of the baseline forest on the test split,
# scored with column 1 of `predict_proba`.
positive_scores = rfc.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, positive_scores)

plt.figure()
plt.plot(recall, precision, color='blue', lw=2)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()


In [None]:
from sklearn.model_selection import learning_curve

# 5-fold cross-validated learning curve evaluated at five training-set
# fractions between 10% and 100%, using all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    rfc,
    X,
    y,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(.1, 1.0, 5),
    verbose=0,
)

# Aggregate across the fold axis: one mean/std per training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Start a dedicated figure and label it, matching the ROC and
# precision-recall cells, so the plot can be read on its own.
plt.figure()
# Shaded bands show +/- one standard deviation around each mean curve.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.title('Learning Curve')
plt.legend(loc="best")
plt.show()


### 60-20-20 split (train-validate-test)

In [6]:
# `label` is the target; every remaining column is used as a feature.
y1 = final_df['label']
X1 = final_df.drop(columns='label')

# Stage 1: hold out 20% of all rows as the final test set.
X_train_val1, X_test1, y_train_val1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

# Stage 2: split the remaining 80% into 75% / 25%, which gives
# 60% training and 20% validation of the full dataset.
# NOTE(review): X_val1 / y_val1 are not referenced by any other cell visible
# in this notebook — confirm whether the validation split is still needed.
X_train1, X_val1, y_train1, y_val1 = train_test_split(
    X_train_val1,
    y_train_val1,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Deliberately small random-forest search space: 2 values per hyperparameter,
# i.e. 2 * 2 * 2 * 2 = 16 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)


In [8]:
# Exhaustive search over `param_grid` for a seeded random forest:
# 3-fold cross-validation, scored by accuracy, run on all available cores.
forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    forest,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training portion only.
grid_search.fit(X_train1, y_train1)


Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [11]:
# Take the best estimator found by the grid search and score it on the test split.
best_rfc = grid_search.best_estimator_
y_pred1 = best_rfc.predict(X_test1)

report = classification_report(y_test1, y_pred1)
accuracy = accuracy_score(y_test1, y_pred1)

print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")

Accuracy: 0.8761471899309501
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8588
           1       0.79      0.68      0.73      2853

    accuracy                           0.88     11441
   macro avg       0.85      0.81      0.83     11441
weighted avg       0.87      0.88      0.87     11441



In [None]:
# Untuned random forest trained on the 60% training portion, for comparison.
# `fit` returns the estimator itself; the trailing expression keeps the
# estimator repr as the cell output.
rfc1 = RandomForestClassifier(random_state=42).fit(X_train1, y_train1)
rfc1

In [None]:
# Score the untuned forest on the test split.
# Note: this rebinds `y_pred1` to the predictions of `rfc1`.
y_pred1 = rfc1.predict(X_test1)

report1 = classification_report(y_test1, y_pred1)
accuracy1 = accuracy_score(y_test1, y_pred1)

print(f"Accuracy: {accuracy1}")
print("Classification Report:", report1, sep="\n")

In [13]:
# Rank the features by the tuned forest's importance scores, highest first.
feature_importances1 = pd.DataFrame(
    best_rfc.feature_importances_,
    index=X_train1.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Lift the row limit only while printing: a global `pd.set_option` would stay in
# effect for the rest of the session and make every later DataFrame display
# dump all of its rows.
with pd.option_context('display.max_rows', None):
    print(feature_importances1)

                                         importance
avg_time_funding                       1.987148e-01
avg_time_funding_missing               1.229488e-01
first_funding_at                       9.930132e-02
last_funding_at                        7.498489e-02
operating_time                         6.520996e-02
founded_at                             5.506106e-02
founded_at_missing                     1.589770e-02
country_code_USA                       1.194487e-02
single_category_other                  7.579334e-03
single_category_software               6.748689e-03
city_San Francisco                     6.263402e-03
single_category_mobile                 4.951399e-03
single_category_biotechnology          4.792046e-03
single_category_web                    4.582844e-03
single_category_0                      4.295034e-03
city_New York                          4.045336e-03
single_category_media                  3.918420e-03
single_category_health                 3.816423e-03
single_categ

# XGBoost

In [7]:
import xgboost as xgb

In [8]:
# XGBoost search space: 2 values per hyperparameter, i.e. 8 candidate combinations.
# Note: this rebinds the name `param_grid` used by the random-forest search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.1, 0.2],
)


In [9]:
# XGBoost classifier evaluated with log-loss, wrapped in a 2-fold grid search
# over `param_grid`, scored by accuracy and run on all available cores.
# Note: this rebinds `grid_search`, replacing the random-forest search object.
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
grid_search = GridSearchCV(
    xgb_clf,
    param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=2,
)


In [10]:
# Run the XGBoost grid search on the training portion only.
grid_search.fit(X=X_train1, y=y_train1)


Fitting 2 folds for each of 8 candidates, totalling 16 fits


[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time= 2.2min
[CV] END ...learning_rate=0.2, max_depth=3, n_estimators=100; total time= 1.5min
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=100; total time= 2.2min
[CV] END ...learning_rate=0.2, max_depth=3, n_estimators=100; total time= 1.6min
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time= 2.4min
[CV] END ...learning_rate=0.2, max_depth=3, n_estimators=200; total time= 1.7min
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time= 2.8min
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=100; total time= 1.5min
[CV] END ...learning_rate=0.1, max_depth=3, n_estimators=200; total time= 2.7min
[CV] END ...learning_rate=0.2, max_depth=6, n_estimators=100; total time= 1.5min
[CV] END ...learning_rate=0.1, max_depth=6, n_estimators=100; total time= 2.5min
[CV] END ...learning_rate=0.2, max_depth=3, n_estimators=200; total time= 1.8min
[CV] END ...learning_rate=0.

In [11]:
# Take the best estimator found by the XGBoost search and score it on the test split.
best_xgb = grid_search.best_estimator_
y_pred1 = best_xgb.predict(X_test1)

accuracy = accuracy_score(y_test1, y_pred1)

print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test1, y_pred1), sep="\n")


Accuracy: 0.9083996154182327
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      8588
           1       0.86      0.76      0.81      2853

    accuracy                           0.91     11441
   macro avg       0.89      0.86      0.87     11441
weighted avg       0.91      0.91      0.91     11441



In [13]:
# Rank the features by the tuned XGBoost model's importance scores, highest first.
feature_importances1 = pd.DataFrame(
    best_xgb.feature_importances_,
    index=X_train1.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

# Lift the row limit only while printing: a global `pd.set_option` would stay in
# effect for the rest of the session and make every later DataFrame display
# dump all of its rows.
with pd.option_context('display.max_rows', None):
    print(feature_importances1)

                                       importance
avg_time_funding                         0.108751
avg_time_funding_missing                 0.048724
operating_time                           0.041791
first_funding_at                         0.033886
last_funding_at                          0.030750
founded_at                               0.030147
country_code_USA                         0.020971
single_category_0                        0.014372
country_code_HKG                         0.012712
country_code_RUS                         0.010528
city_missing                             0.010375
country_code_missing                     0.009359
city_Mountain View                       0.008983
country_code_GBR                         0.008865
city_Woburn                              0.008808
single_category_analytics                0.008280
city_Palo Alto                           0.008246
country_code_CHN                         0.008192
city_Ann Arbor                           0.007520


### Experimenting with XGBoost thresholds to improve recall

In [14]:
# Per-class predicted probabilities of the tuned XGBoost model on the test split.
# NOTE(review): the threshold experiments that use these scores are run on the
# test split; consider the validation split if a threshold will be selected.
y_probs = best_xgb.predict_proba(X_test1)

# Keep only the column holding the probability of class 1.
POSITIVE_CLASS_COLUMN = 1
y_probs_class_1 = y_probs[:, POSITIVE_CLASS_COLUMN]


In [15]:
# Candidate decision thresholds 0.30, 0.31, ..., 0.50 (21 values).
# `np.linspace` states the endpoints and the count explicitly; `np.arange` with
# a float step needs a fudged stop value (0.51) and its length depends on
# floating-point rounding.
thresholds = np.linspace(0.30, 0.50, 21)

In [17]:
from sklearn.metrics import recall_score

# For each candidate threshold, label a row as class 1 when its predicted
# probability reaches the threshold, then report the recall for class 1.
for thresh in thresholds:
    y_pred_thresh = np.greater_equal(y_probs_class_1, thresh).astype(int)
    recall = recall_score(y_test1, y_pred_thresh, pos_label=1)
    print(f"Threshold: {thresh:.2f}, Recall: {recall:.4f}")


Threshold: 0.30, Recall: 0.8780
Threshold: 0.31, Recall: 0.8707
Threshold: 0.32, Recall: 0.8665
Threshold: 0.33, Recall: 0.8594
Threshold: 0.34, Recall: 0.8556
Threshold: 0.35, Recall: 0.8500
Threshold: 0.36, Recall: 0.8444
Threshold: 0.37, Recall: 0.8384
Threshold: 0.38, Recall: 0.8328
Threshold: 0.39, Recall: 0.8272
Threshold: 0.40, Recall: 0.8184
Threshold: 0.41, Recall: 0.8132
Threshold: 0.42, Recall: 0.8083
Threshold: 0.43, Recall: 0.8013
Threshold: 0.44, Recall: 0.7957
Threshold: 0.45, Recall: 0.7865
Threshold: 0.46, Recall: 0.7830
Threshold: 0.47, Recall: 0.7795
Threshold: 0.48, Recall: 0.7729
Threshold: 0.49, Recall: 0.7652
Threshold: 0.50, Recall: 0.7599


# Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic-regression search space: regularization strength `C` crossed with two solvers.
# Note: this rebinds the name `param_grid` used by the earlier searches.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

# 5-fold grid search scored by accuracy, run on all available cores.
# NOTE(review): the estimator keeps the default iteration limit and the
# features are not scaled in the visible cells — check for convergence
# warnings once this has been run.
log_reg = LogisticRegression(random_state=42)
grid_search2 = GridSearchCV(
    log_reg,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [21]:
# Run the logistic-regression grid search on the training portion.
# The saved NameError output comes from executing this cell (In [21]) before
# the cell that defines `grid_search2` (In [22]); run the cells in order.
grid_search2.fit(X=X_train1, y=y_train1)

NameError: name 'grid_search2' is not defined