In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import sklearn_pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
#preprocessed_data = pd.read_csv("clean_investmentdata_for_ml.csv",encoding='ISO-8859-1')
filtered_data = pd.read_csv("filter_no_outliers.csv",encoding='ISO-8859-1')
print(filtered_data.describe())

In [None]:
# drop columns with a significant number of missing values and the 'homepage_url' column
filtered_data = filtered_data.drop(columns=['state_code', 'founded_at', 'founded_month', 'founded_quarter', 'founded_year', 'homepage_url'])

# replace missing values in categorical columns with 'Unknown'
categorical_columns_with_na = ['category_list', 'status', 'country_code', 'region', 'city']
for col in categorical_columns_with_na:
    filtered_data[col].fillna('Unknown', inplace=True)

# check if there are any missing values left
missing_values_cleaned = filtered_data.isnull().sum()

missing_values_cleaned


The next step is to handle the categorical variables. We will convert them into numerical form using one-hot encoding. However, before proceeding, we should drop columns that might not be useful for prediction, such as permalink and name, which are unique identifiers for the startups and unlikely to have predictive power.

In [7]:
# convert the columns to datetime format
filtered_data['first_funding_year'] = pd.to_datetime(filtered_data['first_funding_at'], errors='coerce').dt.year
filtered_data['last_funding_year'] = pd.to_datetime(filtered_data['last_funding_at'], errors='coerce').dt.year

# drop the columns that have already been converted to years
data_cleaned = filtered_data.drop(columns=['first_funding_at', 'last_funding_at'], errors='ignore')

# one-hot encode the cleaned dataset again
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# display the shape and first few rows of the newly encoded data
data_encoded.shape, data_encoded.head()


((14298, 37394),
    funding_total_usd  funding_rounds     seed    venture  equity_crowdfunding  \
 0          4000000.0             2.0      0.0  4000000.0                  0.0   
 1            40000.0             1.0  40000.0        0.0                  0.0   
 2            60000.0             2.0      0.0        0.0              60000.0   
 3          7000000.0             1.0      0.0  7000000.0                  0.0   
 4          2000000.0             1.0      0.0  2000000.0                  0.0   
 
    undisclosed  convertible_note  debt_financing  angel  grant  ...  \
 0          0.0               0.0             0.0    0.0    0.0  ...   
 1          0.0               0.0             0.0    0.0    0.0  ...   
 2          0.0               0.0             0.0    0.0    0.0  ...   
 3          0.0               0.0             0.0    0.0    0.0  ...   
 4          0.0               0.0             0.0    0.0    0.0  ...   
 
    city_Zhuhai  city_Zug  city_Zwevegem  city_Zwijnaar

In [8]:
# drop 'permalink' and 'name' columns
filtered_data = filtered_data.drop(columns=['permalink', 'name'])

# one-hot encode categorical variables
filtered_data = pd.get_dummies(filtered_data, drop_first=True)

# display the shape and first few rows of the encoded data
filtered_data.shape, filtered_data.head()

((14298, 14121),
    funding_total_usd  funding_rounds     seed    venture  equity_crowdfunding  \
 0          4000000.0             2.0      0.0  4000000.0                  0.0   
 1            40000.0             1.0  40000.0        0.0                  0.0   
 2            60000.0             2.0      0.0        0.0              60000.0   
 3          7000000.0             1.0      0.0  7000000.0                  0.0   
 4          2000000.0             1.0      0.0  2000000.0                  0.0   
 
    undisclosed  convertible_note  debt_financing  angel  grant  ...  \
 0          0.0               0.0             0.0    0.0    0.0  ...   
 1          0.0               0.0             0.0    0.0    0.0  ...   
 2          0.0               0.0             0.0    0.0    0.0  ...   
 3          0.0               0.0             0.0    0.0    0.0  ...   
 4          0.0               0.0             0.0    0.0    0.0  ...   
 
    last_funding_at_2014-11-21  last_funding_at_2014-11

In [9]:
from sklearn.model_selection import train_test_split

# splitting the data into features and target variable
X = data_encoded.drop("success", axis=1)
y = data_encoded["success"]

# splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

# drop rows with missing values from X_train and y_train
train_mask = X_train.isnull().sum(axis=1) == 0
X_train= X_train[train_mask]
y_train= y_train[train_mask]

# drop rows with missing values from X_test and y_test
test_mask = X_test.isnull().sum(axis=1) == 0
X_test = X_test[test_mask]
y_test = y_test[test_mask]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11438, 37393), (2859, 37393), (11438,), (2859,))

In [11]:

# initialize the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# train the model on the training data
gb_classifier.fit(X_train, y_train)

# predict the outcomes for the test set
y_pred = gb_classifier.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

# generate a classification report
class_report = classification_report(y_test, y_pred)

# generate a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)



print(f"Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 100.00%

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        90
         1.0       1.00      1.00      1.00      2769

    accuracy                           1.00      2859
   macro avg       1.00      1.00      1.00      2859
weighted avg       1.00      1.00      1.00      2859


Confusion Matrix:
 [[  90    0]
 [   0 2769]]
