In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve
from sklearn.feature_selection import RFE

In [7]:
df = pd.read_csv('/content/kickstarter_final_data.csv', index_col=0, low_memory=False)

In [8]:
cols_to_drop = [
    'name',
    'blurb',
    'category',
    'category_grouped',
    'country',
    'country_grouped',
    'currency',
    'currency_grouped',
    'deadline_weekday',
    'state_changed_at_weekday',
    'created_at_weekday',
    'launched_at_weekday'
]

existing = [c for c in cols_to_drop if c in df.columns]

df.drop(columns=existing, inplace=True)

In [9]:
features_to_drop = ['usd_pledged', 'state_changed_at', 'staff_pick', 'backers_count', 'spotlight', 'launch_to_state_change_days', 'state_changed_at_weekday_Friday', 'state_changed_at_weekday_Monday', 'state_changed_at_weekday_Saturday', 'state_changed_at_weekday_Sunday', 'state_changed_at_weekday_Thursday', 'state_changed_at_weekday_Tuesday', 'state_changed_at_weekday_Wednesday']
print(df.columns.to_list())

def safe_drop_columns(df, features_to_drop):
    for column in features_to_drop:
        if column in df.columns.tolist():
            df.drop(column, axis=1, inplace=True)
safe_drop_columns(df, features_to_drop)

['goal', 'disable_communication', 'deadline', 'state_changed_at', 'created_at', 'launched_at', 'staff_pick', 'backers_count', 'static_usd_rate', 'usd_pledged', 'spotlight', 'name_len_clean', 'blurb_len_clean', 'deadline_month', 'deadline_day', 'deadline_hr', 'created_at_month', 'created_at_day', 'created_at_hr', 'launched_at_month', 'launched_at_day', 'launched_at_yr', 'launched_at_hr', 'create_to_launch_days', 'launch_to_deadline_days', 'launch_to_state_change_days', 'SuccessfulBool', 'TOPCOUNTRY', 'LaunchedTuesday', 'DeadlineWeekend', 'app', 'help', 'new', 'world', 'canceled', 'country_grouped_CA', 'country_grouped_GB', 'country_grouped_Other', 'country_grouped_US', 'currency_grouped_CAD', 'currency_grouped_EUR', 'currency_grouped_GBP', 'currency_grouped_Other', 'currency_grouped_USD', 'deadline_weekday_Friday', 'deadline_weekday_Monday', 'deadline_weekday_Saturday', 'deadline_weekday_Sunday', 'deadline_weekday_Thursday', 'deadline_weekday_Tuesday', 'deadline_weekday_Wednesday', 'sta

In [13]:
num_cols = ['goal', 'static_usd_rate', 'name_len_clean', 'blurb_len_clean',
            'deadline_month', 'deadline_day', 'deadline_hr',
            'created_at_month', 'created_at_day', 'created_at_hr',
            'launched_at_month', 'launched_at_day', 'launched_at_yr', 'launched_at_hr',
            'create_to_launch_days', 'launch_to_deadline_days']

X = df.drop(columns=['SuccessfulBool', 'deadline', 'created_at', 'launched_at'])
y = df['SuccessfulBool']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

rfe_model = LogisticRegression(max_iter=1000, solver='liblinear')
rfe = RFE(estimator=rfe_model, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)

selected_features = X_train.columns[rfe.support_]
print("Top 10 selected features from RFE:")
print(selected_features)

grid_model = LogisticRegression(max_iter=1000)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

grid = GridSearchCV(grid_model, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train[selected_features], y_train)

print("\nBest parameters from GridSearchCV:")
print(grid.best_params_)

print("\nTest set score with best model:")
print(grid.score(X_test[selected_features], y_test))


Top 10 selected features from RFE:
Index(['goal', 'disable_communication', 'TOPCOUNTRY', 'app', 'new', 'world',
       'canceled', 'category_grouped_Plays', 'category_grouped_Software',
       'category_grouped_Web'],
      dtype='object')
Fitting 5 folds for each of 40 candidates, totalling 200 fits

Best parameters from GridSearchCV:
{'C': 100, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}

Test set score with best model:
0.7775551161995083


In [24]:
df_final = pd.read_csv('/content/final_inconclusive_eliminated.csv', index_col=0, low_memory=False)
df_final.reset_index(inplace=True)

In [25]:
df_final.head()

Unnamed: 0,goal,static_usd_rate,name_len_clean,blurb_len_clean,deadline_month,deadline_day,deadline_hr,created_at_month,created_at_day,created_at_hr,...,category_grouped_Apps,category_grouped_Festivals,category_grouped_Gadgets,category_grouped_Hardware,category_grouped_Musical,category_grouped_Other,category_grouped_Plays,category_grouped_Software,category_grouped_Wearables,category_grouped_Web
0,-0.062759,-0.187647,1,0,-1.718418,0.807799,-0.488252,1.36854,1.534993,1.569253,...,0,0,0,0,0,1,0,0,0,0
1,-0.063488,-0.187647,0,0,-0.536059,-1.621762,0.502382,-1.37088,0.509481,-0.627839,...,0,0,0,0,0,1,0,0,0,0
2,0.009024,-0.187647,1,0,-1.127239,1.139103,-0.818463,-1.67526,0.965264,-2.148903,...,0,0,0,0,0,1,0,0,0,0
3,-0.060208,-0.187647,0,0,0.941889,-1.069589,-2.139309,0.75978,-1.199705,1.569253,...,0,0,0,0,0,1,0,0,0,0
4,-0.061504,0.3129,0,1,-0.24047,1.249538,-0.158041,-0.45774,1.07921,0.217197,...,0,0,0,0,0,1,0,0,0,0


In [26]:
df_final.columns

Index(['goal', 'static_usd_rate', 'name_len_clean', 'blurb_len_clean',
       'deadline_month', 'deadline_day', 'deadline_hr', 'created_at_month',
       'created_at_day', 'created_at_hr', 'launched_at_month',
       'launched_at_day', 'launched_at_yr', 'launched_at_hr',
       'create_to_launch_days', 'launch_to_deadline_days',
       'launch_to_state_change_days', 'TOPCOUNTRY', 'LaunchedTuesday',
       'DeadlineWeekend', 'name', 'blurb', 'disable_communication', 'country',
       'currency', 'deadline', 'state_changed_at', 'created_at', 'launched_at',
       'category', 'SuccessfulBool', 'app', 'help', 'new', 'world',
       'country_grouped_CA', 'country_grouped_GB', 'country_grouped_Other',
       'country_grouped_US', 'currency_grouped_CAD', 'currency_grouped_EUR',
       'currency_grouped_GBP', 'currency_grouped_Other',
       'currency_grouped_USD', 'deadline_weekday_Friday',
       'deadline_weekday_Monday', 'deadline_weekday_Saturday',
       'deadline_weekday_Sunday', 'deadl

In [28]:
# Dropping columns before training
cols_to_drop = [
    'name',
    'blurb',
    'category',
    'country',
    'currency',
    'deadline',
    'state_changed_at',
    'created_at',
    'launched_at',
    'launch_to_state_change',
    'create_to_launch',
    'launch_to_deadline',
    'currency_grouped',
    'category_grouped',
    'country_grouped',
    'spotlight',
    'usd_pledged',
    'staff_pick',
    'backers_count'
]

def safe_drop_columns(df, features_to_drop):
    for column in features_to_drop:
        if column in df.columns.tolist():
            df.drop(column, axis=1, inplace=True)
safe_drop_columns(df_final, cols_to_drop)
print(df_final.head())

       goal  static_usd_rate  name_len_clean  blurb_len_clean  deadline_month  \
0 -0.062759        -0.187647               1                0       -1.718418   
1 -0.063488        -0.187647               0                0       -0.536059   
2  0.009024        -0.187647               1                0       -1.127239   
3 -0.060208        -0.187647               0                0        0.941889   
4 -0.061504         0.312900               0                1       -0.240470   

   deadline_day  deadline_hr  created_at_month  created_at_day  created_at_hr  \
0      0.807799    -0.488252           1.36854        1.534993       1.569253   
1     -1.621762     0.502382          -1.37088        0.509481      -0.627839   
2      1.139103    -0.818463          -1.67526        0.965264      -2.148903   
3     -1.069589    -2.139309           0.75978       -1.199705       1.569253   
4      1.249538    -0.158041          -0.45774        1.079210       0.217197   

   ...  category_grouped_A

In [30]:

num_cols = ['goal', 'static_usd_rate', 'name_len_clean', 'blurb_len_clean',
            'deadline_month', 'deadline_day', 'deadline_hr',
            'created_at_month', 'created_at_day', 'created_at_hr',
            'launched_at_month', 'launched_at_day', 'launched_at_yr', 'launched_at_hr',
            'create_to_launch_days', 'launch_to_deadline_days']


X = df_final.drop(columns=['SuccessfulBool'])
y = df_final['SuccessfulBool']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

rfe_model = LogisticRegression(max_iter=1000, solver='liblinear')
rfe = RFE(estimator=rfe_model, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)


selected_features = X_train.columns[rfe.support_]
print("Top 10 selected features from RFE:")
print(selected_features)

grid_model = LogisticRegression(max_iter=1000)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

grid = GridSearchCV(grid_model, param_grid, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train[selected_features], y_train)

print("\nBest parameters from GridSearchCV:")
print(grid.best_params_)

print("\nTest set score with best model:")
print(grid.score(X_test[selected_features], y_test))


Top 10 selected features from RFE:
Index(['goal', 'new', 'world', 'deadline_weekday_Wednesday',
       'category_grouped_Apps', 'category_grouped_Festivals',
       'category_grouped_Musical', 'category_grouped_Plays',
       'category_grouped_Software', 'category_grouped_Web'],
      dtype='object')
Fitting 5 folds for each of 40 candidates, totalling 200 fits

Best parameters from GridSearchCV:
{'C': 10, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}

Test set score with best model:
0.7388812368757668


In [31]:

print("\nBest parameters from GridSearchCV:")
print(grid.best_params_)

print("\nTest set ROC-AUC score with best model:")
print(grid.score(X_test[selected_features], y_test))

best_index = grid.best_index_
mean_cv_score = grid.cv_results_['mean_test_score'][best_index]
std_cv_score = grid.cv_results_['std_test_score'][best_index]

print(f"\nBest mean cross-validated AUC: {mean_cv_score:.4f} ± {std_cv_score:.4f}")



Best parameters from GridSearchCV:
{'C': 10, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}

Test set ROC-AUC score with best model:
0.7388812368757668

Best mean cross-validated AUC: 0.7423 ± 0.0060
