In [None]:
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, roc_auc_score, silhouette_score, confusion_matrix

In [None]:
#@title Loading data
!curl -O https://raw.githubusercontent.com/rromer07/kickstarter-status-prediction/main/kickstarter_data_sampled_final.csv
resampled_df = pd.read_csv('kickstarter_data_sampled_final.csv',header=0)

# drop the ids and urls columns
resampled_df = resampled_df.drop(columns=['id_new', 'urls'])

# drop the 'garbage' rows
resampled_df = resampled_df.drop([0, 2340, 2341, 2342, 2466, 2467])

# fix the dtypes
dtypes = {'goal': float, 'disable_communication': bool, 'staff_pick': bool, 'backers_count': int, 'deadline_month': int, 'deadline_day': int, 'deadline_hr': int, 'state_changed_at_month': int, 'state_changed_at_day': int, 'state_changed_at_hr': int, 'created_at_month': int, 'created_at_day': int, 'created_at_hr': int, 'launched_at_month': int, 'launched_at_day': int, 'launched_at_hr': int, 'create_to_launch_days': int, 'launch_to_deadline_days': int, 'launch_to_state_change_days': int, 'SuccessfulBool': int, 'USorGB': int, 'TOPCOUNTRY': int, 'LaunchedTuesday': int, 'DeadlineWeekend': int}
resampled_df = resampled_df.astype(dtypes)

# Build one text field per project by joining the free-text columns with single
# spaces; missing values are treated as empty strings.
text_columns = ['additional description', 'slug', 'blurb', 'name']
text_parts = [resampled_df[column].fillna('') for column in text_columns]

combined_text = text_parts[0]
for next_part in text_parts[1:]:
    combined_text = combined_text + ' ' + next_part

resampled_df['combined_text_data'] = combined_text

# Initializing TFIDF
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed later in this cell, so test documents contribute
# to the vocabulary and idf weights. Confirm this matches how the saved textual
# model was trained before changing it — the model depends on this feature space.
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, # ignore terms that appear in more than 90% of the documents
                                   min_df=5,    # ignore terms that are not present in at least 5 documents
                                   encoding='utf-8',
                                   stop_words='english',
                                   strip_accents='unicode')

# Learn vocabulary and idf, returns document-term matrix
t0 = time()
X_desc = tfidf_vectorizer.fit_transform(resampled_df['combined_text_data'].values.astype('U'))
print("vectorization done in %0.3fs. " % (time() - t0))
print(f"n_samples: {X_desc.shape[0]}, n_features: {X_desc.shape[1]}")

# Density = stored non-zero entries / total number of cells. The value is a
# fraction in [0, 1], so format it with the '%' presentation type (which
# multiplies by 100) instead of printing the raw fraction followed by '%'.
density = X_desc.nnz / np.prod(X_desc.shape)
print(f"Around {density:.3%} of the entries in X_desc matrix are non-zero.\n")

# dataframe with only numerical dtypes
resampled_numerical = resampled_df.select_dtypes(include=['int64','float64'])

# target variable
y = resampled_df['SuccessfulBool']

# The inference cell averages the per-row probabilities of the numerical and the
# textual model, so both splits must select exactly the same rows. Share one set
# of split settings instead of repeating the literals.
split_params = {'test_size': 0.2, 'random_state': 0}

# train/test split for numerical data
X_numerical = resampled_numerical.drop(['SuccessfulBool'], axis=1)
X_train_numerical, X_test_numerical, y_train_numerical, y_test_numerical = train_test_split(X_numerical, y, **split_params)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(X_train_numerical.shape, X_test_numerical.shape, y_train_numerical.shape, y_test_numerical.shape)

# train/test split for description data
X_train_desc, X_test_desc, y_train_desc, y_test_desc = train_test_split(X_desc, y, **split_params)

# Fail loudly if the two splits ever stop being row-aligned.
assert y_train_numerical.index.equals(y_train_desc.index), "numerical/text train splits are not row-aligned"
assert y_test_numerical.index.equals(y_test_desc.index), "numerical/text test splits are not row-aligned"

X_train_desc.shape, X_test_desc.shape, y_train_desc.shape, y_test_desc.shape

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9812k  100 9812k    0     0  46.0M      0 --:--:-- --:--:-- --:--:-- 46.0M
vectorization done in 2.435s. 
n_samples: 2495, n_features: 10500
Around 0.016% of the entries in X_desc matrix are non-zero.



((1996, 10500), (499, 10500), (1996,), (499,))

In [None]:
#@title Loading numerical model
import os

# Model artifacts this notebook loads from the working directory. The textual
# model is loaded by a later cell that has no upload step of its own, so it is
# requested here as well.
required_model_files = ['numerical.joblib', 'textual.joblib']
missing_model_files = [path for path in required_model_files if not os.path.exists(path)]

# Only prompt for a manual upload when something is actually missing, so the
# notebook can be re-run (and run outside Colab) once the files are in place.
uploaded = {}
if missing_model_files:
    from google.colab import files
    print(f"Please upload: {', '.join(missing_model_files)}")
    uploaded = files.upload()

# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# loading. Only load model files that come from a trusted source.
numerical_model = joblib.load('numerical.joblib')

In [None]:
# @title Loading numerical/categorical model
# TODO: disabled placeholder — no numerical/categorical model is loaded by this
# cell, and none is used by the inference code visible in this notebook.
# NOTE(review): the path below has no '.joblib' extension, unlike the other
# model files loaded here — confirm the file name before enabling this line.
# numerical_categorical_model = joblib.load('numerical')

In [None]:
#@title Loading textual model
# Alternative ensemble model, currently disabled:
# textual_ensemble = joblib.load('textual_ensemble.joblib')

# Deserialize the textual classifier from the working directory.
textual_model_path = 'textual.joblib'
textual_best_model = joblib.load(textual_model_path)

# Inference

In [None]:
# Soft-voting ensemble: average the class probabilities of the numerical and the
# textual model on the (row-aligned) test split and score the result.

# Predict class for both models
y_pred_numerical = numerical_model.predict(X_test_numerical)
y_pred_textual = textual_best_model.predict(X_test_desc)

# Get prediction probabilities from both models
y_pred_proba_numerical = numerical_model.predict_proba(X_test_numerical)
y_pred_proba_textual = textual_best_model.predict_proba(X_test_desc)

# Probability columns follow each model's `classes_` order, so averaging them
# column by column is only meaningful when both models use the same order.
classes_numerical = getattr(numerical_model, 'classes_', None)
classes_textual = getattr(textual_best_model, 'classes_', None)
if classes_numerical is not None and classes_textual is not None:
    if not np.array_equal(classes_numerical, classes_textual):
        raise ValueError(
            f'Models disagree on class order: {classes_numerical} vs {classes_textual}'
        )

# combine the predicted probabilities using simple averaging
probs_combined = (y_pred_proba_numerical + y_pred_proba_textual)/2

# get the class predictions from the combined probabilities.
# argmax yields a column position; translate it to the actual class label when
# the model exposes `classes_` (otherwise fall back to the position itself).
best_column = np.argmax(probs_combined, axis=1)
if classes_numerical is not None:
    y_pred = np.asarray(classes_numerical)[best_column]
else:
    y_pred = best_column

# calculate accuracy and classification report
# (both splits share the same rows, so either target vector can serve as truth)
y_true = y_test_numerical
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)

# print the results
print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{report}')

Accuracy: 0.9098196392785571
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       240
           1       0.91      0.92      0.91       259

    accuracy                           0.91       499
   macro avg       0.91      0.91      0.91       499
weighted avg       0.91      0.91      0.91       499

