In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Load the raw Kickstarter export
df_import = pd.read_excel("Kickstarter.xlsx")

# Keep only the rows whose state is one of the two target classes
target_states = ["failed", "successful"]
df_filtered = df_import[df_import["state"].isin(target_states)]
df_filtered["state"].value_counts()

failed        10299
successful     5386
Name: state, dtype: int64

In [None]:
# (TEST) Replace missing values in `category` with the placeholder string "None"
holes = dict(category="None")
df_filtered = df_filtered.fillna(holes)

In [3]:
# Check for correlated predictors.
# Select numeric and boolean columns explicitly before calling corr():
# older pandas silently dropped non-numeric columns, but pandas >= 2.0
# raises on them. Boolean columns are kept so flag-type columns still
# take part in the correlation matrix.
correlations = df_import.select_dtypes(include=["number", "bool"]).corr(method='pearson')
#correlations.to_csv("indiv_proj_corr.csv")

usd_pledged | pledged - 0.9539339
backers_count | pledged - 0.72921844
usd_pledged | backers_count - 0.7602639


usd_pledged | pledged - 0.9539339
backers_count | pledged - 0.72921844
usd_pledged | backers_count - 0.7602639
staff_pick | spotlight - 0.34722496

In [4]:
# Check for unary variables: write the value counts of every column to a
# text file so columns with a single distinct value can be spotted by eye.
with open("unary_test.txt", mode='w', encoding="utf8") as filewriter:
    for col in df_filtered:
        values = df_filtered[col].value_counts()
        filewriter.write(f"\nOccurrences of each unique value in column {col} :\n {values}")
        filewriter.write("\n\n")
    # No explicit close() needed: the `with` block closes the file on exit.

# disable_communication is unary

Exclude variables:
project_id/name - irrelevant, too specific
pledged - correlated with usd_pledged, a better option due to standard measure
disable_communication : unary
currency - irrelevant, same reason as pledged
all deadline vars - launch_to_deadline_days is a better predictor 
all state_changed vars - seem irrelevant, as a project's status can be changed at any time by owners or Kickstarter for whatever reason
all created_at - create_to_launch_days is a better predictor
all launched_at - create_to_launch_days is a better predictor
static_usd_rate - irrelevant, same reason as pledged
spotlight - TRUE only when the project is successful and FALSE only when the project has failed. Direct correlation with the target.
name_len/name_len_cleaned - both correlated, doesn't seem meaningful to keep
blurb_len/blurb_len_cleaned - both correlated, latter MIGHT be useful to keep


Worth noting
staff_pick - when true, projects have a higher chance of succeeding; when false, projects have a mixed chance of success. Might be a good variable.
category - 1471 null values exist, but definitely worth keeping as a variable. Replace NaN values with "None".
blurb_len_cleaned - length of blurb text in project description. Might be good?

In [None]:
# Standardizes a df with z-score and returns the scaled values
from sklearn.preprocessing import StandardScaler
def standardizer(in_df: pd.DataFrame) -> np.ndarray:
    """Standardize every column of ``in_df`` with a freshly fitted StandardScaler.

    Params:
        in_df   - Data to scale. The scaler is fitted on this same data.

    Returns: The output of ``StandardScaler.fit_transform``. With
    scikit-learn's default output configuration this is a NumPy array, not a
    DataFrame, so the column labels and index of ``in_df`` are not carried
    over.
    """
    # Named `scaler` so the local does not shadow the enclosing function name.
    scaler = StandardScaler()
    return scaler.fit_transform(in_df)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
def random_forest(X: pd.DataFrame, y: pd.DataFrame, num_trees: int = 500, param_tuning: dict = None):
    '''
    Creates a RandomForestClassifier and evaluates it on the provided X and y
    with repeated stratified 10-fold cross-validation (3 repeats).

    Params:
        X               - Predictors
        y               - Targets
        num_trees       - Number of trees to grow. 500 if unspecified
        param_tuning    - Optional. If provided, runs GridSearchCV on the provided params

    Returns: A 2-tuple (result, classifier).
        - If param_tuning is None, result is the numpy ndarray of accuracy
          scores from cross_val_score.
        - Otherwise, result is the fitted GridSearchCV object.
        classifier is the RandomForestClassifier built here. scikit-learn fits
        clones of it during cross-validation and grid search, so the returned
        object itself is not fitted; for a tuned, fitted model use
        result.best_estimator_ on the GridSearchCV result.
    '''
    # Can vary n_estimators
    randomForest = RandomForestClassifier(random_state=13, n_estimators=num_trees, bootstrap=True, oob_score=True)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=13)

    # Identity comparison is the correct way to test for None.
    if param_tuning is None:
        scores = cross_val_score(randomForest, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
        return scores, randomForest

    grid_search = GridSearchCV(estimator=randomForest, scoring='accuracy', param_grid=param_tuning, cv=cv, n_jobs=-1)
    # fit() returns the GridSearchCV instance itself.
    grid_result = grid_search.fit(X, y)
    return grid_result, randomForest