# Simple predictions of successful funding - logistic regression

### _Lukas Vlcek_

## 1. Introduction

Kickstarter records contain more than 200,000 projects, with information about the ultimate success or failure of their funding campaign, the type of the proposed work, country of origin, or amounts of money asked and pledged by funders.

## 2. Data setup

Notebook configuration

In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

Create a dataframe from a pre-processed CSV file and filter out uninformative features

In [2]:
target_path = '../data/processed'
report_path = '../reports'
filename = 'kick_id.csv'
datecols = ['created_at', 'deadline', 'state_changed_at', 'launched_at']
fdatpars = lambda x: datetime.datetime.fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')
df = pd.read_csv(os.path.join(target_path, filename), index_col='id', parse_dates=datecols, date_parser=fdatpars)

Filter out unneeded data and add some indicator features

In [3]:
df = df.drop(['disable_communication'], axis='columns')
df = df.loc[(df['state'] != 'live') & (df['state'] != 'suspended')]
cat_type = [x.split('/')[0] for x in df['category'].values]
df['cat_type'] = np.array(cat_type)
df['dummy'] = 1
df['period'] = (df['launched_at'] >= '2014-06-01') & (df['launched_at'] < '2018-01-01')
df['succeeded'] = np.int_(df['state'] == 'successful')
df['staff_pick'] = np.int_(df['staff_pick'])
df.sort_values('launched_at').loc[df.period]['launched_at'].shape

(149007,)

In [4]:
# counting words in project names and blurbs
df.loc[df['blurb'].isnull(), 'blurb'] = ''
df.loc[df['name'].isnull(), 'name'] = ''
df['blurb_wlen'] = df['blurb'].str.split().apply(len)
df['name_wlen'] = df['name'].str.split().apply(len)

In [5]:
# make new dataframes with new and old data
dfn = df.loc[df.period].copy()
dfo = df.loc[~df.period].copy()
dfn.shape, dfo.shape

((149007, 23), (89449, 23))

In [6]:
dfn['goal_log'] = np.log10(dfn['goal'].values)

In [7]:
dfm = dfn.drop(['usd_pledged','goal','state','slug','currency','deadline','state_changed_at','created_at','backers_count','spotlight','period'], axis=1).copy()

In [8]:
dfm.head()

Unnamed: 0_level_0,name,pledged,country,launched_at,staff_pick,blurb,category,cat_type,dummy,succeeded,blurb_wlen,name_wlen,goal_log
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
18520,Grandma's are Life,62.0,US,2016-10-19 09:32:40,0,Raising money to help my grandmother recover f...,music/world music,music,1,0,24,3,4.176091
21109,Meta,173.0,GB,2015-04-07 18:37:44,0,My work is performance based but I branch out ...,art/performance art,art,1,1,24,1,2.176091
24380,Puss N' Books: A relaxing cat cafe and bookstore.,776.0,US,2015-10-27 11:25:33,0,A sanctuary for humans and felines alike! Come...,food/spaces,food,1,0,24,9,4.30103
33867,TASTE MAKERS BY TRISH P,2798.0,CA,2015-06-15 14:28:11,1,Taste Makers is a socially conscious brand tha...,fashion/ready-to-wear,fashion,1,0,23,5,4.255273
39036,The Meat Candy Experience,3239.0,US,2016-05-16 18:34:18,0,"The BEST beef sticks, beef jerky and signature...",food/small batch,food,1,1,13,4,3.39794


## 3. Modeling

In [9]:
# get dummies
dfd = pd.get_dummies(dfm, columns=['country','cat_type'])

In [22]:
# ML imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_curve, roc_auc_score

### 3.1 Training and testing data preparation

In [11]:
df_train=dfd.sample(frac=0.8,random_state=200)
df_test=dfd.drop(df_train.index)

In [12]:
print(df_train.shape, df_test.shape, dfd.shape)

(119206, 48) (29801, 48) (149007, 48)


### 3.2 Train Naive Bayes on name and blurb data

In [13]:
vectorizer = CountVectorizer(min_df=2, max_df=2000)

# project name
Xn = vectorizer.fit_transform(df_train.name).tocsc()
#fn_names = vectorizer.get_feature_names()

# project blurb
dfn.blurb.fillna('', inplace=True)
Xb = vectorizer.fit_transform(df_train.blurb).tocsc()
#fb_names = vectorizer.get_feature_names()

y = df_train.succeeded.values.astype(np.int)

print(Xn.shape, Xb.shape, y.shape)#, len(fn_names), len(fb_names))

(119206, 24661) (119206, 35717) (119206,)


In [30]:
#the grid of parameters to search over
param_grid = {'alpha':np.logspace(0.1, 100, 10)}
              
#mnb = BernoulliNB()
mnb = MultinomialNB()

mnb_cv = GridSearchCV(mnb, param_grid, cv=5, verbose=1)

# train and predict names
mnb_cv.fit(Xn, y)
pn = mnb_cv.predict_proba(Xn)[:,1]
print('Best alpha', mnb_cv.best_params_)
print('Best score', mnb_cv.best_score_)

# train and predict blurbs
mnb_cv.fit(Xb, y)
pb = mnb_cv.predict_proba(Xb)[:,1]
print('Best alpha', mnb_cv.best_params_)
print('Best score', mnb_cv.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.9s finished


Best alpha {'alpha': 1.2589254117941673}
Best score 0.670410885358
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best alpha {'alpha': 1.2589254117941673}
Best score 0.688857943392


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    3.4s finished


In [32]:
df_train['xn'] = np.log(pn/(1-pn))
df_train['xb'] = np.log(pb/(1-pb))

In [36]:
predictors = []
predictors.extend([c for c in dfd.columns if c[0:5] == 'cat_t' ])
predictors.extend([c for c in dfd.columns if c[0:5] == 'count' ])
#predictors.extend(['goal_log', 'staff_pick','name_wlen', 'xn', 'xb'])
predictors.extend(['goal_log', 'staff_pick','name_wlen'])

In [37]:
Xtrain = df_train[predictors]
ytrain = df_train.succeeded.values.astype(np.int)

#Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
#print(X.shape, Xtrain.shape, Xtest.shape, y.shape, ytrain.shape, ytest.shape)

In [38]:
logreg = LogisticRegression(verbose=1, C=1e6)#, warm_start=True)

#print(logreg.intercept_, logreg.coef_)
logreg.fit(Xtrain, ytrain)
#logreg.intercept_ = 2.299
#logreg.coef_ = np.reshape(np.array([-0.6687]), (1,-1))
print(logreg.intercept_, logreg.coef_)

[LibLinear][ 1.67239501] [[ -1.54736975e-01   6.37432018e-01  -4.03178109e-01   8.45905186e-01
    1.66575366e+00  -3.16567970e-01   4.84352258e-01  -6.57644904e-01
    6.22206615e-01  -9.55272598e-01   3.05443063e-01  -4.54812311e-01
   -1.33105993e-01  -5.28125807e-01   7.14746879e-01  -7.80630645e-01
   -2.10838961e-01  -4.71180303e-01  -1.28053319e-01  -9.54382725e-03
   -5.07508715e-01   6.00849087e-01  -5.46777488e-01  -7.56966802e-04
   -9.82509122e-02   1.51149337e+00  -2.33190893e-01  -9.42242091e-01
    1.37968505e+00   7.39903953e-01   3.66693962e-01  -5.05469927e-01
    3.68768078e-01   7.66430786e-02   6.38788966e-01   4.40825746e-01
   -1.68122231e-02  -8.37906138e-01   2.54708466e+00   1.13502357e-01]]


In [39]:
y_pred = logreg.predict(Xtrain)
y_pred_prob = logreg.predict_proba(Xtrain)
print('Score:', logreg.score(Xtrain, ytrain))

Score: 0.724971897388


In [40]:
print('CV score:', cross_val_score(logreg, Xtrain, ytrain, cv=5))

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CV score: [ 0.72758158  0.72183542  0.72266264  0.72668932  0.72508389]


In [41]:
y_pred = logreg.predict(Xtest)
y_pred_prob = logreg.predict_proba(Xtest)
print('Score:', logreg.score(Xtest, ytest))
#y_pred_cv = cross_val_predict(logreg, X, y, cv=3)
#y_pred_cv

NameError: name 'Xtest' is not defined

In [42]:
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob[:,1])
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("ROC")
print('AUC', roc_auc_score(ytest, y_pred_prob[:,1]))

NameError: name 'ytest' is not defined

In [None]:
logregcv = LogisticRegressionCV(verbose=1, cv=3, Cs=[0.1, 1.0, 100.0])#, warm_start=True)
logregcv.fit(X, y)
print(logregcv.coef_)

In [None]:
y_pred = logregcv.predict(Xtest)
y_pred_prob = logregcv.predict_proba(Xtest)
print('Score:', logregcv.score(Xtest, ytest))
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob[:,1])
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("ROC")
print('AUC', roc_auc_score(ytest, y_pred_prob[:,1]))

### 3.2 Naive Bayes for text analysis 

In [None]:
# Vectorize project names and blurbs
#vectorizer = TfidfVectorizer(min_df=1)
vectorizer = CountVectorizer(min_df=2, max_df=2000)

# project name
Xn = vectorizer.fit_transform(dfn.name).tocsc()
fn_names = vectorizer.get_feature_names()

# project blurb
dfn.blurb.fillna('', inplace=True)
Xb = vectorizer.fit_transform(dfn.blurb).tocsc()
fb_names = vectorizer.get_feature_names()

y = dfn.succeeded.values.astype(np.int)

print(Xn.shape, Xb.shape, y.shape, len(fn_names), len(fb_names))

Simple training without cross validation

In [None]:
# NB for names

# train-test split
Xntrain, Xntest, ytrain, ytest = train_test_split(Xn, y)

# Create instance of multinomial naive bayes
mnb_n = MultinomialNB()

# fit to training data
mnb_n.fit(Xntrain, ytrain)

print("Train set score:", mnb_n.score(Xntrain, ytrain))
print("Test set score:", mnb_n.score(Xntest, ytest))

In [None]:
# NB for blurbs

# train-test split
Xbtrain, Xbtest, ytrain, ytest = train_test_split(Xb, y)

# Create instance of multinomial naive bayes
mnb_b = MultinomialNB()

# fit to training data
mnb_b.fit(Xbtrain, ytrain)

print("Train set score:", mnb_b.score(Xbtrain, ytrain))
print("Test set score:", mnb_b.score(Xbtest, ytest))

In [None]:
#the grid of parameters to search over
param_grid = {'alpha':np.logspace(0.1, 100, 10)}
              
mnb = MultinomialNB()
mnb_cv = GridSearchCV(mnb, param_grid, cv=5, verbose=1)
mnb_cv.fit(Xn, y)

print('Best alpha', mnb_cv.best_params_)
print('Best score', mnb_cv.best_score_)

In [None]:
mnb_cv.predict_proba(Xn)[:,1]

In [None]:
pn = mnb_n.predict_proba(Xn)[:,1]
pb = mnb_b.predict_proba(Xb)[:,1]

In [None]:
xn = np.log(pn/(1-pn))
xb = np.log(pb/(1-pb))

In [None]:
print(xn, xb)

In [None]:
dfd['xn'] = xn

In [None]:
dfd['xb'] = xb

In [None]:
dfd.head()

In [None]:
predictors.extend(['xn', 'xb'])

In [None]:
Xx = dfd[predictors]
y = dfd.succeeded.values.astype(np.int)

Xtrain, Xtest, ytrain, ytest = train_test_split(Xx, y)
print(X.shape, Xtrain.shape, Xtest.shape, y.shape, ytrain.shape, ytest.shape)

In [None]:
logreg = LogisticRegression(verbose=1, C=0.1)#, warm_start=True)

logreg.fit(Xtrain, ytrain)
#logreg.intercept_ = 2.299
#logreg.coef_ = np.reshape(np.array([-0.6687]), (1,-1))
print(logreg.intercept_, logreg.coef_)

In [None]:
y_pred = logreg.predict(Xtest)
y_pred_prob = logreg.predict_proba(Xtest)
print('Score:', logreg.score(Xtest, ytest))

In [None]:
print('CV score:', cross_val_score(logreg, Xx, y, cv=5))

In [None]:
y_pred = logregcv.predict(Xtest)
y_pred_prob = logregcv.predict_proba(Xtest)
print('Score:', logregcv.score(Xtest, ytest))
fpr, tpr, thresholds = roc_curve(ytest, y_pred_prob[:,1])
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("ROC")
print('AUC', roc_auc_score(ytest, y_pred_prob[:,1]))

Modify score function to replace accuracy with precision?

In [None]:
Interpretability, based on rigorous statistical principles, minimal number of parameters, predictions of microstructures
Well justified choice of optimization loss function base on rigorous statistical principles.

In [None]:
import findspark